In [53]:
import re
import json
from collections import defaultdict
import difflib
import textwrap

def convert_transcript_to_finetune_data(
    filename,
    target_speaker,
    match_threshold=0.8,
    max_tokens=200,
    max_context_utterances=5
):
    """Convert a timestamped transcript into chat-style fine-tuning examples.

    Each transcript line is expected to look like
    ``[HH:MM:SS --> HH:MM:SS] Speaker Name: utterance``; lines that do not
    match this shape, or whose utterance is empty, are ignored.

    Speakers whose name fuzzily matches ``target_speaker`` are relabelled to
    exactly ``target_speaker``. Every other distinct speaker name is given a
    stable anonymous label ``Speaker_1``, ``Speaker_2``, ... in order of
    first appearance.

    Consecutive utterances by the target speaker are merged into a single
    response. The response is wrapped into chunks of roughly ``max_tokens``
    tokens, and every chunk becomes one example whose user message is the
    most recent context utterances joined by newlines. The context includes
    earlier merged responses of the target speaker as well as other
    speakers' utterances. Responses that have no preceding context produce
    no example.

    Args:
        filename: Path of the UTF-8 transcript file to read.
        target_speaker: Name of the speaker whose utterances become the
            assistant messages.
        match_threshold: ``cutoff`` passed to ``difflib.get_close_matches``
            when comparing a transcript speaker name with ``target_speaker``.
        max_tokens: Approximate maximum size of one assistant chunk, using
            the heuristic of 5 characters per token.
        max_context_utterances: Maximum number of most recent context
            utterances placed in the user message. Values <= 0 mean no
            context, and therefore no examples.

    Returns:
        A list of dicts of the form
        ``{"messages": [{"role": "user", ...}, {"role": "assistant", ...}]}``.
    """
    def split_long_response(response_text, max_tokens):
        # Approximate 1 token ≈ 5 characters
        return textwrap.wrap(response_text, width=max_tokens * 5)

    # Compiled once instead of being re-resolved for every transcript line.
    line_pattern = re.compile(
        r"\[\d{2}:\d{2}:\d{2} --> \d{2}:\d{2}:\d{2}\] (.*?):\s+(.*)"
    )

    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    speaker_map = {}
    dialogue = []

    # Step 1: Parse lines and normalize speakers
    for line in lines:
        match = line_pattern.match(line)
        if not match:
            continue
        speaker, text = match.groups()
        speaker = speaker.strip().rstrip(':')
        text = text.strip()

        if not text:
            continue

        is_target = bool(
            difflib.get_close_matches(
                speaker, [target_speaker], n=1, cutoff=match_threshold
            )
        )
        if is_target:
            normalized_speaker = target_speaker
        else:
            # Number a speaker only on first sight. Deriving the number from
            # the map size guarantees each distinct name gets its own label
            # (a separate counter checked after insertion never advances).
            if speaker not in speaker_map:
                speaker_map[speaker] = f"Speaker_{len(speaker_map) + 1}"
            normalized_speaker = speaker_map[speaker]

        dialogue.append((normalized_speaker, text))

    # Step 2: Build fine-tuning examples
    examples = []
    context = []
    buffer = []

    for i, (speaker, text) in enumerate(dialogue):
        if speaker == target_speaker:
            buffer.append(text)
            next_speaker = dialogue[i + 1][0] if i + 1 < len(dialogue) else None
            # Flush only once the target speaker's run of utterances ends.
            if next_speaker != target_speaker:
                full_response = " ".join(buffer)
                response_chunks = split_long_response(full_response, max_tokens)

                # context[-0:] would be the whole history, so a non-positive
                # limit has to be handled explicitly.
                if max_context_utterances > 0:
                    trimmed_context = context[-max_context_utterances:]
                else:
                    trimmed_context = []

                if trimmed_context:
                    prompt = "\n".join(trimmed_context)
                    # Every chunk of a long response shares the same prompt.
                    for chunk in response_chunks:
                        examples.append({
                            "messages": [
                                {"role": "user", "content": prompt},
                                {"role": "assistant", "content": chunk}
                            ]
                        })

                # The full response becomes context for later turns.
                context.append(f"{speaker}: {full_response}")
                buffer = []
        else:
            context.append(f"{speaker}: {text}")

    return examples


In [54]:
target_speaker = "Michael Leichlit"
data = convert_transcript_to_finetune_data('test.txt', target_speaker)

# Export as JSON Lines: one JSON-encoded training example per line.
with open('output.jsonl', mode='w', encoding='utf-8') as f_out:
    for item in data:
        # print() appends the trailing newline that terminates each record.
        print(json.dumps(item), file=f_out)

# Report how many examples were exported.
print(f"{len(data)} training samples written to 'output.jsonl'")


19 training samples written to 'output.jsonl'


In [55]:
# Inspect the generated training example at index 2 (a dict holding a 'messages' list).
data[2]

{'messages': [{'role': 'user',
   'content': "Michael Leichlit: And I just March 8th. My name is Mike Whitwider. And I serve as the superintendent of the Harvard Unified Union School District. I am calling this meeting to order at 601. This is an organizational meeting following our town hall meetings of 2024. So I am calling to order the newly seated Harvard Unified Union School Board for the district. In attendance on Zoom, we have 12 board members. We have no one absent and we have two vacant seats from the town of Waterbury. And just so everyone knows earlier today, I was in contact with the town clerk for Waterbury. There were right in candidates for those two seats, but there was no one who attained the number of 30. So that will be an open, two open seats that we will advertise beginning on Monday. There's a whole process in our manual that we follow regarding the appointment of board members. They first have to we advertise it. The Waterbury Select Board will interview and make