In [59]:
import difflib
import json
import os
import re
import sys
import textwrap
from collections import defaultdict

import pandas as pd
from rapidfuzz import process

### 1. Clean transcripts

In [57]:
# Canonical speaker names. The order is significant: later cells iterate over
# this list to build and summarise one dataset per speaker.
target_names = [
    'Ellen Osborne', 'David Oberg', 'Graham Paige', 'Jonno Alcaro',
    'Katrina Callsen', 'Kate Acuff', 'Judy Le'
]

# Known aliases → canonical name. Canonical names map to themselves so that a
# lookup on an already-clean name is a no-op.
alias_map = {
    "Katherin Acuff": "Kate Acuff",
    "Katherine Acuff": "Kate Acuff",
    "Kate Acuff": "Kate Acuff",
    "Jonathan Alcaro": "Jonno Alcaro",
    "Jon Alcaro": "Jonno Alcaro",
    "Jonno Alcaro": "Jonno Alcaro"
}

# Every name (canonical or alias) that fuzzy matching may resolve to.
# Sorted rather than left in raw set order: string hashing is randomised per
# interpreter run, so an unsorted set would hand the matcher its candidates in
# a different order on every kernel restart.
known_names = sorted(set(target_names) | set(alias_map))

def normalize_name(name):
    """Return a cleaned, title-cased version of a raw speaker label.

    Steps, in order:
      1. Drop every character that is not an ASCII letter or a space.
      2. Insert a space at lower→upper case boundaries ("KateAcuff" → "Kate Acuff").
      3. Collapse runs of spaces and trim the ends, so that removing punctuation
         ("Kate - Acuff") cannot leave a double space that would defeat exact
         alias lookups.
      4. Title-case the result.

    Args:
        name (str): Raw speaker label as it appears in a transcript.

    Returns:
        str: The normalized name; may be empty if `name` contained no letters.
    """
    name = re.sub(r'[^a-zA-Z ]+', '', name)  # remove punctuation / digits
    name = re.sub(r'([a-z])([A-Z])', r'\1 \2', name)  # camelCase to spaced
    # split()/join collapses internal runs of spaces and strips both ends
    return " ".join(name.split()).title()

def extract_names_from_transcript(text):
    """Collect the distinct speaker labels that follow a timestamp in a transcript.

    A label is the text between a ``[HH:MM:SS --> HH:MM:SS]`` timestamp and the
    first colon on the same line.

    Args:
        text (str): Full transcript text.

    Returns:
        list[str]: Sorted, de-duplicated labels with surrounding whitespace
        removed (so they line up with the stripped names that
        `normalize_transcript` looks up).
    """
    # The label may not contain a newline, and only spaces/tabs may separate it
    # from the timestamp. Otherwise a timestamped line without a "Name:" prefix
    # would run on into the next line's timestamp, yielding a junk label and
    # hiding the real speaker on that next line.
    pattern = re.compile(r"\[\d{2}:\d{2}:\d{2} --> \d{2}:\d{2}:\d{2}\][ \t]+([^:\n]+):")
    stripped = (raw.strip() for raw in pattern.findall(text))
    return sorted({name for name in stripped if name})

def build_name_map_auto(name_variants, known_names, alias_map, threshold=75):
    """Map each raw speaker label to a canonical name.

    Resolution order for every label:
      1. Normalize it with `normalize_name`.
      2. If the normalized form is a key of `alias_map`, use the mapped value.
      3. Otherwise ask `process.extractOne` for the best candidate in
         `known_names` scoring at least `threshold`; the candidate is passed
         through `alias_map` once more so an alias hit still yields the
         canonical name.
      4. If nothing qualifies, fall back to the normalized label itself.

    Args:
        name_variants: Iterable of raw labels found in a transcript.
        known_names: Candidate names offered to the fuzzy matcher.
        alias_map (dict): Alias → canonical name.
        threshold: Minimum score passed as `score_cutoff`.

    Returns:
        dict: Raw label → resolved name.
    """
    resolved = {}
    for raw_label in name_variants:
        cleaned = normalize_name(raw_label)

        # Exact alias hit: no fuzzy matching needed.
        if cleaned in alias_map:
            resolved[raw_label] = alias_map[cleaned]
            continue

        best = process.extractOne(cleaned, known_names, score_cutoff=threshold)
        if not best:
            # Nothing close enough; keep the cleaned label as-is.
            resolved[raw_label] = cleaned
            continue

        candidate = best[0]  # first element of the result is the matched choice
        resolved[raw_label] = alias_map.get(candidate, candidate)
    return resolved

def normalize_transcript(text, name_map):
    """Rewrite every ``[timestamp] Speaker:`` prefix using canonical names.

    Args:
        text (str): Full transcript text.
        name_map (dict): Raw speaker label → canonical name. Labels are looked
            up after stripping surrounding whitespace.

    Returns:
        str: The transcript with each matched prefix replaced by
        ``"[timestamp] Canonical:"``. Labels missing from `name_map` are
        title-cased instead.
    """
    # Keep the match on one line: the label may not contain a newline and only
    # spaces/tabs may separate it from the timestamp. Without this, a
    # timestamped line lacking a "Name:" prefix is treated as a label running
    # into the next line, which title-cases dialogue text and skips the real
    # speaker label that follows.
    pattern = re.compile(r"(\[\d{2}:\d{2}:\d{2} --> \d{2}:\d{2}:\d{2}\])[ \t]+([^:\n]+):")

    def replacer(match):
        timestamp = match.group(1)
        raw_name = match.group(2).strip()
        canonical = name_map.get(raw_name, raw_name.title())
        return f"{timestamp} {canonical}:"

    return pattern.sub(replacer, text)

def clean_transcript(transcript_path, out_path, name_map_path="name_map.json"):
    """Normalize the speaker labels of one transcript file and save the result.

    Reads `transcript_path`, detects the speaker labels, resolves them against
    the module-level `known_names` / `alias_map`, writes the label mapping to
    `name_map_path`, and writes the rewritten transcript to `out_path`.

    Args:
        transcript_path (str): Path of the raw transcript (UTF-8 text).
        out_path (str): Destination path for the cleaned transcript.
        name_map_path (str): Where the raw → canonical mapping is dumped as
            JSON. With the default, every call overwrites the same file in the
            current working directory, so only the last transcript's mapping
            survives a batch run; pass a per-transcript path to keep them all.

    Returns:
        None. Prints a confirmation line with `out_path`.
    """
    with open(transcript_path, "r", encoding="utf-8") as f:
        text = f.read()

    detected_names = extract_names_from_transcript(text)
    name_map = build_name_map_auto(detected_names, known_names, alias_map)

    # Save name map for transparency
    with open(name_map_path, "w", encoding="utf-8") as f:
        json.dump(name_map, f, indent=2)

    # Normalize transcript
    cleaned = normalize_transcript(text, name_map)

    with open(out_path, "w", encoding="utf-8") as f:
        f.write(cleaned)

    print(f"Cleaned transcript saved to: {out_path}")

In [96]:
transcript_path = '/work/users/s/m/smerrill/Albemarle/transcriptsbkp'
clean_transcript_path = '/work/users/s/m/smerrill/Albemarle/cleantranscriptsbkp'

# `all_transcripts` was previously only present as leftover kernel state, so
# this cell raised NameError on a fresh kernel. Build it from the input
# directory instead; sorted so the processing order is reproducible.
# NOTE(review): assumes every *.txt file in `transcript_path` is a transcript —
# confirm no extra filtering was applied when the list was originally built.
all_transcripts = sorted(
    name for name in os.listdir(transcript_path)
    if name.endswith('.txt')
)

# Make sure the destination exists before writing cleaned files into it.
os.makedirs(clean_transcript_path, exist_ok=True)

for transcript in all_transcripts:
    transcript_file = os.path.join(transcript_path, transcript)
    output_file = os.path.join(clean_transcript_path, transcript)
    clean_transcript(transcript_file, output_file)

Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscriptsbkp/-A6v9Byfz20.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscriptsbkp/5hxY73VZXok.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscriptsbkp/6WYXzne6dlY.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscriptsbkp/6gwrzUFYMw8.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscriptsbkp/9zIGFJbKhNg.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscriptsbkp/BFr19jzSYoM.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscriptsbkp/HbBO4irR6AA.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscriptsbkp/MgHvV_9dxdA.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscriptsbkp/NSizXN8dD5g.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscriptsbkp/PfwawD7a2Sg.txt


### 2. Make Datasets

In [109]:
def convert_transcript_to_finetune_data(
    filename,
    target_speaker,
    match_threshold=0.8,
    max_tokens=200,
    max_context_utterances=5
):
    """Build chat-style fine-tuning examples for one speaker from a transcript.

    Each transcript line of the form ``[HH:MM:SS --> HH:MM:SS] Speaker: text``
    is parsed. Lines whose speaker is a close match for `target_speaker` are
    attributed to the target; every other distinct speaker is anonymised as
    ``Speaker_1``, ``Speaker_2``, ... in order of first appearance.

    Consecutive target lines are merged into one response. Each response is
    split into chunks, and every chunk becomes one example whose "user" message
    is the preceding context and whose "assistant" message is the chunk. No
    example is produced when there is no preceding context.

    Args:
        filename (str): Path to the transcript file (UTF-8 text).
        target_speaker (str): Name of the speaker to model as the assistant.
        match_threshold (float): `cutoff` passed to
            `difflib.get_close_matches` when comparing a line's speaker to
            `target_speaker`.
        max_tokens (int): Approximate chunk size; converted to characters at
            5 characters per token.
        max_context_utterances (int): Number of most recent context utterances
            joined (newline-separated) into the "user" message.

    Returns:
        list[dict]: Examples shaped as
        ``{"messages": [{"role": "user", ...}, {"role": "assistant", ...}]}``.
    """
    def split_long_response(response_text, max_tokens):
        # Approximate 1 token ≈ 5 characters
        return textwrap.wrap(response_text, width=max_tokens * 5)

    line_pattern = re.compile(r"\[\d{2}:\d{2}:\d{2} --> \d{2}:\d{2}:\d{2}\] (.*?):\s+(.*)")

    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    speaker_map = {}  # original non-target speaker → anonymised label
    dialogue = []

    # Step 1: Parse lines and normalize speakers
    for line in lines:
        match = line_pattern.match(line)
        if not match:
            continue
        speaker, text = match.groups()
        speaker = speaker.strip().rstrip(':')
        text = text.strip()

        if not text:
            continue

        is_target = bool(difflib.get_close_matches(speaker, [target_speaker], n=1, cutoff=match_threshold))
        if is_target:
            normalized_speaker = target_speaker
        else:
            # Assign the next free id the first time a speaker is seen. The id
            # is derived from the map size so it cannot fall out of step with
            # the map (previously the counter was checked *after* the speaker
            # had been inserted, so it never advanced and every non-target
            # speaker was labelled "Speaker_1").
            if speaker not in speaker_map:
                speaker_map[speaker] = f"Speaker_{len(speaker_map) + 1}"
            normalized_speaker = speaker_map[speaker]

        dialogue.append((normalized_speaker, text))

    # Step 2: Build fine-tuning examples
    examples = []
    context = []
    buffer = []  # consecutive target utterances awaiting the end of the turn

    for i, (speaker, text) in enumerate(dialogue):
        if speaker == target_speaker:
            buffer.append(text)
            next_speaker = dialogue[i + 1][0] if i + 1 < len(dialogue) else None
            if next_speaker != target_speaker:
                # The target's turn is over: emit it as one or more examples.
                full_response = " ".join(buffer)
                response_chunks = split_long_response(full_response, max_tokens)

                trimmed_context = context[-max_context_utterances:]

                for chunk in response_chunks:
                    if trimmed_context:
                        examples.append({
                            "messages": [
                                {"role": "user", "content": "\n".join(trimmed_context)},
                                {"role": "assistant", "content": chunk}
                            ]
                        })

                # The target's own turn becomes context for later turns.
                context.append(f"{speaker}: {full_response}")
                buffer = []
        else:
            context.append(f"{speaker}: {text}")

    return examples

def save_chat_dataset(data, output_path):
    """
    Save a list of chat-style message dictionaries to a JSONL or JSON file.

    The format is chosen from the file name: a path ending in ``.jsonl``
    (case-insensitive) gets one JSON object per line; any other path gets a
    single indented JSON array.

    Args:
        data (list): Records to write.
        output_path (str): Destination file path.

    Returns:
        None. Prints the destination path and the number of records.
    """
    as_jsonl = str(output_path).lower().endswith(".jsonl")
    with open(output_path, "w", encoding="utf-8") as f:
        if as_jsonl:
            for record in data:
                f.write(json.dumps(record) + "\n")
        else:
            json.dump(data, f, indent=2)
    print(f"Dataset saved to: {output_path}")
    print(f"Total examples: {len(data)}")

def load_jsonl_dataset(file_path):
    """Load a JSON Lines file (one JSON value per line) into a list.

    Blank or whitespace-only lines, such as a trailing empty line, are skipped
    instead of raising a decode error.

    Args:
        file_path (str): Path to the JSONL file (UTF-8 text).

    Returns:
        list: The decoded records, in file order. Prints how many were loaded.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    print(f"Loaded {len(data)} records from {file_path}")
    return data

def count_context_and_content_words(dataset):
    """
    Tally words across a chat dataset, split by message role.

    Words in 'user' messages count toward the context total and words in
    'assistant' messages toward the content total; messages with any other
    role are ignored. A word is a maximal run of word characters.

    Args:
        dataset (list): Dicts that may carry a 'messages' list of
            {'role': ..., 'content': ...} entries.

    Returns:
        dict: {'context_words': int, 'content_words': int}
    """
    bucket_for_role = {"user": "context_words", "assistant": "content_words"}
    totals = {"context_words": 0, "content_words": 0}

    for record in dataset:
        for message in record.get("messages", []):
            n_words = len(re.findall(r'\b\w+\b', message["content"]))
            bucket = bucket_for_role.get(message["role"])
            if bucket is not None:
                totals[bucket] += n_words

    return totals


In [100]:
# Directory that receives one dataset file per speaker. The trailing slash is
# required: the cells below build file paths by plain string concatenation.
dataset_save_path = '/work/users/s/m/smerrill/Albemarle/dataset/'

In [114]:
# Build one fine-tuning dataset per target speaker by pooling the examples
# extracted from every cleaned transcript.
os.makedirs(dataset_save_path, exist_ok=True)  # ensure the output directory exists

datasets = []  # paths of the dataset files written below
for speaker in target_names:
    print(speaker)
    examples = []
    for transcript in all_transcripts:
        cleaned_transcript = os.path.join(clean_transcript_path, transcript)
        examples.extend(convert_transcript_to_finetune_data(
                            cleaned_transcript,
                            speaker,
                            match_threshold=0.8, # this should be exact now...
                            max_tokens=250,
                            max_context_utterances=5))
    # os.path.join yields the same path whether or not dataset_save_path ends
    # with a separator, unlike plain string concatenation.
    save_name = os.path.join(dataset_save_path, speaker.replace(' ', '').lower() + '.txt')
    save_chat_dataset(examples, save_name)
    datasets.append(save_name)
    print('---------------------------------------')

Ellen Osborne
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/ellenosborne.txt
Total examples: 45
---------------------------------------
David Oberg
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/davidoberg.txt
Total examples: 179
---------------------------------------
Graham Paige
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/grahampaige.txt
Total examples: 281
---------------------------------------
Jonno Alcaro
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/jonnoalcaro.txt
Total examples: 227
---------------------------------------
Katrina Callsen
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/katrinacallsen.txt
Total examples: 220
---------------------------------------
Kate Acuff
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/kateacuff.txt
Total examples: 187
---------------------------------------
Judy Le
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/judyle.txt
Total examples: 118

### 3. Dataset Metastats

In [122]:
# Collect per-speaker word totals from the saved dataset files.
rows = []

for speaker in target_names:
    filename = os.path.join(dataset_save_path, speaker.replace(' ', '').lower() + '.txt')

    try:
        # The dataset files hold a single indented JSON array (written with
        # json.dump), so the whole file is parsed at once; a line-by-line JSONL
        # reader cannot decode them.
        with open(filename, "r", encoding="utf-8") as f:
            speaker_examples = json.load(f)
        print(f"Loaded {len(speaker_examples)} records from {filename}")

        word_counts = count_context_and_content_words(speaker_examples)

        rows.append({
            "speaker": speaker,
            "context_words": word_counts["context_words"],
            "content_words": word_counts["content_words"]
        })
    except FileNotFoundError:
        print(f"⚠️ File not found: {filename}")
    except Exception as e:
        # Best effort: report the problem and continue with the other speakers.
        print(f"❌ Error processing {filename}: {e}")

df = pd.DataFrame(rows)

Loaded 45 records from /work/users/s/m/smerrill/Albemarle/dataset/ellenosborne.txt
Loaded 179 records from /work/users/s/m/smerrill/Albemarle/dataset/davidoberg.txt
Loaded 281 records from /work/users/s/m/smerrill/Albemarle/dataset/grahampaige.txt
Loaded 227 records from /work/users/s/m/smerrill/Albemarle/dataset/jonnoalcaro.txt
Loaded 220 records from /work/users/s/m/smerrill/Albemarle/dataset/katrinacallsen.txt
Loaded 187 records from /work/users/s/m/smerrill/Albemarle/dataset/kateacuff.txt
Loaded 118 records from /work/users/s/m/smerrill/Albemarle/dataset/judyle.txt


In [123]:
# Display the per-speaker table built in the previous cell
# (columns: speaker, context_words, content_words).
df

Unnamed: 0,speaker,context_words,content_words
0,Ellen Osborne,2296,1166
1,David Oberg,12156,7055
2,Graham Paige,19221,9311
3,Jonno Alcaro,16432,10072
4,Katrina Callsen,16889,14112
5,Kate Acuff,12920,6008
6,Judy Le,9222,6077
