In [44]:
import re
import json
from collections import defaultdict
import difflib
import textwrap
import os

import sys
from rapidfuzz import process
import pandas as pd

### 1. Clean transcripts

In [45]:
# Canonical speaker names: transcript speaker labels are mapped onto one of
# these whenever an alias or a sufficiently close fuzzy match is found.
target_names = [
    'Ellen Osborne', 'David Oberg', 'Graham Paige', 'Jonno Alcaro',
    'Katrina Callsen', 'Kate Acuff', 'Judy Le'
]

# Known aliases → canonical name
alias_map = {
    "Katherin Acuff": "Kate Acuff",
    "Katherine Acuff": "Kate Acuff",
    "Kate Acuff": "Kate Acuff",
    "Jonathan Alcaro": "Jonno Alcaro",
    "Jon Alcaro": "Jonno Alcaro",
    "Jonno Alcaro": "Jonno Alcaro"
}

# All names used as fuzzy-matching candidates. dict.fromkeys drops duplicates
# while keeping a stable order (canonical names first, then aliases); a plain
# set may iterate in a different order from one kernel session to the next,
# which makes the candidate list non-reproducible.
known_names = list(dict.fromkeys(target_names + list(alias_map)))

def normalize_name(name):
    """
    Normalise a raw speaker label for alias lookup and fuzzy matching.

    Every character other than ASCII letters and spaces is removed, a space is
    inserted at lower-to-upper case boundaries ("KateAcuff" -> "Kate Acuff"),
    runs of spaces are collapsed, and the result is title-cased.

    Args:
        name (str): Speaker label as it appears in the transcript.

    Returns:
        str: The normalised name; may be empty if the label had no letters.
    """
    name = re.sub(r'[^a-zA-Z ]+', '', name)  # remove punctuation
    name = re.sub(r'([a-z])([A-Z])', r'\1 \2', name)  # camelCase to spaced
    # Dropping punctuation can leave double spaces ("Kate - Acuff"), which
    # would defeat the exact alias lookup; collapse them and trim the ends.
    name = " ".join(name.split())
    return name.title()

def extract_names_from_transcript(text):
    """
    Collect the distinct speaker labels that follow a timestamp in a transcript.

    A label is the text between a "[HH:MM:SS --> HH:MM:SS]" timestamp and the
    next colon on the same line.

    Args:
        text (str): Full transcript text.

    Returns:
        list[str]: Sorted, de-duplicated labels with surrounding whitespace removed.
    """
    # The label may not contain a newline: otherwise a timestamped line with no
    # "Speaker:" prefix would swallow the start of the following line.
    pattern = re.compile(r"\[\d{2}:\d{2}:\d{2} --> \d{2}:\d{2}:\d{2}\]\s+([^:\n]+):")
    # Strip each label so it matches the stripped key that normalize_transcript
    # looks up; whitespace-only captures are discarded.
    stripped = (raw.strip() for raw in pattern.findall(text))
    return sorted({label for label in stripped if label})

def build_name_map_auto(name_variants, known_names, alias_map, threshold=75):
    """
    Map each raw speaker label to a canonical name.

    Resolution order for every label:
      1. exact alias lookup on the normalised label,
      2. best rapidfuzz match among ``known_names`` (itself passed through
         ``alias_map``) when its score reaches ``threshold``,
      3. otherwise the normalised label itself.

    Args:
        name_variants (iterable of str): Raw labels found in a transcript.
        known_names (list of str): Candidate names for fuzzy matching.
        alias_map (dict): Alias -> canonical name.
        threshold (int | float): Passed to rapidfuzz as ``score_cutoff``.

    Returns:
        dict: raw label -> canonical (or normalised) name.
    """
    name_map = {}
    for name in name_variants:
        normalized_name = normalize_name(name)

        # Nothing survived normalisation (e.g. the label held only digits or
        # punctuation). Keep the original label rather than mapping it to an
        # empty string, which would erase the speaker from the transcript.
        if not normalized_name:
            name_map[name] = name.strip()
            continue

        # Alias check
        if normalized_name in alias_map:
            name_map[name] = alias_map[normalized_name]
            continue

        # Fuzzy match: extractOne returns None when no candidate reaches the cutoff
        result = process.extractOne(normalized_name, known_names, score_cutoff=threshold)
        if result is None:
            # Fallback
            name_map[name] = normalized_name
        else:
            best_match = result[0]
            name_map[name] = alias_map.get(best_match, best_match)
    return name_map

def normalize_transcript(text, name_map):
    """
    Rewrite every "[timestamp] Speaker:" prefix using canonical speaker names.

    Args:
        text (str): Full transcript text.
        name_map (dict): Raw speaker label -> canonical name. Keys may be given
            with or without surrounding whitespace.

    Returns:
        str: The transcript with each matched prefix replaced by
        "[timestamp] Canonical Name:". Labels missing from ``name_map`` are
        title-cased.
    """
    # The label may not span lines: without the "\n" exclusion, a timestamped
    # line that has no "Speaker:" prefix would match up to the first colon of
    # the NEXT line's timestamp and corrupt it.
    pattern = re.compile(r"(\[\d{2}:\d{2}:\d{2} --> \d{2}:\d{2}:\d{2}\])\s+([^:\n]+):")

    def replacer(match):
        timestamp = match.group(1)
        captured = match.group(2)
        raw_name = captured.strip()
        # Try the label exactly as captured first (map keys may carry trailing
        # whitespace), then the stripped form, then fall back to title case.
        if captured in name_map:
            canonical = name_map[captured]
        else:
            canonical = name_map.get(raw_name, raw_name.title())
        return f"{timestamp} {canonical}:"

    return pattern.sub(replacer, text)

def clean_transcript(transcript_path, out_path, name_map_path="name_map.json"):
    """
    Canonicalise the speaker labels of one transcript file and write the result.

    Args:
        transcript_path (str): Path of the raw transcript to read (UTF-8).
        out_path (str): Path the cleaned transcript is written to (UTF-8).
        name_map_path (str): Where the raw-label -> canonical-name map used for
            this transcript is saved as JSON. The default keeps the original
            location, "name_map.json" in the working directory; note that every
            call overwrites that file, so pass a per-transcript path to keep
            all maps.

    Side effects:
        Writes ``out_path`` and ``name_map_path`` and prints the output location.
    """
    with open(transcript_path, "r", encoding="utf-8") as f:
        text = f.read()

    detected_names = extract_names_from_transcript(text)
    name_map = build_name_map_auto(detected_names, known_names, alias_map)

    # Save name map for transparency (explicit encoding, like the other files)
    with open(name_map_path, "w", encoding="utf-8") as f:
        json.dump(name_map, f, indent=2)

    # Normalize transcript
    cleaned = normalize_transcript(text, name_map)

    with open(out_path, "w", encoding="utf-8") as f:
        f.write(cleaned)

    print(f"Cleaned transcript saved to: {out_path}")

In [46]:
transcript_path = '/work/users/s/m/smerrill/Albemarle/transcripts'
clean_transcript_path = '/work/users/s/m/smerrill/Albemarle/cleantranscripts'
os.makedirs(clean_transcript_path, exist_ok=True)

# Keep only regular files, in a stable order: os.listdir also returns
# sub-directories (which clean_transcript cannot open) and makes no promise
# about ordering.
all_transcripts = sorted(
    entry for entry in os.listdir(transcript_path)
    if os.path.isfile(os.path.join(transcript_path, entry))
)

# Clean every transcript into a file of the same name in the output directory.
for transcript in all_transcripts:
    transcript_file = os.path.join(transcript_path, transcript)
    output_file = os.path.join(clean_transcript_path, transcript)
    clean_transcript(transcript_file, output_file)

Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscripts/-A6v9Byfz20.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscripts/3BtZN2Tye08.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscripts/5YMkxWBgdtY.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscripts/5hxY73VZXok.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscripts/6WYXzne6dlY.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscripts/6gwrzUFYMw8.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscripts/8TdTe--0CUs.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscripts/9zIGFJbKhNg.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscripts/BFr19jzSYoM.txt
Cleaned transcript saved to: /work/users/s/m/smerrill/Albemarle/cleantranscripts/HbBO4irR6AA.txt
Cleaned transcript saved to: /

### 2. Make Datasets

In [47]:
def convert_transcript_to_finetune_data(
    filename,
    target_speaker,
    match_threshold=0.8,
    max_tokens=200,
    max_context_utterances=5,
    min_response_words=3
):
    """
    Turn one transcript into chat-style fine-tuning examples for a speaker.

    Lines of the form "[HH:MM:SS --> HH:MM:SS] Speaker: text" are parsed; other
    lines and lines with empty text are skipped. Speakers that closely match
    ``target_speaker`` are labelled with it, and every other speaker is
    labelled "Speaker_1", "Speaker_2", ... in order of first appearance.
    Consecutive target utterances are merged into one response. Each response
    is split into chunks, and every chunk that is long enough becomes one
    example whose user message is the preceding context.

    Args:
        filename (str): Path of the transcript file (UTF-8).
        target_speaker (str): Speaker whose utterances become assistant turns.
        match_threshold (float): difflib cutoff for treating a speaker label
            as the target speaker.
        max_tokens (int): Approximate chunk size; chunks are wrapped at
            ``max_tokens * 5`` characters.
        max_context_utterances (int): Number of most recent utterances used as
            context. Values <= 0 give no context and therefore no examples.
        min_response_words (int): Minimum number of words a chunk needs.

    Returns:
        list[dict]: Items of the form
        {"messages": [{"role": "user", ...}, {"role": "assistant", ...}]}.
    """
    line_pattern = re.compile(
        r"\[\d{2}:\d{2}:\d{2} --> \d{2}:\d{2}:\d{2}\] (.*?):\s+(.*)"
    )

    def split_long_response(response_text, max_tokens):
        # Approximate 1 token ≈ 5 characters
        return textwrap.wrap(response_text, width=max_tokens * 5)

    # Step 1: Parse lines and normalize speakers
    speaker_labels = {}    # raw speaker name -> label used in the dataset
    anonymous_count = 0    # number of distinct non-target speakers seen so far
    dialogue = []

    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            match = line_pattern.match(line)
            if not match:
                continue
            speaker, text = match.groups()
            speaker = speaker.strip().rstrip(':')
            text = text.strip()

            if not text:
                continue

            # Resolve each distinct speaker once. The counter must advance at
            # the moment a new non-target speaker is registered; advancing it
            # after registration never fires and labels everyone "Speaker_1".
            if speaker not in speaker_labels:
                is_target = bool(difflib.get_close_matches(
                    speaker, [target_speaker], n=1, cutoff=match_threshold
                ))
                if is_target:
                    speaker_labels[speaker] = target_speaker
                else:
                    anonymous_count += 1
                    speaker_labels[speaker] = f"Speaker_{anonymous_count}"

            dialogue.append((speaker_labels[speaker], text))

    # Step 2: Build fine-tuning examples
    examples = []
    context = []
    buffer = []

    for i, (speaker, text) in enumerate(dialogue):
        if speaker != target_speaker:
            context.append(f"{speaker}: {text}")
            continue

        buffer.append(text)
        next_speaker = dialogue[i + 1][0] if i + 1 < len(dialogue) else None
        if next_speaker == target_speaker:
            # Keep accumulating until the target speaker's turn ends.
            continue

        full_response = " ".join(buffer)
        buffer = []

        # context[-0:] would return the WHOLE list, so a non-positive limit is
        # handled explicitly as "no context".
        if max_context_utterances > 0:
            trimmed_context = context[-max_context_utterances:]
        else:
            trimmed_context = []

        if trimmed_context:
            # All chunks of one response share the same context.
            user_content = "\n".join(trimmed_context)
            for chunk in split_long_response(full_response, max_tokens):
                word_count = len(re.findall(r'\b\w+\b', chunk))
                if word_count >= min_response_words:
                    examples.append({
                        "messages": [
                            {"role": "user", "content": user_content},
                            {"role": "assistant", "content": chunk}
                        ]
                    })

        # The target's own response becomes context for later examples.
        context.append(f"{speaker}: {full_response}")

    return examples

def save_chat_dataset(data, output_path):
    """
    Save a list of chat-style message dictionaries to a JSONL or JSON file.

    Paths ending in ".jsonl" are written as one JSON object per line, which is
    the layout load_chat_dataset expects for that extension; any other path is
    written as a single indented JSON document. The parent directory is
    created if it does not exist.

    Args:
        data (list): Chat examples to store.
        output_path (str or os.PathLike): Destination file.
    """
    output_path = os.fspath(output_path)
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        if output_path.endswith(".jsonl"):
            for example in data:
                f.write(json.dumps(example) + "\n")
        else:
            json.dump(data, f, indent=2)
    print(f"Dataset saved to: {output_path}")
    print(f"Total examples: {len(data)}")

def load_chat_dataset(input_path):
    """
    Load a chat-style message dataset from a JSON or JSONL file.

    Paths ending in ".jsonl" are read as one JSON object per line (blank lines
    are ignored); any other path is read as a single JSON document.

    Args:
        input_path (str or os.PathLike): File to read (UTF-8).

    Returns:
        data (list): A list of message dictionaries.
    """
    input_path = os.fspath(input_path)
    with open(input_path, "r", encoding="utf-8") as f:
        if input_path.endswith(".jsonl"):
            # Skip blank lines: json.loads raises on an empty string.
            data = [json.loads(line) for line in f if line.strip()]
        else:  # Assume .json
            data = json.load(f)

    print(f"Dataset loaded from: {input_path}")
    print(f"Total examples: {len(data)}")
    return data

def count_context_and_content_words(dataset):
    """
    Summarise word counts of the 'user' (context) and 'assistant' (content) turns.

    Words are counted per example with the regex ``\\b\\w+\\b``; messages with
    any other role are ignored.

    Args:
        dataset (list): Examples shaped like {"messages": [{"role": ..., "content": ...}]}.

    Returns:
        dict: total, average, minimum and maximum words per example, for
        context and for content. All values are 0 for an empty dataset.
    """
    word_re = re.compile(r'\b\w+\b')

    context_word_counts, content_word_counts = [], []
    for example in dataset:
        user_words = assistant_words = 0
        for msg in example.get("messages", []):
            n_words = len(word_re.findall(msg["content"]))
            role = msg["role"]
            if role == "user":
                user_words += n_words
            elif role == "assistant":
                assistant_words += n_words
        context_word_counts.append(user_words)
        content_word_counts.append(assistant_words)

    n_examples = len(dataset)
    context_total = sum(context_word_counts)
    content_total = sum(content_word_counts)

    if n_examples:
        context_avg = context_total / n_examples
        content_avg = content_total / n_examples
    else:
        context_avg = content_avg = 0

    return {
        "total_context_words": context_total,
        "total_content_words": content_total,
        "avg_context_words": context_avg,
        "avg_content_words": content_avg,
        "min_context_words": min(context_word_counts, default=0),
        "max_context_words": max(context_word_counts, default=0),
        "min_content_words": min(content_word_counts, default=0),
        "max_content_words": max(content_word_counts, default=0)
    }

In [48]:
# Directory (written with a trailing slash) that holds the per-speaker dataset files.
dataset_save_path = '/work/users/s/m/smerrill/Albemarle/dataset/'

In [49]:
# Make sure the output directory exists before any dataset is written.
os.makedirs(dataset_save_path, exist_ok=True)

# Build one dataset file per target speaker from all cleaned transcripts.
datasets = []
for speaker in target_names:
    print(speaker)
    examples = []
    for transcript in all_transcripts:
        cleaned_transcript = os.path.join(clean_transcript_path, transcript)
        examples.extend(convert_transcript_to_finetune_data(
            cleaned_transcript,
            speaker,
            match_threshold=0.8,  # names were canonicalised in step 1, so matches should be exact by now
            max_tokens=250,
            max_context_utterances=5,
        ))
    # os.path.join works whether or not dataset_save_path ends with a slash.
    save_name = os.path.join(dataset_save_path, speaker.replace(' ', '').lower() + '.txt')
    save_chat_dataset(examples, save_name)
    datasets.append(save_name)
    print('---------------------------------------')

Ellen Osborne
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/ellenosborne.txt
Total examples: 91
---------------------------------------
David Oberg
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/davidoberg.txt
Total examples: 262
---------------------------------------
Graham Paige
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/grahampaige.txt
Total examples: 767
---------------------------------------
Jonno Alcaro
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/jonnoalcaro.txt
Total examples: 351
---------------------------------------
Katrina Callsen
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/katrinacallsen.txt
Total examples: 345
---------------------------------------
Kate Acuff
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/kateacuff.txt
Total examples: 321
---------------------------------------
Judy Le
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/judyle.txt
Total examples: 226

### 3. Dataset Metastats

In [50]:
# One row of word statistics per target speaker, read back from the saved datasets.
rows = []

for speaker in target_names:
    # os.path.join works whether or not dataset_save_path ends with a slash.
    filename = os.path.join(dataset_save_path, speaker.replace(' ', '').lower() + '.txt')

    try:
        speaker_dataset = load_chat_dataset(filename)
        word_counts = count_context_and_content_words(speaker_dataset)
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as e:
        # Best effort: report the problem and continue with the next speaker.
        print(f"Error processing {filename}: {e}")
    else:
        # "speaker" first, followed by every statistic in the order returned.
        rows.append({"speaker": speaker, **word_counts})

df = pd.DataFrame(rows)


Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/ellenosborne.txt
Total examples: 91
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/davidoberg.txt
Total examples: 262
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/grahampaige.txt
Total examples: 767
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/jonnoalcaro.txt
Total examples: 351
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/katrinacallsen.txt
Total examples: 345
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/kateacuff.txt
Total examples: 321
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/judyle.txt
Total examples: 226


In [52]:
# Display the per-speaker word statistics of the transcript-derived datasets.
df

Unnamed: 0,speaker,total_context_words,total_content_words,avg_context_words,avg_content_words,min_context_words,max_context_words,min_content_words,max_content_words
0,Ellen Osborne,3978,1917,43.714286,21.065934,10,192,3,206
1,David Oberg,16260,9752,62.061069,37.221374,10,371,3,270
2,Graham Paige,43744,21268,57.032595,27.728814,10,590,3,251
3,Jonno Alcaro,24554,14737,69.954416,41.985755,8,921,3,263
4,Katrina Callsen,25495,17090,73.898551,49.536232,12,435,3,255
5,Kate Acuff,19471,9360,60.657321,29.158879,10,292,3,243
6,Judy Le,14684,8299,64.973451,36.721239,10,419,3,247


### 4. Synthetic Data

In [57]:
# Directory containing the synthetic *_qa_pairs_ft.json files.
synth_path = '/nas/longleaf/home/smerrill/notebooks/LLM/data/final'

In [76]:
# Kate Acuff: read both synthetic JSON files, concatenate them and save the result.
# (assumes each file holds a JSON list of chat examples — TODO confirm)
acuff_parts = []
for synth_file in ('acuff_qa_pairs_ft.json', 'YT_acuff_qa_pairs_ft.json'):
    with open(os.path.join(synth_path, synth_file), 'r', encoding='utf-8') as handle:
        acuff_parts.append(json.load(handle))

acuff1, acuff2 = acuff_parts
acuff = acuff1 + acuff2

save_chat_dataset(acuff, dataset_save_path + 'synth_kateacuff.txt')

Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/synth_kateacuff.txt
Total examples: 40


In [77]:
# Ellen Osborne: read both synthetic JSON files, concatenate them and save the result.
# (assumes each file holds a JSON list of chat examples — TODO confirm)
osborne_parts = []
for synth_file in ('osborne_qa_pairs_ft.json', 'YT_osborne_qa_pairs_ft.json'):
    with open(os.path.join(synth_path, synth_file), 'r', encoding='utf-8') as handle:
        osborne_parts.append(json.load(handle))

osborne1, osborne2 = osborne_parts
osborne = osborne1 + osborne2

save_chat_dataset(osborne, dataset_save_path + 'synth_ellenosborne.txt')

Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/synth_ellenosborne.txt
Total examples: 41


In [78]:
# Graham Paige: read both synthetic JSON files, concatenate them and save the result.
# (assumes each file holds a JSON list of chat examples — TODO confirm)
paige_parts = []
for synth_file in ('paige_qa_pairs_ft.json', 'YT_paige_qa_pairs_ft.json'):
    with open(os.path.join(synth_path, synth_file), 'r', encoding='utf-8') as handle:
        paige_parts.append(json.load(handle))

paige1, paige2 = paige_parts
paige = paige1 + paige2

save_chat_dataset(paige, dataset_save_path + 'synth_grahampaige.txt')

Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/synth_grahampaige.txt
Total examples: 50


In [79]:
# Judy Le: read both synthetic JSON files, concatenate them and save the result.
# (assumes each file holds a JSON list of chat examples — TODO confirm)
le_parts = []
for synth_file in ('le_qa_pairs_ft.json', 'YT_le_qa_pairs_ft.json'):
    with open(os.path.join(synth_path, synth_file), 'r', encoding='utf-8') as handle:
        le_parts.append(json.load(handle))

le1, le2 = le_parts
le = le1 + le2

save_chat_dataset(le, dataset_save_path + 'synth_judyle.txt')

Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/synth_judyle.txt
Total examples: 43


In [70]:
# Start an empty list of statistic rows for the synthetic datasets.
# NOTE(review): the following cells each append one row to this list, so
# re-running any of them without re-running this cell adds a duplicate row.
# The execution counts of these cells are not sequential — re-run them top to
# bottom on a fresh kernel before trusting the table.
rows = [] 

In [73]:
# Append Kate Acuff's synthetic-data word statistics as one row.
word_counts = count_context_and_content_words(acuff)

# "speaker" first, then every statistic in the order the helper returns them.
rows.append(dict(speaker='Kate Acuff', **word_counts))

In [72]:
# Append Ellen Osborne's synthetic-data word statistics as one row.
word_counts = count_context_and_content_words(osborne)

# "speaker" first, then every statistic in the order the helper returns them.
rows.append(dict(speaker='Ellen Osborne', **word_counts))

In [71]:
# Append Graham Paige's synthetic-data word statistics as one row.
word_counts = count_context_and_content_words(paige)

# "speaker" first, then every statistic in the order the helper returns them.
rows.append(dict(speaker='Graham Paige', **word_counts))

In [70]:
# Append Judy Le's synthetic-data word statistics as one row.
word_counts = count_context_and_content_words(le)

# "speaker" first, then every statistic in the order the helper returns them.
rows.append(dict(speaker='Judy Le', **word_counts))

In [74]:
# Collect the accumulated rows into a table, one row per append.
synthetic_df = pd.DataFrame(rows)

In [75]:
# Display the synthetic-data word statistics; row order follows the order in
# which the append cells were executed.
synthetic_df

Unnamed: 0,speaker,total_context_words,total_content_words,avg_context_words,avg_content_words,min_context_words,max_context_words,min_content_words,max_content_words
0,Judy Le,462,445,10.744186,10.348837,6,17,1,27
1,Graham Paige,529,354,10.58,7.08,4,16,1,17
2,Ellen Osborne,422,409,10.292683,9.97561,4,19,1,35
3,Kate Acuff,468,216,11.7,5.4,7,17,1,17
