In [3]:
import difflib
import json
import os
import random
import re
import sys
import textwrap
from collections import defaultdict
from typing import List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rapidfuzz import process

### 1. Clean transcripts

In [56]:
# Canonical board-member names that detected speaker labels are resolved to.
target_names = [
    'Ellen Osborne', 'David Oberg', 'Graham Paige', 'Jonno Alcaro',
    'Katrina Callsen', 'Kate Acuff', 'Judy Le'
]

# Known aliases → canonical name (canonical spellings map to themselves).
alias_map = {
    "Katherin Acuff": "Kate Acuff",
    "Katherine Acuff": "Kate Acuff",
    "Kate Acuff": "Kate Acuff",
    "Jonathan Alcaro": "Jonno Alcaro",
    "Jon Alcaro": "Jonno Alcaro",
    "Jonno Alcaro": "Jonno Alcaro"
}

# Every spelling the fuzzy matcher may pick from (canonical names plus aliases).
# Sorted so the candidate order is identical on every run: list(set(...)) order
# depends on string hashing and can change between kernel sessions, which in turn
# can change which candidate wins when two of them score the same.
known_names = sorted(set(target_names) | set(alias_map))

def normalize_name(name):
    """Return a tidied, title-cased version of a raw speaker label.

    Every character other than ASCII letters and spaces is dropped, a space is
    inserted wherever a lowercase letter is directly followed by an uppercase
    one (``KateAcuff`` -> ``Kate Acuff``), and the result is stripped and
    title-cased.
    """
    letters_only = re.sub(r'[^a-zA-Z ]+', '', name)
    word_split = re.sub(r'([a-z])([A-Z])', r'\1 \2', letters_only)
    return word_split.strip().title()


def build_name_map(name_variants, known_names, alias_map, threshold=75):
    """Map each raw speaker label to a canonical name.

    Args:
        name_variants (iterable of str): Raw labels as they appear in a transcript.
        known_names (list of str): Candidate spellings for fuzzy matching.
        alias_map (dict): Alias spelling -> canonical name.
        threshold (int): Minimum rapidfuzz score for a fuzzy match to be accepted.

    Returns:
        dict: raw label -> canonical name. A label with no alias and no fuzzy
        match above ``threshold`` maps to its normalized form.
    """
    resolved = {}
    for raw_label in name_variants:
        tidy_label = normalize_name(raw_label)

        # An exact alias hit wins outright; no fuzzy matching needed.
        if tidy_label in alias_map:
            resolved[raw_label] = alias_map[tidy_label]
            continue

        best = process.extractOne(tidy_label, known_names, score_cutoff=threshold)
        if not best:
            # Nothing scored high enough: keep the normalized label itself.
            resolved[raw_label] = tidy_label
            continue

        matched_name, _score, _position = best
        # The matched spelling may itself be an alias, so canonicalise it.
        resolved[raw_label] = alias_map.get(matched_name, matched_name)
    return resolved


def clean_transcripts(data):
    """Normalise transcript entries and relabel unreliable ones as 'nospeaker'.

    Each entry's speaker is stripped and lowercased, and its text is stripped
    with runs of whitespace collapsed to single spaces. The speaker becomes
    'nospeaker' when any of these heuristics fires:

      1. at least two '?' characters and at least two sentences,
      2. ten words or fewer,
      3. at least two "word?" occurrences together with the substring 'yes',
      4. no ASCII letters at all.

    Args:
        data (iterable of dict): Entries with 'speaker' and 'text' keys.

    Returns:
        list of dict: One {'speaker', 'text'} dict per input entry, in order.
    """
    cleaned = []

    for entry in data:
        label = entry['speaker'].strip().lower()
        text = re.sub(r'\s+', ' ', entry['text'].strip())

        n_question_marks = text.count('?')
        n_sentences = len(re.findall(r'\w+[^.?!]*[.?!]', text))

        # All rules lead to the same relabelling, so they combine into one test.
        unreliable = (
            (n_question_marks >= 2 and n_sentences >= 2)
            or len(text.split()) <= 10
            or (len(re.findall(r'\w+\?', text)) >= 2 and 'yes' in text.lower())
            or not re.search(r'[a-zA-Z]', text)
        )

        cleaned.append({
            'speaker': 'nospeaker' if unreliable else label,
            'text': text,
        })

    return cleaned

In [57]:
# Folder holding the raw transcripts (read with np.load below).
transcript_path = '/work/users/s/m/smerrill/Albemarle/transcripts'
# os.listdir returns entries in arbitrary order. Sort them so this loop, and the
# later dataset-building cell that reuses all_transcripts, see the files in the
# same order on every run.
all_transcripts = sorted(os.listdir(transcript_path))
# Folder that receives one cleaned copy per raw transcript, under the same file name.
clean_transcript_path = '/work/users/s/m/smerrill/Albemarle/cleantranscripts'
os.makedirs(clean_transcript_path, exist_ok=True)

for transcript in all_transcripts:
    transcript_file = os.path.join(transcript_path, transcript)
    # SECURITY: allow_pickle=True unpickles arbitrary Python objects; only load
    # transcript files from a trusted source.
    text = np.load(transcript_file, allow_pickle=True)
    detected_names = set([ x['speaker'] for x in text])
    # NOTE(review): name_map is built here but never applied to the entries;
    # clean_transcripts only lowercases the raw labels. Kept as-is because applying
    # it would change the speaker labels the dataset cells match against.
    name_map = build_name_map(detected_names, known_names, alias_map)
    cleaned = clean_transcripts(text)
    np.save(os.path.join(clean_transcript_path, transcript), cleaned)

In [58]:
# Show the cleaned entries of the last transcript processed by the loop above.
cleaned

[{'speaker': 'grahampaige',
  'text': 'All right, it is 630. I called this meeting of the Albemarle County School Board for September 24 to order. And coming out of our closed session, Mrs. Carlson, do you have a motion? I do.'},
 {'speaker': 'katrinacallsen',
  'text': 'Rachel Wainwright-van Kesselmann, Ph.D. : Chair page I move that the board certified by recorded vote that to the best of each board members knowledge only public business matters lawfully exempted from the open meeting requirements of the Virginia freedom of information act. Rachel Wainwright-van Kesselmann, Ph.D. : and identified in the motion authorizing the closed meeting were heard discussed are considered in the closed meeting.'},
 {'speaker': 'nospeaker', 'text': 'Second.'},
 {'speaker': 'nospeaker',
  'text': 'OK, it has been moved by Mrs. Colson and seconded by Ms. Del Caro that we certify that we only talked about what was in the original motion to go into closed session. Is there any discussion? OK, Ms. John

### 2. Make Datasets

In [144]:
import difflib

def build_chat_dataset(
    transcript,
    target_speaker,
    max_context=5,
    max_tokens_per_turn=2000,
    max_total_context_tokens=2000,
    match_threshold=0.7,
):
    """
    Turn one transcript into chat-style examples, one per target-speaker turn.

    Every non-empty utterance whose speaker label fuzzily matches
    ``target_speaker`` yields one example: the preceding non-empty utterances
    (oldest first) followed by the target utterance itself. Other speakers are
    renamed ``Speaker_1``, ``Speaker_2``, ... in order of first appearance
    within this transcript.

    Args:
        transcript (list of dict): Each dict has 'speaker' and 'text'.
        target_speaker (str): The target speaker to model (approximate match).
        max_context (int): Max previous utterances to include.
        max_tokens_per_turn (int): Max whitespace-separated words kept per
            utterance; longer turns are cut and suffixed with " ...".
        max_total_context_tokens (int): Word budget for the context window.
            Collection stops at the first earlier turn that would exceed it.
        match_threshold (float): difflib cutoff (0.0 to 1.0) for the speaker match.

    Returns:
        list of dicts: Each with 'conversation': [role/content dicts].
    """
    role_by_label = {}  # raw speaker label -> role name used in the output
    anonymous_count = 0

    def resolve_role(raw_label):
        # Memoised per label: the fuzzy match depends only on the label itself.
        nonlocal anonymous_count
        if raw_label not in role_by_label:
            matches_target = difflib.get_close_matches(
                raw_label, [target_speaker], n=1, cutoff=match_threshold
            )
            if matches_target:
                role_by_label[raw_label] = target_speaker
            else:
                anonymous_count += 1
                role_by_label[raw_label] = f"Speaker_{anonymous_count}"
        return role_by_label[raw_label]

    def clip(utterance, word_limit):
        # Words stand in for tokens; a clipped turn gains a trailing " ..." marker.
        pieces = utterance.split()
        if len(pieces) <= word_limit:
            return utterance
        return " ".join(pieces[:word_limit]) + " ..."

    examples = []

    for position, entry in enumerate(transcript):
        speaker_label = entry["speaker"]
        utterance = entry["text"].strip()

        if not utterance:
            continue

        # Every non-empty turn gets a role (this fixes the Speaker_N numbering),
        # but only target turns produce an example.
        if resolve_role(speaker_label) != target_speaker:
            continue

        # Walk backwards collecting context until a limit is reached.
        history = []
        words_used = 0
        for back in range(position - 1, -1, -1):
            if len(history) >= max_context:
                break

            earlier = transcript[back]
            earlier_text = earlier["text"].strip()
            if not earlier_text:
                continue

            earlier_role = resolve_role(earlier["speaker"])
            clipped = clip(earlier_text, max_tokens_per_turn)
            cost = len(clipped.split())

            if words_used + cost > max_total_context_tokens:
                break

            history.append({"role": earlier_role, "content": clipped})
            words_used += cost

        history.reverse()  # collected newest-first; emit oldest-first
        history.append({
            "role": target_speaker,
            "content": clip(utterance, max_tokens_per_turn),
        })
        examples.append({"conversation": history})

    return examples


def convert_transcript_to_finetune_data(
    filename,
    target_speaker,
    match_threshold=0.8,
    max_tokens=200,
    max_context_utterances=5,
    min_response_words=3
):
    """
    Build user/assistant fine-tuning examples from a saved transcript file.

    Consecutive target-speaker utterances are merged into one response, which is
    split into chunks of roughly ``max_tokens`` tokens. Each chunk becomes one
    example whose user message is the last ``max_context_utterances`` context
    lines, formatted as "role: text" and joined by newlines.

    Args:
        filename (str): Path to a .npy file holding dicts with 'speaker' and 'text'.
        target_speaker (str): Speaker to model (fuzzy matched with difflib).
        match_threshold (float): difflib cutoff (0.0 to 1.0) for the speaker match.
        max_tokens (int): Approximate token budget per response chunk
            (1 token is taken as 5 characters).
        max_context_utterances (int): Max context lines in the user message.
        min_response_words (int): Chunks with fewer words are skipped.

    Returns:
        list of dict: Each {"messages": [user message, assistant message]}.
    """
    def split_long_response(response_text, max_tokens):
        # Approximate 1 token ≈ 5 characters
        return textwrap.wrap(response_text, width=max_tokens * 5)

    # SECURITY: allow_pickle=True unpickles arbitrary Python objects; only load
    # transcript files from a trusted source.
    lines = np.load(filename, allow_pickle=True)

    # Step 1: Assign each raw speaker label a role.
    # The previous version called speaker_map.setdefault(...) before testing
    # `speaker not in speaker_map`, so the counter never advanced and every
    # non-target speaker was labelled "Speaker_1". Roles are now assigned once per
    # label and the counter advances only when a new non-target speaker appears.
    role_by_speaker = {}
    next_speaker_id = 1
    dialogue = []

    for line in lines:
        speaker = line['speaker']
        text = line['text']

        if not text:
            continue

        if speaker not in role_by_speaker:
            is_target = bool(
                difflib.get_close_matches(speaker, [target_speaker], n=1, cutoff=match_threshold)
            )
            if is_target:
                role_by_speaker[speaker] = target_speaker
            else:
                role_by_speaker[speaker] = f"Speaker_{next_speaker_id}"
                next_speaker_id += 1

        dialogue.append((role_by_speaker[speaker], text))

    # Step 2: Build fine-tuning examples
    examples = []
    context = []
    buffer = []  # consecutive target utterances waiting to be merged

    for i, (speaker, text) in enumerate(dialogue):
        if speaker == target_speaker:
            buffer.append(text)
            next_speaker = dialogue[i + 1][0] if i + 1 < len(dialogue) else None
            # Flush only when the target's run of turns ends.
            if next_speaker != target_speaker:
                full_response = " ".join(buffer)
                response_chunks = split_long_response(full_response, max_tokens)
                trimmed_context = context[-max_context_utterances:]

                for chunk in response_chunks:
                    word_count = len(re.findall(r'\b\w+\b', chunk))
                    # Skip very short chunks and responses that have no context.
                    if word_count >= min_response_words and trimmed_context:
                        examples.append({
                            "messages": [
                                {"role": "user", "content": "\n".join(trimmed_context)},
                                {"role": "assistant", "content": chunk}
                            ]
                        })

                # The merged response becomes context for later examples.
                context.append(f"{speaker}: {full_response}")
                buffer = []
        else:
            context.append(f"{speaker}: {text}")

    return examples

def save_chat_dataset(data, output_path):
    """
    Save a list of chat-style message dictionaries to a JSONL or JSON file.

    A path ending in ".jsonl" is written as one JSON object per line, which is
    the layout load_chat_dataset expects for that extension. Any other path is
    written as a single indented JSON array.

    Args:
        data (list): JSON-serialisable examples.
        output_path (str or path-like): Destination file; it is overwritten.
    """
    with open(output_path, "w", encoding="utf-8") as f:
        if str(output_path).endswith(".jsonl"):
            for example in data:
                f.write(json.dumps(example) + "\n")
        else:
            json.dump(data, f, indent=2)
    print(f"Dataset saved to: {output_path}")
    print(f"Total examples: {len(data)}")

def load_chat_dataset(input_path):
    """
    Load a chat-style message dataset from a JSON or JSONL file.

    A path ending in ".jsonl" is read as one JSON object per line (blank lines
    are skipped). Any other path is parsed as a single JSON document.

    Args:
        input_path (str or path-like): File to read.

    Returns:
        data (list): A list of message dictionaries.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        # str() lets pathlib.Path inputs through; Path has no .endswith.
        if str(input_path).endswith(".jsonl"):
            # Skip blank lines so a stray or trailing empty line does not raise.
            data = [json.loads(line) for line in f if line.strip()]
        else:  # Assume .json
            data = json.load(f)

    print(f"Dataset loaded from: {input_path}")
    print(f"Total examples: {len(data)}")
    return data

def count_context_and_content_words(dataset, target_speaker):
    """
    Summarise per-example word counts for context versus target-speaker content.

    Within each example, words in messages whose role equals ``target_speaker``
    count as "content"; words in all other messages count as "context". Words
    are matched with the regex ``\\b\\w+\\b``.

    Args:
        dataset (list): Dicts with either a 'conversation' or a 'messages' key,
                        each holding dicts with 'role' and 'content'.
        target_speaker (str): Role name of the target speaker (e.g. "grahampaige").

    Returns:
        dict: total, average, min and max word counts for context and content.
              All values are 0 for an empty dataset.
    """
    word_pattern = re.compile(r'\b\w+\b')
    context_totals = []
    content_totals = []

    for example in dataset:
        turns = example.get("conversation") or example.get("messages") or []
        tally = {"context": 0, "content": 0}

        for turn in turns:
            n_words = len(word_pattern.findall(turn.get("content", "")))
            bucket = "content" if turn.get("role", "") == target_speaker else "context"
            tally[bucket] += n_words

        context_totals.append(tally["context"])
        content_totals.append(tally["content"])

    n_examples = len(dataset)

    def mean(values):
        return sum(values) / n_examples if n_examples else 0

    def lowest(values):
        return min(values) if values else 0

    def highest(values):
        return max(values) if values else 0

    return {
        "total_context_words": sum(context_totals),
        "total_content_words": sum(content_totals),
        "avg_context_words": mean(context_totals),
        "avg_content_words": mean(content_totals),
        "min_context_words": lowest(context_totals),
        "max_context_words": highest(context_totals),
        "min_content_words": lowest(content_totals),
        "max_content_words": highest(content_totals),
    }


In [36]:
# Folder for the per-speaker dataset files. The trailing slash matters: later
# cells build file names by plain string concatenation with this value.
dataset_save_path = '/work/users/s/m/smerrill/Albemarle/dataset/'

In [108]:
# Build and save one chat dataset per board member, pooling every cleaned transcript.
# save_chat_dataset does not create folders, so make sure the target exists first.
os.makedirs(dataset_save_path, exist_ok=True)

datasets = []  # paths of the files written below
for speaker in target_names:
    # Use the lowercase, space-free form of the name as the speaker key.
    speaker = speaker.lower().replace(' ', '')
    print(speaker)
    examples = []
    for transcript in all_transcripts:
        cleaned_transcript = os.path.join(clean_transcript_path, transcript)
        # SECURITY: allow_pickle=True unpickles arbitrary objects; trusted files only.
        examples.extend(
            build_chat_dataset(np.load(cleaned_transcript, allow_pickle=True), speaker)
        )
    # `speaker` is already normalised above, so it can be used directly in the file name.
    save_name = dataset_save_path + speaker + '.txt'
    save_chat_dataset(examples, save_name)
    datasets.append(save_name)
    print('---------------------------------------')

ellenosborne
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/ellenosborne.txt
Total examples: 68
---------------------------------------
davidoberg
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/davidoberg.txt
Total examples: 264
---------------------------------------
grahampaige
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/grahampaige.txt
Total examples: 896
---------------------------------------
jonnoalcaro
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/jonnoalcaro.txt
Total examples: 431
---------------------------------------
katrinacallsen
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/katrinacallsen.txt
Total examples: 486
---------------------------------------
kateacuff
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/kateacuff.txt
Total examples: 296
---------------------------------------
judyle
Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/judyle.txt
Total examples: 160
------

In [176]:
# NOTE(review): in the visible notebook `tmp` is only assigned inside the
# dataset-metastats loop further down, so this inspection cell appears to rely on
# that cell having been run first — confirm before a top-to-bottom run.
tmp

[{'conversation': [{'role': 'Speaker_3',
    'content': 'I move that we approve the consent agenda.'},
   {'role': 'Speaker_3', 'content': 'Second.'},
   {'role': 'Speaker_3', 'content': 'OK, was that Mrs. Moore?'},
   {'role': 'Speaker_3', 'content': 'Osborne, yes.'},
   {'role': 'Speaker_1',
    'content': 'I mean, sorry about that, Ms. Osborne. All right, moved by Mr. Alcaro, seconded by Ms. Osborne that we approve the consent agenda. Any discussion?'},
   {'role': 'judyle',
    'content': "I did want to just bring up that I really appreciate that the policies are now, we're now seeing the equity checklist on each policy, and I know that we had initially asked for just a check, but I really appreciate the added information that we've been getting on those, because it definitely helps me know how we're processing those kinds of things, so I appreciate it."}]},
 {'conversation': [{'role': 'Speaker_24',
    'content': "The total number of educators hired is 124, which is about 60 fewer

### 3. Dataset Metastats

In [162]:
# Columns copied from count_context_and_content_words into each stats row.
STAT_COLUMNS = (
    "total_context_words",
    "total_content_words",
    "avg_context_words",
    "avg_content_words",
    "min_context_words",
    "max_context_words",
    "min_content_words",
    "max_content_words",
)

# Collect one row of word statistics per board member from the saved datasets.
rows = []

for speaker in target_names:
    speaker_key = speaker.replace(' ', '').lower()
    filename = dataset_save_path + speaker_key + '.txt'

    try:
        tmp = load_chat_dataset(filename)
        word_counts = count_context_and_content_words(tmp, speaker_key)

        row = {"speaker": speaker}
        for column in STAT_COLUMNS:
            row[column] = word_counts[column]
        rows.append(row)
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as e:
        # Any other failure is reported and the speaker is skipped.
        print(f"Error processing {filename}: {e}")

df = pd.DataFrame(rows)


Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/ellenosborne.txt
Total examples: 68
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/davidoberg.txt
Total examples: 264
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/grahampaige.txt
Total examples: 896
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/jonnoalcaro.txt
Total examples: 431
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/katrinacallsen.txt
Total examples: 486
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/kateacuff.txt
Total examples: 296
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/judyle.txt
Total examples: 160


In [163]:
# Per-speaker word statistics for the transcript-derived datasets.
df

Unnamed: 0,speaker,total_context_words,total_content_words,avg_context_words,avg_content_words,min_context_words,max_context_words,min_content_words,max_content_words
0,Ellen Osborne,29067,4723,427.455882,69.455882,27,1918,11,753
1,David Oberg,94504,25058,357.969697,94.916667,0,2025,11,597
2,Graham Paige,223823,64055,249.802455,71.489955,0,2007,11,764
3,Jonno Alcaro,139090,48621,322.714617,112.809745,0,1953,11,828
4,Katrina Callsen,227408,73989,467.917695,152.240741,0,1959,14,990
5,Kate Acuff,103176,27506,348.567568,92.925676,0,2005,11,521
6,Judy Le,60010,15574,375.0625,97.3375,9,1785,12,780


### 4. Synthetic Data

In [180]:
def convert_to_conversation(dataset, role_map):
    """
    Re-express examples in the 'conversation' format with renamed roles.

    Messages whose role is 'system' are dropped. Remaining roles are looked up
    in ``role_map`` when it is a dict (unknown roles are kept unchanged); when
    it is not a dict, every role is replaced by ``role_map`` itself.

    Args:
        dataset (list): List of examples with 'messages' or 'conversation'.
        role_map (dict or str): Role name mapping, or one role name for all messages.

    Returns:
        list: New dataset where each example is {'conversation': [role/content dicts]}.
    """
    def rename(role):
        if isinstance(role_map, dict):
            return role_map.get(role, role)
        return role_map

    converted = []
    for example in dataset:
        # Accept either input layout; a missing or empty list yields an empty conversation.
        turns = example.get("conversation") or example.get("messages") or []
        kept = [
            {"role": rename(turn.get("role", "")), "content": turn.get("content", "")}
            for turn in turns
            if turn.get("role", "") != "system"
        ]
        converted.append({"conversation": kept})

    return converted


In [172]:
# Folder holding the synthetic Q&A JSON files that the cells below read.
synth_path = '/nas/longleaf/home/smerrill/notebooks/LLM/data/final'

In [181]:
# Kate Acuff: read both synthetic Q&A files, merge them, rename the roles and save.
acuff_qa_file = os.path.join(synth_path, 'acuff_qa_pairs_ft.json')
acuff_yt_file = os.path.join(synth_path, 'YT_acuff_qa_pairs_ft.json')

with open(acuff_qa_file, 'r', encoding='utf-8') as f:
    acuff1 = json.load(f)
with open(acuff_yt_file, 'r', encoding='utf-8') as f:
    acuff2 = json.load(f)

# user -> generic speaker, assistant -> the board member's role name.
role_mapping = {"user": "Speaker_1", "assistant": "kateacuff"}
acuff = convert_to_conversation(acuff1 + acuff2, role_mapping)

save_chat_dataset(acuff, dataset_save_path + 'synth_kateacuff.txt')

Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/synth_kateacuff.txt
Total examples: 40


In [182]:
# Inspect the converted synthetic examples for Kate Acuff.
acuff

[{'conversation': [{'role': 'Speaker_1',
    'content': 'When was Kate Acuff first elected to the Albemarle County Public School Board?'},
   {'role': 'kateacuff', 'content': 'November 2013'}]},
 {'conversation': [{'role': 'Speaker_1',
    'content': 'What position was Kate Acuff selected as in 2015?'},
   {'role': 'kateacuff', 'content': 'Vice Chair'}]},
 {'conversation': [{'role': 'Speaker_1',
    'content': 'For how many years was Kate Acuff the Chair of the Albemarle County Public School Board?'},
   {'role': 'kateacuff', 'content': 'Three years'}]},
 {'conversation': [{'role': 'Speaker_1',
    'content': 'What fields of experience does Dr. Acuff bring to her service on the Board?'},
   {'role': 'kateacuff',
    'content': 'Law, public health, science, and policy in the private, public, and non-profit sectors'}]},
 {'conversation': [{'role': 'Speaker_1',
    'content': "What is Dr. Acuff's current position with the Partner for Mental Health organization?"},
   {'role': 'kateacuff',

[{'conversation': [{'role': 'Speaker_1',
    'content': 'When was Kate Acuff first elected to the Albemarle County Public School Board?'},
   {'role': 'kateacuff', 'content': 'November 2013'}]},
 {'conversation': [{'role': 'Speaker_1',
    'content': 'What position was Kate Acuff selected as in 2015?'},
   {'role': 'kateacuff', 'content': 'Vice Chair'}]},
 {'conversation': [{'role': 'Speaker_1',
    'content': 'For how many years was Kate Acuff the Chair of the Albemarle County Public School Board?'},
   {'role': 'kateacuff', 'content': 'Three years'}]},
 {'conversation': [{'role': 'Speaker_1',
    'content': 'What fields of experience does Dr. Acuff bring to her service on the Board?'},
   {'role': 'kateacuff',
    'content': 'Law, public health, science, and policy in the private, public, and non-profit sectors'}]},
 {'conversation': [{'role': 'Speaker_1',
    'content': "What is Dr. Acuff's current position with the Partner for Mental Health organization?"},
   {'role': 'kateacuff',

In [183]:
# Ellen Osborne: read both synthetic Q&A files, merge them, rename the roles and save.
osborne_qa_file = os.path.join(synth_path, 'osborne_qa_pairs_ft.json')
osborne_yt_file = os.path.join(synth_path, 'YT_osborne_qa_pairs_ft.json')

with open(osborne_qa_file, 'r', encoding='utf-8') as f:
    osborne1 = json.load(f)
with open(osborne_yt_file, 'r', encoding='utf-8') as f:
    osborne2 = json.load(f)

# user -> generic speaker, assistant -> the board member's role name.
role_mapping = {"user": "Speaker_1", "assistant": "ellenosborne"}
osborne = convert_to_conversation(osborne1 + osborne2, role_mapping)

save_chat_dataset(osborne, dataset_save_path + 'synth_ellenosborne.txt')

Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/synth_ellenosborne.txt
Total examples: 41


In [184]:
# Graham Paige: read both synthetic Q&A files, merge them, rename the roles and save.
paige_qa_file = os.path.join(synth_path, 'paige_qa_pairs_ft.json')
paige_yt_file = os.path.join(synth_path, 'YT_paige_qa_pairs_ft.json')

with open(paige_qa_file, 'r', encoding='utf-8') as f:
    paige1 = json.load(f)
with open(paige_yt_file, 'r', encoding='utf-8') as f:
    paige2 = json.load(f)

# user -> generic speaker, assistant -> the board member's role name.
role_mapping = {"user": "Speaker_1", "assistant": "grahampaige"}
paige = convert_to_conversation(paige1 + paige2, role_mapping)

save_chat_dataset(paige, dataset_save_path + 'synth_grahampaige.txt')

Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/synth_grahampaige.txt
Total examples: 50


In [185]:
# Judy Le: read both synthetic Q&A files, merge them, rename the roles and save.
le_qa_file = os.path.join(synth_path, 'le_qa_pairs_ft.json')
le_yt_file = os.path.join(synth_path, 'YT_le_qa_pairs_ft.json')

with open(le_qa_file, 'r', encoding='utf-8') as f:
    le1 = json.load(f)
with open(le_yt_file, 'r', encoding='utf-8') as f:
    le2 = json.load(f)

# user -> generic speaker, assistant -> the board member's role name.
role_mapping = {"user": "Speaker_1", "assistant": "judyle"}
le = convert_to_conversation(le1 + le2, role_mapping)

save_chat_dataset(le, dataset_save_path + 'synth_judyle.txt')

Dataset saved to: /work/users/s/m/smerrill/Albemarle/dataset/synth_judyle.txt
Total examples: 43


In [186]:
# Start an empty list of stats rows for the synthetic datasets; the next cells append to it.
rows = [] 

In [187]:
# Append the synthetic-data word statistics for Kate Acuff.
# Re-running this cell appends another row to `rows`.
word_counts = count_context_and_content_words(acuff, 'kateacuff')

rows.append({
    "speaker": 'Kate Acuff',
    **{
        stat: word_counts[stat]
        for stat in (
            "total_context_words", "total_content_words",
            "avg_context_words", "avg_content_words",
            "min_context_words", "max_context_words",
            "min_content_words", "max_content_words",
        )
    },
})

In [188]:
# Append the synthetic-data word statistics for Ellen Osborne.
# Re-running this cell appends another row to `rows`.
word_counts = count_context_and_content_words(osborne, 'ellenosborne')

rows.append({
    "speaker": 'Ellen Osborne',
    **{
        stat: word_counts[stat]
        for stat in (
            "total_context_words", "total_content_words",
            "avg_context_words", "avg_content_words",
            "min_context_words", "max_context_words",
            "min_content_words", "max_content_words",
        )
    },
})

In [189]:
# Append the synthetic-data word statistics for Graham Paige.
# Re-running this cell appends another row to `rows`.
word_counts = count_context_and_content_words(paige, 'grahampaige')

rows.append({
    "speaker": 'Graham Paige',
    **{
        stat: word_counts[stat]
        for stat in (
            "total_context_words", "total_content_words",
            "avg_context_words", "avg_content_words",
            "min_context_words", "max_context_words",
            "min_content_words", "max_content_words",
        )
    },
})

In [190]:
# Append the synthetic-data word statistics for Judy Le.
# Re-running this cell appends another row to `rows`.
word_counts = count_context_and_content_words(le, 'judyle')

rows.append({
    "speaker": 'Judy Le',
    **{
        stat: word_counts[stat]
        for stat in (
            "total_context_words", "total_content_words",
            "avg_context_words", "avg_content_words",
            "min_context_words", "max_context_words",
            "min_content_words", "max_content_words",
        )
    },
})

In [191]:
# Tabulate the synthetic-data word statistics collected in `rows`.
synthetic_df = pd.DataFrame(rows)

In [192]:
# Show the per-speaker word statistics for the synthetic datasets.
synthetic_df

Unnamed: 0,speaker,total_context_words,total_content_words,avg_context_words,avg_content_words,min_context_words,max_context_words,min_content_words,max_content_words
0,Kate Acuff,468,216,11.7,5.4,7,17,1,17
1,Ellen Osborne,422,409,10.292683,9.97561,4,19,1,35
2,Graham Paige,529,354,10.58,7.08,4,16,1,17
3,Judy Le,462,445,10.744186,10.348837,6,17,1,27


### 5. Train/Test Split

In [193]:
def train_test_split(member: str, test_size: float = 0.2, seed: int = 42, data_path: str = '/work/users/s/m/smerrill/Albemarle/dataset') -> Tuple[List[dict], List[dict]]:
    """
    Splits the dataset into training and test sets. Synthetic data is always added to the training set.

    Arguments are validated before any file is read. The shuffle uses a private
    random.Random(seed), so the module-level `random` state is left untouched.
    For a given seed the permutation is the same one that
    random.seed(seed) followed by random.shuffle would give.

    NOTE(review): examples produced by build_chat_dataset include earlier
    utterances as context, so with a shuffled split a test example's target turn
    may also appear inside a training example's context — confirm this overlap
    is acceptable for the evaluation.

    Parameters:
    - member: The name identifier for the board member.
    - test_size: Proportion of the real (non-synthetic) data to include in the test split.
    - seed: Random seed for reproducibility.
    - data_path: Base directory for the dataset files.

    Returns:
    - A tuple (train_data, test_data)

    Raises:
    - ValueError: if `member` is unknown or `test_size` is not strictly between 0 and 1.
    """
    # member -> (real dataset file, synthetic dataset file or None when there is none)
    dataset_files = {
        'acuff': ('kateacuff.txt', 'synth_kateacuff.txt'),
        'osborne': ('ellenosborne.txt', 'synth_ellenosborne.txt'),
        'paige': ('grahampaige.txt', 'synth_grahampaige.txt'),
        'le': ('judyle.txt', 'synth_judyle.txt'),
        'callsen': ('katrinacallsen.txt', None),
        'oberg': ('davidoberg.txt', None),
        'alcaro': ('jonnoalcaro.txt', None),
    }

    if member not in dataset_files:
        raise ValueError(f"Unknown member: {member}")

    if not 0 < test_size < 1:
        raise ValueError("test_size must be a float between 0 and 1.")

    real_file, synth_file = dataset_files[member]
    real_data = load_chat_dataset(os.path.join(data_path, real_file))
    synth_data = load_chat_dataset(os.path.join(data_path, synth_file)) if synth_file else []

    # Shuffle and split only the real data
    shuffled_real = real_data.copy()
    random.Random(seed).shuffle(shuffled_real)

    split_index = int(len(shuffled_real) * (1 - test_size))
    train_data = shuffled_real[:split_index] + synth_data
    test_data = shuffled_real[split_index:]

    return train_data, test_data

In [194]:
# Run the split once for every supported member. Each call overwrites
# train_data/test_data, so only the final ('alcaro') split remains afterwards.
for member in ('acuff', 'osborne', 'paige', 'le', 'callsen', 'oberg', 'alcaro'):
    train_data, test_data = train_test_split(member)

Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/kateacuff.txt
Total examples: 296
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/synth_kateacuff.txt
Total examples: 40
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/ellenosborne.txt
Total examples: 68
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/synth_ellenosborne.txt
Total examples: 41
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/grahampaige.txt
Total examples: 896
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/synth_grahampaige.txt
Total examples: 50
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/judyle.txt
Total examples: 160
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/synth_judyle.txt
Total examples: 43
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/katrinacallsen.txt
Total examples: 486
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/davidoberg.txt
Total examp

In [195]:
# Inspect the training split left by the last train_test_split call above.
train_data

[{'conversation': [{'role': 'Speaker_1',
    'content': 'Ms. Lee. Yes. Ms. Osborne. Yes. Mr. Page. Yes. Motion carried.'},
   {'role': 'Speaker_3',
    'content': "Thank you. All right, moving next to our spotlight on education. I don't have in my notes who is going to introduce the spotlight tonight."},
   {'role': 'Speaker_4',
    'content': "I'm going to be doing this. Thank you, Chair Page. I'm delighted to introduce tonight's spotlight on education because I'm the daughter of a public school science teacher who reached out to broaden his experiences in science and creatively engaged students. So spotlighting one of Albemarle County's own is a real treat. In 1994, the US Congress signed into law the Albert Einstein Distinguished Educator Fellowship Act, which provided a unique opportunity for highly accomplished K-12 teachers to serve in a national setting, one of which is a Smithsonian Institute. Teachers are given access to and will leave with a national network of leaders and pr