In [194]:
import re
import json
from collections import defaultdict
import difflib
import textwrap
import os

import sys
from rapidfuzz import process
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from typing import List, Tuple

from rapidfuzz import fuzz
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from typing import List, Dict
from rapidfuzz import fuzz

### 1. Clean transcripts

In [195]:
# Predefined canonical names
target_names = [
    'Ellen Osborne', 'David Oberg', 'Graham Paige', 'Jonno Alcaro',
    'Katrina Callsen', 'Kate Acuff', 'Judy Le'
]

# Known aliases → canonical name
alias_map = {
    "Katherin Acuff": "Kate Acuff",
    "Katherine Acuff": "Kate Acuff",
    "Kate Acuff": "Kate Acuff",
    "Jonathan Alcaro": "Jonno Alcaro",
    "Jon Alcaro": "Jonno Alcaro",
    "Jonno Alcaro": "Jonno Alcaro"
}

# Merge all names used for matching
known_names = list(set(target_names + list(alias_map.keys())))

def normalize_name(name):
    # Remove extra characters, insert space before capital letters if needed
    name = re.sub(r'[^a-zA-Z ]+', '', name)  # remove punctuation
    name = re.sub(r'([a-z])([A-Z])', r'\1 \2', name)  # camelCase to spaced
    return name.strip().title()


def build_name_map(name_variants, known_names, alias_map, threshold=75):
    name_map = {}
    for name in name_variants:
        normalized_name = normalize_name(name)

        # Alias check
        if normalized_name in alias_map:
            name_map[name] = alias_map[normalized_name]
            continue

        # Fuzzy match
        result = process.extractOne(normalized_name, known_names, score_cutoff=threshold)
        if result:
            match, _, _ = result
            canonical = alias_map.get(match, match)
            name_map[name] = canonical
        else:
            # Fallback
            name_map[name] = normalized_name
    return name_map


def clean_transcripts(data):
    cleaned = []

    for entry in data:
        speaker = entry['speaker'].strip().lower()
        text = entry['text'].strip()

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text)

        # Count question marks and short Q&A-like sentences
        question_count = text.count('?')
        sentence_count = len(re.findall(r'\w+[^.?!]*[.?!]', text))

        # Rule 1: Too many Q&A pairs in one segment
        if question_count >= 2 and sentence_count >= 2:
            speaker = 'unknownspeaker'

        # Rule 2: Text too short
        elif len(text.split()) <= 10:
            speaker = 'unknownspeaker'

        # Rule 3: Multiple "?" or alternating Q&A markers in one line
        elif len(re.findall(r'\w+\?', text)) >= 2 and 'yes' in text.lower():
            speaker = 'unknownspeaker'

        # Rule 4: No alphabetic content
        elif not re.search(r'[a-zA-Z]', text):
            speaker = 'unknownspeaker'
        elif speaker == 'nospeaker':
            speaker = 'unknownspeaker'

        cleaned.append({'speaker': speaker, 'text': text})

    return merge_consecutive_speakers(cleaned)

def merge_consecutive_speakers(transcript):
    """
    Merges consecutive transcript entries from the same speaker.
    
    Args:
        transcript (list of dict): Each dict has 'speaker' and 'text'.
    
    Returns:
        list of dict: Cleaned list with merged consecutive speaker entries.
    """
    if not transcript:
        return []

    merged = []
    current_speaker = transcript[0]['speaker']
    current_text = transcript[0]['text'].strip()

    for entry in transcript[1:]:
        speaker = entry['speaker']
        text = entry['text'].strip()

        if speaker == current_speaker:
            current_text += " " + text
        else:
            merged.append({'speaker': current_speaker, 'text': current_text})
            current_speaker = speaker
            current_text = text

    # Append the final block
    merged.append({'speaker': current_speaker, 'text': current_text})
    return merged


In [196]:
transcript_path = '/work/users/s/m/smerrill/Albemarle/transcripts'
all_transcripts = os.listdir(transcript_path)
clean_transcript_path = '/work/users/s/m/smerrill/Albemarle/cleantranscripts'
os.makedirs(clean_transcript_path, exist_ok=True)

for transcript in all_transcripts:
    transcript_file = os.path.join(transcript_path, transcript)
    text = np.load(transcript_file, allow_pickle=True)
    detected_names = set([ x['speaker'] for x in text])
    name_map = build_name_map(detected_names, known_names, alias_map)
    cleaned = clean_transcripts(text)
    np.save(os.path.join(clean_transcript_path, transcript), cleaned)

In [197]:
cleaned

[{'speaker': 'grahampaige',
  'text': 'All right, it is 630. I called this meeting of the Albemarle County School Board for September 24 to order. And coming out of our closed session, Mrs. Carlson, do you have a motion? I do.'},
 {'speaker': 'katrinacallsen',
  'text': 'Rachel Wainwright-van Kesselmann, Ph.D. : Chair page I move that the board certified by recorded vote that to the best of each board members knowledge only public business matters lawfully exempted from the open meeting requirements of the Virginia freedom of information act. Rachel Wainwright-van Kesselmann, Ph.D. : and identified in the motion authorizing the closed meeting were heard discussed are considered in the closed meeting.'},
 {'speaker': 'unknownspeaker',
  'text': "Second. OK, it has been moved by Mrs. Colson and seconded by Ms. Del Caro that we certify that we only talked about what was in the original motion to go into closed session. Is there any discussion? OK, Ms. Johnston, could you take the roll cal

### 2. Make Datasets

In [285]:
def build_conversation_from_transcript(
    transcript: List[Dict[str, str]], 
    tokenizer_name: str = "unsloth/Llama-3.3-70B-Instruct-bnb-4bit", 
    target_speaker: str = "grahampaige", 
    max_tokens: int = 850,
    min_utterance_tokens: int = 100,
    fuzzy_threshold: int = 75
) -> List[str]:
    """
    Splits a transcript into non-overlapping segments that:
      - Contain at least one utterance from the target speaker (using fuzzy matching)
      - Contain at least one utterance from a different speaker
      - Are <= max_tokens total
      - Contain only utterances >= min_utterance_tokens
      - Consecutive utterances from same speaker are merged
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    segments = []
    current_segment = []
    current_token_count = 0
    speakers_in_segment = set()

    def is_target_speaker(speaker: str) -> bool:
        return fuzz.ratio(speaker.lower(), target_speaker.lower()) >= fuzzy_threshold

    def flush_segment():
        nonlocal current_segment, current_token_count, speakers_in_segment
        if not current_segment:
            return
        has_target = any(is_target_speaker(utt['speaker']) for utt in current_segment)
        has_other = any(not is_target_speaker(utt['speaker']) for utt in current_segment)
        if has_target and has_other:
            # Merge consecutive same-speaker lines
            merged = []
            prev_speaker = None
            buffer = []
            for utt in current_segment:
                if utt['speaker'] == prev_speaker:
                    buffer.append(utt['text'])
                else:
                    if buffer:
                        merged.append(f"{prev_speaker}: {' '.join(buffer)}")
                    prev_speaker = utt['speaker']
                    buffer = [utt['text']]
            if buffer:
                merged.append(f"{prev_speaker}: {' '.join(buffer)}")
            segments.append("\n".join(merged))
        current_segment = []
        current_token_count = 0
        speakers_in_segment = set()

    for entry in transcript:
        speaker = entry.get("speaker", "").strip()
        text = entry.get("text", "").strip()
        if not text:
            continue

        entry_text = f"{speaker}: {text}"
        entry_token_count = len(tokenizer.tokenize(entry_text))

        # Skip too short
        if entry_token_count < min_utterance_tokens:
            continue

        if current_token_count + entry_token_count > max_tokens:
            flush_segment()

        current_segment.append({'speaker': speaker, 'text': text})
        current_token_count += entry_token_count
        speakers_in_segment.add(speaker)

    flush_segment()
    return segments

def build_completion_from_transcript(
    transcript: List[Dict[str, str]],
    tokenizer_name: str = "unsloth/Llama-3.3-70B-Instruct-bnb-4bit",
    target_speaker: str = "grahampaige",
    max_total_tokens: int = 850,
    min_completion_tokens: int = 20,
    fuzzy_threshold: int = 75
) -> List[Dict[str, str]]:
    """
    Builds a completion-style test set:
      - prompt = prior dialogue context
      - completion = next utterance by target speaker
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    test_set = []

    def is_target_speaker(speaker: str) -> bool:
        return fuzz.ratio(speaker.lower(), target_speaker.lower()) >= fuzzy_threshold

    for i, entry in enumerate(transcript):
        speaker = entry.get("speaker", "").strip()
        text = entry.get("text", "").strip()

        if not text or not is_target_speaker(speaker):
            continue

        completion = f"{speaker}: {text}"
        completion_tokens = len(tokenizer.tokenize(completion))

        # Skip if target utterance is too short
        if completion_tokens < min_completion_tokens:
            continue

        # Gather context up to max_total_tokens (including the completion)
        context_entries = []
        context_token_count = completion_tokens

        j = i - 1
        while j >= 0 and context_token_count < max_total_tokens:
            prev_entry = transcript[j]
            prev_text = f"{prev_entry.get('speaker', '').strip()}: {prev_entry.get('text', '').strip()}"
            tokens = len(tokenizer.tokenize(prev_text))

            if context_token_count + tokens > max_total_tokens:
                break

            context_entries.insert(0, prev_text)
            context_token_count += tokens
            j -= 1

        # Only add if we have non-empty context
        if context_entries:
            prompt = "\n".join(context_entries)
            test_set.append({
                "prompt": prompt.strip(),
                "completion": completion.strip()
            })

    return test_set


def format_chat(prompt_text, target_speaker):
    lines = prompt_text.split('\n')
    text = ''
    current_block_type = None
    block_lines = []

    def flush_block():
        nonlocal text, block_lines, current_block_type
        if not block_lines:
            return
        if current_block_type == 'assistant':
            text += "<|start_header_id|>assistant<|end_header_id|>\n\n"
        else:
            text += "<|start_header_id|>user<|end_header_id|>\n\n"
        text += '\n'.join(block_lines) + '<|eot_id|>\n\n'
        block_lines.clear()

    for line in lines:
        if not line.strip():
            continue
        speaker_name = line.split(':')[0].strip()
        block_type = 'assistant' if speaker_name == target_speaker else 'user'

        if block_type != current_block_type:
            flush_block()
            current_block_type = block_type

        block_lines.append(line)

    flush_block()
    return text

def count_user_assistant_tokens(
    dataset: List[str],
    tokenizer_name: str = "unsloth/Llama-3.3-70B-Instruct-bnb-4bit"
) -> Tuple[int, int, int]:
    """
    Counts tokens in a dataset for assistant vs user based on header tags.

    Args:
        dataset (List[str]): list of string-formatted examples.
        tokenizer_name (str): tokenizer to use.

    Returns:
        (context_tokens, target_tokens, total_tokens)
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    context_tokens = 0
    target_tokens = 0
    total_tokens = 0

    # Match blocks like <|start_header_id|>user<|end_header_id|>\n\n ... <|eot_id|>
    pattern = re.compile(
        r"<\|start_header_id\|>(user|assistant)<\|end_header_id\|>\s*\n\n(.*?)<\|eot_id\|>",
        re.DOTALL,
    )

    for example in dataset:
        for role, text in pattern.findall(example):
            tokens = len(tokenizer.tokenize(text.strip()))
            total_tokens += tokens
            if role == "user":
                context_tokens += tokens
            elif role == "assistant":
                target_tokens += tokens

    return context_tokens, target_tokens, total_tokens


def count_prompt_completion_tokens(
    dataset: List[Dict[str, str]],
    tokenizer_name: str = "unsloth/Llama-3.3-70B-Instruct-bnb-4bit"
) -> Tuple[int, int, int]:
    """
    Count total tokens in prompt (context) and completion (target) fields.

    Args:
        dataset: List of dictionaries with 'prompt' and 'completion' keys.
        tokenizer_name: Name of the tokenizer to use.

    Returns:
        Tuple of (context_tokens, target_tokens, total_tokens)
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    context_tokens = 0
    target_tokens = 0

    for item in dataset:
        prompt_text = item.get("prompt", "")
        completion_text = item.get("completion", "")
        
        context_tokens += len(tokenizer.tokenize(prompt_text.strip()))
        target_tokens += len(tokenizer.tokenize(completion_text.strip()))

    total_tokens = context_tokens + target_tokens
    return context_tokens, target_tokens, total_tokens

def collect_target_speaker_chunks(
    cleaned: List[Dict[str, str]],
    tokenizer_name: str = "unsloth/Llama-3.3-70B-Instruct-bnb-4bit",
    target_speaker: str = "grahampaige",
    fuzzy_threshold: int = 75,
    min_thresh: int = 500,
    max_thresh: int = 800,
    min_target: int = 20,
) -> List[List[str]]:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    examples = []
    all_text = []
    total_token = 0
    target_token_count = 0
    prev_speaker = None

    for i in range(len(cleaned)):
        speaker = cleaned[i].get("speaker", "").strip()
        text = cleaned[i].get("text", "").strip()
        if not speaker or not text:
            continue

        # If same speaker as previous line, just add text; else prefix with speaker
        if speaker == prev_speaker:
            line = text
        else:
            line = f"{speaker}: {text}"
            prev_speaker = speaker

        token_count = len(tokenizer.tokenize(line))
        total_token += token_count

        if fuzz.ratio(speaker.lower(), target_speaker.lower()) >= fuzzy_threshold:
            line = f"{target_speaker}: {text}"
            target_token_count += token_count

        all_text.append(line)

        if total_token >= min_thresh:
            if total_token <= max_thresh and target_token_count >= min_target:
                examples.append('\n'.join(all_text))

            # reset window
            all_text = []
            total_token = 0
            target_token_count = 0
            prev_speaker = None  # reset previous speaker too

    return examples

In [286]:
dataset_save_path = '/work/users/s/m/smerrill/Albemarle/dataset/'
clean_transcript_path = '/work/users/s/m/smerrill/Albemarle/cleantranscripts'
all_transcripts = os.listdir(clean_transcript_path)

# Ensure consistent shuffling
train_transcripts, test_transcripts = train_test_split(
    all_transcripts, test_size=0.1, random_state=42)

## Train data is conversation style

In [287]:
train_datasets = []
for speaker in target_names:
    speaker = speaker.lower().replace(' ', '')
    print(speaker)
    examples = []
    for transcript in train_transcripts:
        cleaned_transcript = os.path.join(clean_transcript_path, transcript)

        cleaned = np.load(cleaned_transcript, allow_pickle=True)

        cleaned_segments = collect_target_speaker_chunks(cleaned, target_speaker=speaker)

        examples += ['<|begin_of_text|>' + format_chat(segment, speaker) for segment in cleaned_segments]

    
    save_name = dataset_save_path + speaker.replace(' ', '').lower() + '_train.npy'    
    np.save(save_name, examples)
    train_datasets.append(save_name)
    print('---------------------------------------')

ellenosborne
---------------------------------------
davidoberg
---------------------------------------
grahampaige
---------------------------------------
jonnoalcaro
---------------------------------------
katrinacallsen
---------------------------------------
kateacuff
---------------------------------------
judyle
---------------------------------------


In [288]:
print(speaker)

for example in examples:
    print(example)
    print('---------------------------------------')

judyle
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

katrinacallser: Patrick, I have a quick question. Yes. How does this compare to Charlottesville's plan? I guess what I'm asking you with the February date, I'm just I'm wondering because I just read in the newspaper last week about how Charlottesville is voting next week to open in January or something along those lines, I could be misinformed.
patrickmclaughlin: So I would say I am I'm not I saw that Charlottesville had made some decisions, but I'm not familiar with the details of those so I can research that a little bit and and get back to you with some information about that.
unknownspeaker: Okay.<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>

judyle: I just, I have a comment and a request, I guess. I do really appreciate the shift in timeline so that people have more information. And I appreciate the stakeholders being a part of the process. So I know that we're all, you know, designing the plane as we fl

In [257]:
print(collect_target_speaker_chunks(cleaned)[0])

grahampaige: first meeting is called to order. Ms. Colson, do you have a motion?
katrinacallsen: I do. I move that the Albemarle County Public School Board go into a closed meeting as authorized by the Virginia Freedom of Information Act section 2.2 dash 3711 a as a code of Virginia under subsection one. For the discussion consideration or interviews with prospective candidates for employment and the assignment appointment promotion performance to motion salaries disciplining a resignation of specific public officers appointees or employees of any public body. and subsection eight for consultation with legal counsel employed or retained by public body regarding specific legal matters requiring the provision of legal advice by such counsel.
grahampaige: Second. Moved by Ms. Carlson, seconded by Ms. DelCaro that we go into closed session. Ms. Johnston, we'll take a roll call on that.
unknownspeaker: Ms. Osborne? Yes. Ms. Carlson? Yes.
jenniferjohnston: Ms. Lee? Yes, Dr. a cuff. Yes, Mr. 

### But also save it as a chat completion dataset to see if we can overfit

In [304]:
for speaker in target_names:
    speaker = speaker.lower().replace(' ', '')

    print(speaker)
    examples = []
    prompts = []
    completions = []

    for transcript in train_transcripts:
        cleaned_transcript = os.path.join(clean_transcript_path, transcript)

        cleaned = np.load(cleaned_transcript, allow_pickle=True)
        cleaned_segments = build_completion_from_transcript(cleaned, target_speaker=speaker)

        prompts += ['<|begin_of_text|>' + format_chat(segment['prompt'], speaker) + f"<|start_header_id|>assistant<|end_header_id|>\n\n{speaker}:" for segment in cleaned_segments]
        completions += [segment['completion'].replace(f"{speaker}:", "") for segment in cleaned_segments]
        
    
    examples = [{'prompt':prompts[i], 'completion':completions[i]} for i in range(len(prompts))]
    save_name = dataset_save_path + speaker.replace(' ', '').lower() + '_train_completion.npy'    
    np.save(save_name, examples)
    
    print('---------------------------------------')

ellenosborne
---------------------------------------
davidoberg
---------------------------------------
grahampaige
---------------------------------------
jonnoalcaro
---------------------------------------
katrinacallsen
---------------------------------------
kateacuff
---------------------------------------
judyle
---------------------------------------


In [305]:
for i in range(len(prompts)):
    print(prompts[i])
    print('----')
    print(completions[i])
    print('----------------------------------')


<|begin_of_text|><|start_header_id|>user<|end_header_id|>

grahampaige: And our student reps, tonight we have both our regular rep along with an alternate that had not had a chance to sit in on any of our meetings. So first, Will Trout.
unknownspeaker: Will Trout, student representative. Okay, and tell us again where your school, Will. Yes, Monticello. Okay, and Elijah Witt. Elijah Witt, alternate student representative at Western Albemarle.
grahampaige: Okay, thank you. And Elijah, we are very pleased to have you with us tonight, and you'll also be appearing at our December meetings, so thanks so much. We will now entertain a moment, I mean a motion rather, for the approval of the agenda.
unknownspeaker: I move that we approve the agenda. Second. Second.
grahampaige: OK, moved by Ms. DeAlcaro, second by Dr. Acuff, that we approve the agenda. Ms. Johnston.
unknownspeaker: Ms. Osborne. Yes. Ms. Galston. Yes. Dr. Acuff? Yes. Mr. Alcaro? Yes. Mr. Oberg? Yes. Ms. Lee? Yes. Mr. Page?
graham

### Test is only completion style

In [306]:
test_datasets = []
for speaker in target_names:
    speaker = speaker.lower().replace(' ', '')
    print(speaker)
    examples = []
    prompts = []
    completions = []

    for transcript in test_transcripts:
        cleaned_transcript = os.path.join(clean_transcript_path, transcript)

        cleaned = np.load(cleaned_transcript, allow_pickle=True)
        cleaned_segments = build_completion_from_transcript(cleaned, target_speaker=speaker)

        prompts += ['<|begin_of_text|>' + format_chat(segment['prompt'], speaker) + f"<|start_header_id|>assistant<|end_header_id|>\n\n{speaker}:" for segment in cleaned_segments]
        completions += [segment['completion'].replace(f"{speaker}:", "") for segment in cleaned_segments]

    examples = [{'prompt':prompts[i], 'completion':completions[i]} for i in range(len(prompts))]
    save_name = dataset_save_path + speaker.replace(' ', '').lower() + '_test.npy'    
    np.save(save_name, examples)
    
    test_datasets.append(save_name)
    print('---------------------------------------')

ellenosborne
---------------------------------------
davidoberg
---------------------------------------
grahampaige
---------------------------------------
jonnoalcaro
---------------------------------------
katrinacallsen
---------------------------------------
kateacuff
---------------------------------------
judyle
---------------------------------------


In [309]:
print(speaker)
print(examples[0]['prompt'])

judyle
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

grahampaige: The only thing that I can add to what you were saying is the benefits package that Outclassified, that our staff has available. And so we were really impressed with some of the things in the benefit package and wanted to make sure that maybe all of our employees were aware of all of those benefits. So like John was saying, a better way of probably communicating or making sure that all of the employees are really aware of all of the benefits.
jonnoalcaro: Thank you so much. Go ahead. I'm sorry, Mr. I was just going to say both of us were really appreciative of Mr. Redding's willingness to listen and to clarify things we were saying when we weren't saying them to in such a clarified manner.
unknownspeaker: True.
claekelser: Thank you. If somebody from one of the other groups would like to jump in and share some of your thoughts.<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>

judyle:


In [None]:
print(format_chat(test_segments['prompt'], speaker))

In [72]:
test_segments

{'prompt': "mayakumaza: I think the group is up to the task. They have committed to stay here at 9 p.m. 9 p.m. We thank you for that. We do.\nnospeaker: We really do.\nmayakumaza: So thank you, school board members for providing the guidance. I think it'll be really helpful as we begin our process. Just some feedback to you, Maya.\nkatherineacuff: It was really good, actually, having these groups with the long-range planning committee members. Because we usually are just up there at the front saying, do this, and come back with us. So it was good. Good idea. Whosever idea it was.\nnospeaker: Yeah, that was great. Right. So thanks, Maya, and also thanks to the Long-Range Planning Committee members who were with us tonight. Is that it, Maya, anymore? That's it. OK. All right. So we are down to other business from board members or the superintendent. Any other business for tonight?",
 'completion': "judyle: I did have one question. OK. It's related to our COVID discussion before, but I di

## 3. Dataset Metastats

### Train Dataset Size

In [205]:
rows = []

for speaker in target_names:
    filename = dataset_save_path + speaker.replace(' ', '').lower() + '_train.npy'

    try:
        tmp = np.load(filename)
        context_tokens, target_tokens, total_tokens = count_user_assistant_tokens(tmp)

        rows.append({
            "speaker": speaker,
            "context_tokens": context_tokens,
            "target_tokens": target_tokens,
            "total_tokens": total_tokens,
        })
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as e:
        print(f"Error processing {filename}: {e}")

df = pd.DataFrame(rows)
df

Unnamed: 0,speaker,context_tokens,target_tokens,total_tokens
0,Ellen Osborne,36304,3175,39479
1,David Oberg,94374,19851,114225
2,Graham Paige,218411,43149,261560
3,Jonno Alcaro,113919,25420,139339
4,Katrina Callsen,177097,40517,217614
5,Kate Acuff,116113,14916,131029
6,Judy Le,65396,12144,77540


### Test Dataset Size

In [206]:
rows = []

for speaker in target_names:
    filename = dataset_save_path + speaker.replace(' ', '').lower() + '_test.npy'

    try:
        tmp = np.load(filename, allow_pickle=True)
        context_tokens, target_tokens, total_tokens = count_prompt_completion_tokens(tmp)
        rows.append({
            "speaker": speaker,
            "context_tokens": context_tokens,
            "target_tokens": target_tokens,
            "total_tokens": total_tokens,
        })
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as e:
        print(f"Error processing {filename}: {e}")

df = pd.DataFrame(rows)

In [207]:
df

Unnamed: 0,speaker,context_tokens,target_tokens,total_tokens
0,Ellen Osborne,2096,684,2780
1,David Oberg,7446,1016,8462
2,Graham Paige,28023,3394,31417
3,Jonno Alcaro,44561,7290,51851
4,Katrina Callsen,1274,120,1394
5,Kate Acuff,11356,1302,12658
6,Judy Le,2112,729,2841


### Synthetic Data

In [144]:
def convert_chat_to_instruction_format(data: list, target_speaker) -> list:
    """
    Convert OpenAI-style chat data (with 'messages') to instruction format used for training.

    Each item will be converted into:
    {
        'prompt': '<|start_header_id|>user<|end_header_id|>\n\n<user_text><|eot_id|>\n\n',
        'completion': '<|start_header_id|>assistant<|end_header_id|>\n\n<assistant_text><|eot_id|>\n\n'
    }
    """
    converted = []

    for item in data:
        messages = item.get("messages", [])
        
        # Find user and assistant messages
        user_msg = next((m["content"] for m in messages if m["role"] == "user"), "").strip()
        assistant_msg = next((m["content"] for m in messages if m["role"] == "assistant"), "").strip()
        
        # Build formatted prompt/completion
        prompt = "<|start_header_id|>user<|end_header_id|>\n\n" + user_msg + "<|eot_id|>\n\n"
        completion = f"<|start_header_id|>assistant<|end_header_id|>\n\n{target_speaker}:" + assistant_msg + "<|eot_id|>\n\n"

        converted.append('<|begin_of_text|>' + prompt + completion)

    return converted


In [145]:
synth_path = '/nas/longleaf/home/smerrill/notebooks/LLM/data/final'

In [146]:
with open(os.path.join(synth_path, 'acuff_qa_pairs_ft.json'), 'r', encoding='utf-8') as f:
    acuff1 = json.load(f)

with open(os.path.join(synth_path, 'YT_acuff_qa_pairs_ft.json'), 'r', encoding='utf-8') as f:
    acuff2 = json.load(f)
    
acuff = acuff1 + acuff2

acuff = convert_chat_to_instruction_format(acuff, 'kateacuff')
np.save(dataset_save_path + 'synth_kateacuff.npy', acuff)

In [147]:
with open(os.path.join(synth_path, 'osborne_qa_pairs_ft.json'), 'r', encoding='utf-8') as f:
    osborne1 = json.load(f)

with open(os.path.join(synth_path, 'YT_osborne_qa_pairs_ft.json'), 'r', encoding='utf-8') as f:
    osborne2 = json.load(f)
osborne = osborne1 + osborne2

osborne = convert_chat_to_instruction_format(osborne, 'ellenosborne')
np.save(dataset_save_path + 'synth_ellenosborne.npy', osborne)

In [148]:
with open(os.path.join(synth_path, 'paige_qa_pairs_ft.json'), 'r', encoding='utf-8') as f:
    paige1 = json.load(f)

with open(os.path.join(synth_path, 'YT_paige_qa_pairs_ft.json'), 'r', encoding='utf-8') as f:
    paige2 = json.load(f)
paige = paige1 + paige2

paige = convert_chat_to_instruction_format(paige, 'grahampaige')
np.save(dataset_save_path + 'synth_grahampaige.npy', paige)

In [149]:
with open(os.path.join(synth_path, 'le_qa_pairs_ft.json'), 'r', encoding='utf-8') as f:
    le1 = json.load(f)

with open(os.path.join(synth_path, 'YT_le_qa_pairs_ft.json'), 'r', encoding='utf-8') as f:
    le2 = json.load(f)
le = le1 + le2

le = convert_chat_to_instruction_format(le, 'grahampaige')
np.save(dataset_save_path + 'synth_judyle.npy', le)

In [150]:
le[0]

'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow many doors did Judy Le knock on during her campaign?<|eot_id|>\n\n<|start_header_id|>assistant<|end_header_id|>\n\ngrahampaige:over 1,000<|eot_id|>\n\n'