In [16]:
import pandas as pd
import numpy as np
import re

In [51]:
def print_segments(segments):
    for segment in segments:
        print(segment['speaker'] + ': ' + segment['text'])
        
def clean_text(text):
    """Clean text by removing disfluencies and normalizing whitespace/punctuation."""
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'\s([?.!,"])', r'\1', text)
    return text

def build_dialogue_dataset(segments, target_speaker='SPEAKER_06'):
    """
    Build dataset with:
    - context: prior dialogue (speaker-tagged when changing)
    - response: one or more consecutive turns by target_speaker
    """
    dataset = []
    context_parts = []
    prev_speaker = None
    i = 0

    while i < len(segments):
        segment = segments[i]
        speaker = segment.get('speaker')
        text = clean_text(segment.get('text', ''))

        if not text:
            i += 1
            continue

        if speaker == target_speaker:
            # Collect all consecutive target_speaker turns
            response_parts = [text]
            i += 1
            while i < len(segments) and segments[i]['speaker'] == target_speaker:
                next_text = clean_text(segments[i]['text'])
                if next_text:
                    response_parts.append(next_text)
                i += 1
            if context_parts:
                dataset.append({
                    "context": ' \n'.join(context_parts),
                    "response": ' '.join(response_parts)
                })
            context_parts = []
            prev_speaker = None  # reset after Alicia speaks
        else:
            if speaker != prev_speaker:
                context_parts.append(f"{speaker}: {text}")
            else:
                context_parts.append(text)
            prev_speaker = speaker
            i += 1

    return dataset

def count_words_in_dataset(dataset):
    """
    Counts total number of words in 'context' and 'response' fields of a dialogue dataset.
    
    Args:
        dataset (list): List of dicts with 'context' and 'response' keys.

    Returns:
        dict: {'context_word_count': int, 'response_word_count': int}
    """
    context_word_count = 0
    response_word_count = 0

    for entry in dataset:
        context = entry.get('context', '')
        response = entry.get('response', '')
        context_word_count += len(context.split())
        response_word_count += len(response.split())

    return {
        'context_word_count': context_word_count,
        'response_word_count': response_word_count
    }

### Jon

In [56]:
jon_segments = np.load('transcripts/jon.npy', allow_pickle=True)
jon_dataset = build_dialogue_dataset(jon_segments, target_speaker='SPEAKER_06')

jon_word_counts = count_words_in_dataset(jon_dataset)
print(jon_word_counts)

FileNotFoundError: [Errno 2] No such file or directory: 'transcripts/jon.npy'

### Maria

In [57]:
maria_segments = np.load('transcripts/maria.npy', allow_pickle=True)
maria_dataset = build_dialogue_dataset(maria_segments, target_speaker='SPEAKER_06')
np.save('datasets/maria_dataset.npy', maria_dataset)

maria_word_counts = count_words_in_dataset(maria_dataset)
print(maria_word_counts)

FileNotFoundError: [Errno 2] No such file or directory: 'transcripts/maria.npy'

### Laurie

In [58]:
laurie_segments = np.load('transcripts/laurie.npy', allow_pickle=True)
laurie_dataset = build_dialogue_dataset(laurie_segments, target_speaker='SPEAKER_03')
np.save('datasets/laurie_dataset.npy', laurie_dataset)

# print_segments(laurie_segments)
# Laurie = SPEAKER_03

laurie_word_counts = count_words_in_dataset(laurie_dataset)
print(laurie_word_counts)

{'context_word_count': 730, 'response_word_count': 2209}


### Alicia

In [59]:
alicia_segments = np.load('transcripts/alicia.npy', allow_pickle=True)
alicia_dataset = build_dialogue_dataset(alicia_segments, target_speaker='SPEAKER_06')
np.save('datasets/alicia_dataset.npy', alicia_dataset)

# print_segments(alicia_segments)
# Laurie = SPEAKER_06
alicia_word_counts = count_words_in_dataset(alicia_dataset)
print(alicia_word_counts)

{'context_word_count': 762, 'response_word_count': 1705}


### Stacy

In [60]:
stacy_segments = np.load('transcripts/stacy.npy', allow_pickle=True)
stacy_dataset = build_dialogue_dataset(stacy_segments, target_speaker='SPEAKER_06')
np.save('datasets/stacy_dataset.npy', stacy_dataset)

stacy_word_counts = count_words_in_dataset(stacy_dataset)
print(stacy_word_counts)

FileNotFoundError: [Errno 2] No such file or directory: 'transcripts/stacy.npy'

### Jennifer

In [61]:
jennifer_segments = np.load('transcripts/jennifer.npy', allow_pickle=True)
jennifer_dataset = build_dialogue_dataset(jennifer_segments, target_speaker='SPEAKER_01')
np.save('datasets/jennifer_dataset.npy', jennifer_dataset)

# print_segments(jennifer_segments)
# jennifer = SPEAKER_01

jennifer_word_counts = count_words_in_dataset(jennifer_dataset)
print(jennifer_word_counts)

{'context_word_count': 941, 'response_word_count': 3157}


### Richard

In [62]:
richard_segments = np.load('transcripts/richard.npy', allow_pickle=True)
richard_dataset = build_dialogue_dataset(richard_segments, target_speaker='SPEAKER_06')
np.save('datasets/richard_dataset.npy', richard_dataset)

# print_segments(richard_segments)
# richardr = SPEAKER_01


richard_word_counts = count_words_in_dataset(richard_dataset)
print(richard_word_counts)

{'context_word_count': 3885, 'response_word_count': 172}


In [67]:
print_segments(richard_segments)

SPEAKER_06:  Okay, we're here with the school board president Jennifer Smith, and she's running for reelection.
SPEAKER_06: Thanks for being here, Jennifer, you want to just talk about your, your, your reason for running and your experience on after your first term.
SPEAKER_01:  Right, okay, thank you.
SPEAKER_01: Thank you for having me.
SPEAKER_01: Um, so I, um.
SPEAKER_01: I've been in the school district for over 15.
SPEAKER_01: I have a child who's 20, so they started in preschool at and I couldn't be great more grateful to have a preschool program that runs through the district because.
SPEAKER_01: And it really was, it's a wonderful program and not many school districts had a public preschool program at the time.
SPEAKER_01: Um, I.
SPEAKER_01:  And then I started when I started to learn about education in California and how funding worked in the district, I really got very passionate about public schools and public school funding and and wanting to participate and make it better

### Meta-df

In [65]:
datasets = {
    'alicia': alicia_dataset,
    'jennifer': jennifer_dataset,
    'richard': richard_dataset,
    #'stacy': stacy_dataset,
    'laurie': laurie_dataset,
    #'maria': maria_dataset,
    #'jon': jon_dataset
}

word_counts = {}
for name, dataset in datasets.items():
    word_counts[name] = count_words_in_dataset(dataset)

# Build list of metadata dicts for DataFrame
meta_info = []
for name, dataset in datasets.items():
    counts = word_counts[name]
    total_words = counts['context_word_count'] + counts['response_word_count']
    avg_words = total_words / len(dataset) if len(dataset) > 0 else 0
    meta_info.append({
        'name': name,
        'num_examples': len(dataset),
        'context_word_count': counts['context_word_count'],
        'response_word_count': counts['response_word_count'],
        'total_words': total_words,
        'avg_words_per_example': avg_words
    })

# Create DataFrame
df = pd.DataFrame(meta_info)