# Preliminary Data Annotation

We want positive/negative examples annotated with a series of linguistic metrics (coherence, fluency) both at the utterance level and at the dialogue level (< 5 turns). 

- Positive examples will be taken from the [BabyLM (Switchboard)](https://huggingface.co/datasets/hhoangphuoc/switchboard) dataset.
- Negative examples will be taken from BabyLlama outputs.

Corpus size: no more than 30M tokens.

# 1. Setup

In [117]:

import pandas as pd
import os
import re
import json
import numpy as np

## 2. Data Processing

### 2.1 BabyLM (Switchboard) Dataset

In [None]:
# Switchboard Data Processing for Lexical Metrics


# === Load and Clean Data ===
with open("/home/rooein/babylm-interaction/baseline/data/train_100M/switchboard.train", "r") as f:
    lines = [line.strip() for line in f if line.strip() and not line.startswith("----")]

# Parse speaker and text
data = []
for line in lines:
    if "\t" in line:
        speaker, text = line.split("\t", 1)
        data.append((speaker.strip(), text.strip()))

# Create DataFrame
df = pd.DataFrame(data, columns=["speaker", "text"])

# === Dialog-Level Chunking (5-turn blocks like ABABA or BABAB) ===
dialogs = []
current_dialog = []
last_speaker = None

for speaker, text in zip(df["speaker"], df["text"]):
    if not current_dialog:
        current_dialog.append([speaker, [text]])
    elif speaker == last_speaker:
        current_dialog[-1][1].append(text)
    else:
        current_dialog.append([speaker, [text]])

    last_speaker = speaker

    if len(current_dialog) == 5:
        dialog_text = [f"{turn[0]}: {' '.join(turn[1])}" for turn in current_dialog]
        dialogs.append(dialog_text)
        current_dialog = []
        last_speaker = None

# Convert to DataFrame
dialog_df = pd.DataFrame(dialogs, columns=[f"turn_{i+1}" for i in range(5)])

# === Turn-Level Extraction ===
turns = []
for dialog in dialogs:
    for turn in dialog:
        turns.append(turn)

turn_df = pd.DataFrame(turns, columns=["turn_text"])

# === Speaker-Level Aggregation ===
speaker_chunks = []

for dialog in dialogs:
    a_text, b_text = [], []
    for turn_text in dialog:
        if turn_text.startswith("A:"):
            a_text.append(turn_text[2:].strip())
        elif turn_text.startswith("B:"):
            b_text.append(turn_text[2:].strip())
    speaker_chunks.append({
        "A_text": " ".join(a_text),
        "B_text": " ".join(b_text),
    })

speaker_df = pd.DataFrame(speaker_chunks)

# === Save DataFrames ===
dialog_df.to_csv("switchboard_dialog_level.csv", index=False)
# turn_df.to_csv("switchboard_turn_level.csv", index=False)
# speaker_df.to_csv("switchboard_speaker_level.csv", index=False)

print("Saved dialog level files.")

# === Export Each Row to Text Files ===
os.makedirs("./switchboard/turn_level_texts/A", exist_ok=True)
os.makedirs("./switchboard/turn_level_texts/B", exist_ok=True)

for i, row in enumerate(turn_df["turn_text"]):
    speaker = row.strip()[0] if row.strip() else "Unknown"
    if speaker in ["A", "B"]:
        with open(f"./switchboard/turn_level_texts/{speaker}/{speaker}_{i:05d}.txt", "w") as f:
            f.write(row)

os.makedirs("./switchboard/speaker_level_texts/A", exist_ok=True)
os.makedirs("./switchboard/speaker_level_texts/B", exist_ok=True)

for i, row in speaker_df.iterrows():
    with open(f"./switchboard/speaker_level_texts/A/A_{i:05d}.txt", "w") as fa:
        fa.write(row["A_text"])
    with open(f"./switchboard/speaker_level_texts/B/B_{i:05d}.txt", "w") as fb:
        fb.write(row["B_text"])

# prepare text inputs based on dialogs
os.makedirs("./switchboard/dialog_level_texts", exist_ok=True)

for i, row in dialog_df.iterrows():
    full_dialog = "\n".join(row.values)
    with open(f"./switchboard/dialog_level_texts/dialog_{i:05d}.txt", "w") as f:
        f.write(full_dialog)

print("Exported text files to ./switchboard/turn_level_texts/, ./switchboard/speaker_level_texts/, and ./switchboard/dialog_level_texts/")


FileNotFoundError: [Errno 2] No such file or directory: '/home/rooein/babylm-interaction/baseline/data/train_100M/switchboard.train'

## 3. Metrics

We discussed Fluency and Coherence as the two important things we want to annotate.

## 3.1 Cohesion Metrics

### 3.1.1 Loading TAACO Outputs 

TAACO accepts inputs as `.txt` files. To prepare inputs for TAACO, we first chunk the dataset into individual turns using the `switchboard_data`, and save each dialog separately as `.txt` files.

Next, we run TAACO metrics on directory of text files by executing `test_donya.py` located in the TAACO directory. `test_donya.py` reads a list of directory of text files from `taaco_config.json` and TAACO outputs are then saved to the directory `switchboard_taaco_results`.



### 3.1.2 Filtering TAACO Outputs 
We subsequently filter the TAACO outputs, retaining only the following selected metrics:

- `noun_ttr`
- `verb_ttr`
- `adj_ttr`
- `lemma_ttr`
- `bigram_lemma_ttr`
- `trigram_lemma_ttr`
- `adjacent_overlap_all_sent`
- `lda_1_all_sent`
- `repeated_content_lemmas`
- `repeated_content_and_pronoun_lemmas`




In [30]:
import pandas as pd
import os
import re

# Paths
csv_file_path = '/home/rooein/babylm-interaction/baseline/data/switchboard_taaco_results/dialog_level_taaco_results.csv'
texts_folder_path = '/home/rooein/babylm-interaction/baseline/data/switchboard_data/dialog_level_texts'
output_csv_path = '/home/rooein/babylm-interaction/baseline/data/switchboard_taaco_results/dialog_level_taaco_results_filtered.csv'

# Step 1: Load CSV
metrics_df = pd.read_csv(csv_file_path)

# Step 2: Load and map text files to filenames
def load_text(filename, folder):
    filepath = os.path.join(folder, filename)
    with open(filepath, 'r', encoding='utf-8') as file:
        return file.read().strip()

# Add text column based on Filename
metrics_df['text'] = metrics_df['Filename'].apply(lambda fname: load_text(fname, texts_folder_path))

# Step 3: Compute number of words without A:: and B:: in the texts and rank texts
def clean_text(text):
    return re.sub(r'\b[AB]::', '', text)

metrics_df['cleaned_text'] = metrics_df['text'].apply(clean_text)
metrics_df['num_words'] = metrics_df['cleaned_text'].apply(lambda x: len(x.split()))

print(f"Total tokens before ranking: {metrics_df['num_words'].sum()}")

metrics_df.sort_values(by='num_words', ascending=False, inplace=True)

# Step 4: Select data until reaching approximately 30 million tokens
max_tokens = 30_000_000
token_count = 0
selected_rows = []

for idx, row in metrics_df.iterrows():
    if token_count + row['num_words'] <= max_tokens:
        selected_rows.append(row)
        token_count += row['num_words']
    else:
        break

selected_df = pd.DataFrame(selected_rows)

print(f"Total words after ranking and selection: {selected_df['num_words'].sum()}")

# Step 5: Save the final DataFrame
selected_df.to_csv(output_csv_path, index=False)

# Display the first few rows
print(selected_df.head())


Total tokens before ranking: 1180289
Total words after ranking and selection: 1180289
               Filename  lemma_ttr  lemma_mattr  lexical_density_tokens  \
2341   dialog_02341.txt   0.332795     0.670491                0.387722   
2633   dialog_02633.txt   0.336299     0.726121                0.450178   
8707   dialog_08707.txt   0.321839     0.706173                0.421456   
12658  dialog_12658.txt   0.342697     0.747258                0.430712   
283    dialog_00283.txt   0.355509     0.727546                0.432432   

       lexical_density_types  content_ttr  function_ttr  function_mattr  \
2341                0.728155     0.625000      0.171504        0.462970   
2633                0.761905     0.569170      0.190939        0.470385   
8707                0.720238     0.550000      0.192053        0.497312   
12658               0.666667     0.530435      0.233553        0.542196   
283                 0.736842     0.605769      0.190476        0.480000   

       noun_

In [31]:

# Define TAACO metrics columns
taaco_cols = [
    'noun_ttr', 'verb_ttr', 'adj_ttr', 'lemma_ttr',
    'bigram_lemma_ttr', 'trigram_lemma_ttr', 'adjacent_overlap_all_sent',
    'lda_1_all_sent', 'repeated_content_lemmas',
    'repeated_content_and_pronoun_lemmas'
]

# Create new column with TAACO metrics as a dictionary
selected_df['TAACO_metrics_dialog'] = selected_df[taaco_cols].to_dict(orient='records')

# Keep only the desired columns
df_reformatted = selected_df[['Filename', 'text', 'num_words', 'TAACO_metrics_dialog']]

# Display or save the new DataFrame
print(df_reformatted.head())


               Filename                                               text  \
2341   dialog_02341.txt  B:: And nothing is being done about it. Uh, th...   
2633   dialog_02633.txt  B:: I don't know if any of mine will be intere...   
8707   dialog_08707.txt  A:: I put a stop to some of them as far as the...   
12658  dialog_12658.txt  A:: You know, my neighbors across the street, ...   
283    dialog_00283.txt  A:: Yeah. Have you, do you use a standard, uh,...   

       num_words                               TAACO_metrics_dialog  
2341         593  {'noun_ttr': 0.71, 'verb_ttr': 0.475, 'adj_ttr...  
2633         542  {'noun_ttr': 0.6597938144329897, 'verb_ttr': 0...  
8707         492  {'noun_ttr': 0.68, 'verb_ttr': 0.4634146341463...  
12658        489  {'noun_ttr': 0.6216216216216216, 'verb_ttr': 0...  
283          460  {'noun_ttr': 0.7088607594936709, 'verb_ttr': 0...  


In [32]:
df_reformatted

Unnamed: 0,Filename,text,num_words,TAACO_metrics_dialog
2341,dialog_02341.txt,"B:: And nothing is being done about it. Uh, th...",593,"{'noun_ttr': 0.71, 'verb_ttr': 0.475, 'adj_ttr..."
2633,dialog_02633.txt,B:: I don't know if any of mine will be intere...,542,"{'noun_ttr': 0.6597938144329897, 'verb_ttr': 0..."
8707,dialog_08707.txt,A:: I put a stop to some of them as far as the...,492,"{'noun_ttr': 0.68, 'verb_ttr': 0.4634146341463..."
12658,dialog_12658.txt,"A:: You know, my neighbors across the street, ...",489,"{'noun_ttr': 0.6216216216216216, 'verb_ttr': 0..."
283,dialog_00283.txt,"A:: Yeah. Have you, do you use a standard, uh,...",460,"{'noun_ttr': 0.7088607594936709, 'verb_ttr': 0..."
...,...,...,...,...
1093,dialog_01093.txt,"A:: Yeah.\nB:: and,\nA:: Uh-huh.\nB:: I don't ...",7,"{'noun_ttr': 0.4, 'verb_ttr': 1.0, 'adj_ttr': ..."
10672,dialog_10672.txt,B:: Uh-huh.\nA:: Small island.\nB:: Yeah.\nA::...,6,"{'noun_ttr': 0.5, 'verb_ttr': 0.0, 'adj_ttr': ..."
6125,dialog_06125.txt,A:: Thanks.\nB:: Thank you.\nA:: Bye.\nB:: Goo...,6,"{'noun_ttr': 0.5, 'verb_ttr': 1.0, 'adj_ttr': ..."
13613,dialog_13613.txt,A:: Okay.\nB:: Thanks.\nA:: Thank you.\nB:: By...,6,"{'noun_ttr': 0.4444444444444444, 'verb_ttr': 1..."


In [33]:
# Step 1: Load speaker-level TAACO results
speaker_A_df = pd.read_csv('/home/rooein/babylm-interaction/baseline/data/switchboard_taaco_results/speaker_level_A_taaco_results.csv')
speaker_B_df = pd.read_csv('/home/rooein/babylm-interaction/baseline/data/switchboard_taaco_results/speaker_level_B_taaco_results.csv')

# Step 2: Extract base filename from 'Filename'
speaker_A_df['BaseFilename'] = speaker_A_df['Filename'].apply(lambda x: x.replace('A_', ''))
speaker_B_df['BaseFilename'] = speaker_B_df['Filename'].apply(lambda x: x.replace('B_', ''))
df_reformatted['BaseFilename'] = df_reformatted['Filename'].str.extract(r'(\d+\.txt)$')

# Step 3: Define TAACO metrics columns
taaco_cols = [
    'noun_ttr', 'verb_ttr', 'adj_ttr', 'lemma_ttr',
    'bigram_lemma_ttr', 'trigram_lemma_ttr', 'adjacent_overlap_all_sent',
    'lda_1_all_sent', 'repeated_content_lemmas',
    'repeated_content_and_pronoun_lemmas'
]

# Step 4: Create dictionaries of metrics
speaker_A_df['TAACO_metrics_speaker_A'] = speaker_A_df[taaco_cols].to_dict(orient='records')
speaker_B_df['TAACO_metrics_speaker_B'] = speaker_B_df[taaco_cols].to_dict(orient='records')

# Step 5: Merge back to main df
df_reformatted = df_reformatted.merge(
    speaker_A_df[['BaseFilename', 'TAACO_metrics_speaker_A']],
    on='BaseFilename', how='left'
)

df_reformatted = df_reformatted.merge(
    speaker_B_df[['BaseFilename', 'TAACO_metrics_speaker_B']],
    on='BaseFilename', how='left'
)

# Step 6: Drop helper column
df_reformatted.drop(columns=['BaseFilename'], inplace=True)

# Step 7: Save updated dataframe
df_reformatted.to_csv('/home/rooein/babylm-interaction/baseline/data/switchboard_taaco_results/dialog_level_taaco_results_filtered_with_speakers.csv', index=False)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reformatted['BaseFilename'] = df_reformatted['Filename'].str.extract(r'(\d+\.txt)$')


In [34]:
df_reformatted

Unnamed: 0,Filename,text,num_words,TAACO_metrics_dialog,TAACO_metrics_speaker_A,TAACO_metrics_speaker_B
0,dialog_02341.txt,"B:: And nothing is being done about it. Uh, th...",593,"{'noun_ttr': 0.71, 'verb_ttr': 0.475, 'adj_ttr...","{'noun_ttr': 0.8166666666666667, 'verb_ttr': 0...","{'noun_ttr': 0.8857142857142857, 'verb_ttr': 0..."
1,dialog_02633.txt,B:: I don't know if any of mine will be intere...,542,"{'noun_ttr': 0.6597938144329897, 'verb_ttr': 0...","{'noun_ttr': 0.7246376811594203, 'verb_ttr': 0...","{'noun_ttr': 0.8695652173913043, 'verb_ttr': 0..."
2,dialog_08707.txt,A:: I put a stop to some of them as far as the...,492,"{'noun_ttr': 0.68, 'verb_ttr': 0.4634146341463...","{'noun_ttr': 0.8695652173913043, 'verb_ttr': 0...","{'noun_ttr': 0.7446808510638298, 'verb_ttr': 0..."
3,dialog_12658.txt,"A:: You know, my neighbors across the street, ...",489,"{'noun_ttr': 0.6216216216216216, 'verb_ttr': 0...","{'noun_ttr': 0.6376811594202898, 'verb_ttr': 0...","{'noun_ttr': 0.0, 'verb_ttr': 1.0, 'adj_ttr': ..."
4,dialog_00283.txt,"A:: Yeah. Have you, do you use a standard, uh,...",460,"{'noun_ttr': 0.7088607594936709, 'verb_ttr': 0...","{'noun_ttr': 0.8235294117647058, 'verb_ttr': 0...","{'noun_ttr': 0.775, 'verb_ttr': 0.479166666666..."
...,...,...,...,...,...,...
17888,dialog_01093.txt,"A:: Yeah.\nB:: and,\nA:: Uh-huh.\nB:: I don't ...",7,"{'noun_ttr': 0.4, 'verb_ttr': 1.0, 'adj_ttr': ...","{'noun_ttr': 0.0, 'verb_ttr': 0.0, 'adj_ttr': ...","{'noun_ttr': 0.0, 'verb_ttr': 1.0, 'adj_ttr': ..."
17889,dialog_10672.txt,B:: Uh-huh.\nA:: Small island.\nB:: Yeah.\nA::...,6,"{'noun_ttr': 0.5, 'verb_ttr': 0.0, 'adj_ttr': ...","{'noun_ttr': 1.0, 'verb_ttr': 0.0, 'adj_ttr': ...","{'noun_ttr': 0.0, 'verb_ttr': 0.0, 'adj_ttr': ..."
17890,dialog_06125.txt,A:: Thanks.\nB:: Thank you.\nA:: Bye.\nB:: Goo...,6,"{'noun_ttr': 0.5, 'verb_ttr': 1.0, 'adj_ttr': ...","{'noun_ttr': 1.0, 'verb_ttr': 0.0, 'adj_ttr': ...","{'noun_ttr': 1.0, 'verb_ttr': 1.0, 'adj_ttr': ..."
17891,dialog_13613.txt,A:: Okay.\nB:: Thanks.\nA:: Thank you.\nB:: By...,6,"{'noun_ttr': 0.4444444444444444, 'verb_ttr': 1...","{'noun_ttr': 1.0, 'verb_ttr': 1.0, 'adj_ttr': ...","{'noun_ttr': 0.6666666666666666, 'verb_ttr': 0..."


## 3.2 Vocabulary Metrics

### 3.2.1 Average Age of Acquisition of Words Used

In [148]:
aoa_data = pd.read_csv("kuperman_2012_aoa_dataset.csv")

In [149]:
len(aoa_data)

2614

In [150]:
mini_aoa = min(list(aoa_data.age_of_acquisition))
print(mini_aoa)
maxi_aoa = max(list(aoa_data.age_of_acquisition))
print(maxi_aoa)

2.310598
17.44


In [151]:
data_to_annotate = pd.read_csv("switchboard_taaco_results/dialog_level_taaco_results_filtered_with_speakers.csv")

In [152]:
def get_aoa_of_word(word):
    try:
        ind = list(aoa_data.word).index(word)
        aoa = list(aoa_data.age_of_acquisition)[ind]
        return aoa
    except ValueError:
        return None


In [153]:
names = ["AOA_dialog", "AOA_speaker_A", "AOA_speaker_B"]
for name in names:
    data_to_annotate[name] = None

In [154]:
for index, row in data_to_annotate.iterrows():
    lines = row.text.split('\n')
    speaker_A = ''
    speaker_B = ''
    for l in lines:
        if 'A:: ' in l:
            speaker_A += l + ' '
        elif 'B:: ' in l:
            speaker_B += l + ' '
        else:
            raise ValueError()

    # remove non-alphanumeric characters (syntax) except whitespaces
    dialog = ''.join(char for char in row.text.replace('\n', ' ') if char.isalnum() or char == ' ')
    speaker_A = ''.join(char for char in speaker_A.strip() if char.isalnum() or char == ' ')
    speaker_B = ''.join(char for char in speaker_B.strip() if char.isalnum() or char == ' ')
   
    # for each data split
    splits = [dialog, speaker_A, speaker_B]

    ind = 0
    for s in splits:
        words = s.split(" ") # split text at whitespaces

        sum = 0
        count = 0
        for word in words:
            aoa = get_aoa_of_word(word) 
            if aoa is not None:
                sum+= aoa
                count+= 1
        
        if sum > 0:
            vocabulary = sum / count # average across all words with AOA data
            # normalise by min-max
            vocabulary = (vocabulary - mini_aoa)/(maxi_aoa - mini_aoa)
        else: 
            vocabulary = None

        data_to_annotate.loc[index, names[ind]] = vocabulary
        ind += 1

KeyboardInterrupt: 

In [None]:
data_to_annotate

Unnamed: 0,Filename,text,num_words,TAACO_metrics_dialog,TAACO_metrics_speaker_A,TAACO_metrics_speaker_B,AOA_dialog,AOA_speaker_A,AOA_speaker_B
0,dialog_02341.txt,"B:: And nothing is being done about it. Uh, th...",593,"{'noun_ttr': 0.71, 'verb_ttr': 0.475, 'adj_ttr...","{'noun_ttr': 0.8166666666666667, 'verb_ttr': 0...","{'noun_ttr': 0.8857142857142857, 'verb_ttr': 0...",0.162852,0.154903,0.182672
1,dialog_02633.txt,B:: I don't know if any of mine will be intere...,542,"{'noun_ttr': 0.6597938144329897, 'verb_ttr': 0...","{'noun_ttr': 0.7246376811594203, 'verb_ttr': 0...","{'noun_ttr': 0.8695652173913043, 'verb_ttr': 0...",0.145935,0.147803,0.140744
2,dialog_08707.txt,A:: I put a stop to some of them as far as the...,492,"{'noun_ttr': 0.68, 'verb_ttr': 0.4634146341463...","{'noun_ttr': 0.8695652173913043, 'verb_ttr': 0...","{'noun_ttr': 0.7446808510638298, 'verb_ttr': 0...",0.129887,0.135843,0.128013
3,dialog_12658.txt,"A:: You know, my neighbors across the street, ...",489,"{'noun_ttr': 0.6216216216216216, 'verb_ttr': 0...","{'noun_ttr': 0.6376811594202898, 'verb_ttr': 0...","{'noun_ttr': 0.0, 'verb_ttr': 1.0, 'adj_ttr': ...",0.133228,0.133575,0.088205
4,dialog_00283.txt,"A:: Yeah. Have you, do you use a standard, uh,...",460,"{'noun_ttr': 0.7088607594936709, 'verb_ttr': 0...","{'noun_ttr': 0.8235294117647058, 'verb_ttr': 0...","{'noun_ttr': 0.775, 'verb_ttr': 0.479166666666...",0.158483,0.163534,0.155163
...,...,...,...,...,...,...,...,...,...
17888,dialog_01093.txt,"A:: Yeah.\nB:: and,\nA:: Uh-huh.\nB:: I don't ...",7,"{'noun_ttr': 0.4, 'verb_ttr': 1.0, 'adj_ttr': ...","{'noun_ttr': 0.0, 'verb_ttr': 0.0, 'adj_ttr': ...","{'noun_ttr': 0.0, 'verb_ttr': 1.0, 'adj_ttr': ...",0.10858,,0.10858
17889,dialog_10672.txt,B:: Uh-huh.\nA:: Small island.\nB:: Yeah.\nA::...,6,"{'noun_ttr': 0.5, 'verb_ttr': 0.0, 'adj_ttr': ...","{'noun_ttr': 1.0, 'verb_ttr': 0.0, 'adj_ttr': ...","{'noun_ttr': 0.0, 'verb_ttr': 0.0, 'adj_ttr': ...",0.337052,0.337052,
17890,dialog_06125.txt,A:: Thanks.\nB:: Thank you.\nA:: Bye.\nB:: Goo...,6,"{'noun_ttr': 0.5, 'verb_ttr': 1.0, 'adj_ttr': ...","{'noun_ttr': 1.0, 'verb_ttr': 0.0, 'adj_ttr': ...","{'noun_ttr': 1.0, 'verb_ttr': 1.0, 'adj_ttr': ...",0.134538,,0.134538
17891,dialog_13613.txt,A:: Okay.\nB:: Thanks.\nA:: Thank you.\nB:: By...,6,"{'noun_ttr': 0.4444444444444444, 'verb_ttr': 1...","{'noun_ttr': 1.0, 'verb_ttr': 1.0, 'adj_ttr': ...","{'noun_ttr': 0.6666666666666666, 'verb_ttr': 0...",0.134538,0.134538,


In [None]:
data_to_annotate.to_csv('switchboard_taaco_results/taaco_vocab_metrics_results_filtered_with_speakers.csv')

### 3.1.3 Aggregating TAACO metrics

In [206]:
data_to_annotate = pd.read_csv('switchboard_taaco_results/taaco_vocab_metrics_results_filtered_with_speakers.csv')

In [207]:
df = data_to_annotate

In [208]:
keys = list(json.loads(df["TAACO_metrics_dialog"][0].replace("\'", "\"")).keys())
metrics = {k: [] for k in keys}

for row in df["TAACO_metrics_dialog"]:
    row = row.replace("\'", "\"")
    row = json.loads(row)

    for k in row.keys():
        metrics[k].append(row[k])

In [209]:
means = {k: np.mean(metrics[k]) for k in keys}
stds = {k: np.std(metrics[k]) for k in keys}

In [210]:
normal_metrics = {k: [] for k in keys}

for key in keys:
    for i in range(len(metrics[key])):
        normal_metrics[key].append((metrics[key][i] - means[key]) / stds[key])

In [211]:
len(metrics['verb_ttr'])

17893

In [212]:
averages = []
for i in range(len(normal_metrics['verb_ttr'])):
    sum = 0
    for key in keys:
        sum += normal_metrics[key][i]
    averages.append(sum/10)

averages


[np.float64(-0.44771617560443666),
 np.float64(-0.22822012194868652),
 np.float64(-0.44039815400961313),
 np.float64(-0.3002578851714043),
 np.float64(-0.1729446602726053),
 np.float64(-0.6857689375534232),
 np.float64(-0.31690883494110056),
 np.float64(-0.5251941283950808),
 np.float64(-0.5438977998833264),
 np.float64(-0.30174170074842477),
 np.float64(-0.2984306413191833),
 np.float64(-0.4564398848310172),
 np.float64(-0.04966119752318789),
 np.float64(-0.4347083936482982),
 np.float64(-0.354044836711511),
 np.float64(-0.11723473053380787),
 np.float64(-0.2347073903235703),
 np.float64(-0.4006515960518639),
 np.float64(-0.1398238282707362),
 np.float64(-0.26892068170704764),
 np.float64(-0.40253040126397527),
 np.float64(-0.036055629692500965),
 np.float64(-0.1496549839682228),
 np.float64(-0.06768983300461029),
 np.float64(-0.3067658375457309),
 np.float64(-0.29569616435536733),
 np.float64(-0.29087429093770956),
 np.float64(-0.06199419905204939),
 np.float64(-0.47726390951921227),

In [213]:
data_to_annotate["TAACO_agg_dialog"] = averages

In [214]:
data_to_annotate["TAACO_agg_dialog"] 

0       -0.447716
1       -0.228220
2       -0.440398
3       -0.300258
4       -0.172945
           ...   
17888   -0.027490
17889   -0.212754
17890    1.724400
17891    1.262220
17892    0.087405
Name: TAACO_agg_dialog, Length: 17893, dtype: float64

In [215]:
keys = list(json.loads(df["TAACO_metrics_speaker_A"][0].replace("\'", "\"")).keys())
metrics = {k: [] for k in keys}

for row in df["TAACO_metrics_speaker_A"]:
    row = row.replace("\'", "\"")
    row = json.loads(row)

    for k in row.keys():
        metrics[k].append(row[k])

In [216]:
means = {k: np.mean(metrics[k]) for k in keys}
stds = {k: np.std(metrics[k]) for k in keys}

In [217]:
normal_metrics = {k: [] for k in keys}

for key in keys:
    for i in range(len(metrics[key])):
        normal_metrics[key].append((metrics[key][i] - means[key]) / stds[key])

In [218]:
averages = []
for i in range(len(normal_metrics['verb_ttr'])):
    sum = 0
    for key in keys:
        sum += normal_metrics[key][i]
    averages.append(sum/10)

averages


[np.float64(-0.01195371339438489),
 np.float64(0.16408481819857415),
 np.float64(0.3412218224171147),
 np.float64(0.16559946930583697),
 np.float64(0.24453875922209978),
 np.float64(0.2680384007131812),
 np.float64(0.19378388849934552),
 np.float64(0.06203335480254548),
 np.float64(0.04816431474707441),
 np.float64(0.08574849457616947),
 np.float64(0.13152061557198952),
 np.float64(0.1411802722528262),
 np.float64(0.23905906676819572),
 np.float64(0.0973134995302238),
 np.float64(0.2133511065536024),
 np.float64(0.26980799462047217),
 np.float64(0.33732126100788046),
 np.float64(0.029353398734531543),
 np.float64(0.21549235744680179),
 np.float64(0.24449036187468215),
 np.float64(0.17872280086234632),
 np.float64(0.16882588726391543),
 np.float64(0.275745952459887),
 np.float64(0.3476193329473495),
 np.float64(-0.4560905469698687),
 np.float64(0.17991025242568887),
 np.float64(0.15328040154830574),
 np.float64(0.1655749682270363),
 np.float64(0.29110509811228985),
 np.float64(-0.712635

In [219]:
data_to_annotate["TAACO_agg_speaker_A"] = averages

In [220]:
data_to_annotate["TAACO_agg_speaker_A"]

0       -0.011954
1        0.164085
2        0.341222
3        0.165599
4        0.244539
           ...   
17888   -0.838380
17889   -0.059267
17890   -0.442116
17891   -0.175444
17892   -0.456091
Name: TAACO_agg_speaker_A, Length: 17893, dtype: float64

In [221]:
keys = list(json.loads(df["TAACO_metrics_speaker_B"][0].replace("\'", "\"")).keys())
metrics = {k: [] for k in keys}

for row in df["TAACO_metrics_speaker_B"]:
    row = row.replace("\'", "\"")
    row = json.loads(row)

    for k in row.keys():
        metrics[k].append(row[k])

In [222]:
means = {k: np.mean(metrics[k]) for k in keys}
stds = {k: np.std(metrics[k]) for k in keys}

In [223]:
normal_metrics = {k: [] for k in keys}

for key in keys:
    for i in range(len(metrics[key])):
        normal_metrics[key].append((metrics[key][i] - means[key]) / stds[key])

In [224]:
averages = []
for i in range(len(normal_metrics['verb_ttr'])):
    sum = 0
    for key in keys:
        sum += normal_metrics[key][i]
    averages.append(sum/10)

averages

[np.float64(0.1935774606324461),
 np.float64(0.260929657879658),
 np.float64(0.09724417434165504),
 np.float64(-0.20129191039098537),
 np.float64(0.21530960133600577),
 np.float64(-0.010150832385619247),
 np.float64(0.2730093276591477),
 np.float64(0.29582197464939497),
 np.float64(0.2412711247004391),
 np.float64(0.052256182541865946),
 np.float64(0.17756817529621668),
 np.float64(0.04769015172032707),
 np.float64(0.4908051783019978),
 np.float64(0.2981725652761064),
 np.float64(0.04072496593445486),
 np.float64(0.30070924826920065),
 np.float64(0.12705994946534543),
 np.float64(0.029276752809378293),
 np.float64(0.21824624957747218),
 np.float64(0.15144013488662292),
 np.float64(0.08297216489592232),
 np.float64(0.5011453035898179),
 np.float64(0.23423823838743033),
 np.float64(0.2110067891978759),
 np.float64(0.12152333827237753),
 np.float64(0.1730192762324057),
 np.float64(0.4217882123962456),
 np.float64(0.2696775561350172),
 np.float64(0.06279636713841766),
 np.float64(0.1689660

In [225]:
data_to_annotate["TAACO_agg_speaker_B"] = averages

In [226]:
data_to_annotate

Unnamed: 0.1,Unnamed: 0,Filename,text,num_words,TAACO_metrics_dialog,TAACO_metrics_speaker_A,TAACO_metrics_speaker_B,AOA_dialog,AOA_speaker_A,AOA_speaker_B,TAACO_agg_dialog,TAACO_agg_speaker_A,TAACO_agg_speaker_B
0,0,dialog_02341.txt,"B:: And nothing is being done about it. Uh, th...",593,"{'noun_ttr': 0.71, 'verb_ttr': 0.475, 'adj_ttr...","{'noun_ttr': 0.8166666666666667, 'verb_ttr': 0...","{'noun_ttr': 0.8857142857142857, 'verb_ttr': 0...",0.162852,0.154903,0.182672,-0.447716,-0.011954,0.193577
1,1,dialog_02633.txt,B:: I don't know if any of mine will be intere...,542,"{'noun_ttr': 0.6597938144329897, 'verb_ttr': 0...","{'noun_ttr': 0.7246376811594203, 'verb_ttr': 0...","{'noun_ttr': 0.8695652173913043, 'verb_ttr': 0...",0.145935,0.147803,0.140744,-0.228220,0.164085,0.260930
2,2,dialog_08707.txt,A:: I put a stop to some of them as far as the...,492,"{'noun_ttr': 0.68, 'verb_ttr': 0.4634146341463...","{'noun_ttr': 0.8695652173913043, 'verb_ttr': 0...","{'noun_ttr': 0.7446808510638298, 'verb_ttr': 0...",0.129887,0.135843,0.128013,-0.440398,0.341222,0.097244
3,3,dialog_12658.txt,"A:: You know, my neighbors across the street, ...",489,"{'noun_ttr': 0.6216216216216216, 'verb_ttr': 0...","{'noun_ttr': 0.6376811594202898, 'verb_ttr': 0...","{'noun_ttr': 0.0, 'verb_ttr': 1.0, 'adj_ttr': ...",0.133228,0.133575,0.088205,-0.300258,0.165599,-0.201292
4,4,dialog_00283.txt,"A:: Yeah. Have you, do you use a standard, uh,...",460,"{'noun_ttr': 0.7088607594936709, 'verb_ttr': 0...","{'noun_ttr': 0.8235294117647058, 'verb_ttr': 0...","{'noun_ttr': 0.775, 'verb_ttr': 0.479166666666...",0.158483,0.163534,0.155163,-0.172945,0.244539,0.215310
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17888,17888,dialog_01093.txt,"A:: Yeah.\nB:: and,\nA:: Uh-huh.\nB:: I don't ...",7,"{'noun_ttr': 0.4, 'verb_ttr': 1.0, 'adj_ttr': ...","{'noun_ttr': 0.0, 'verb_ttr': 0.0, 'adj_ttr': ...","{'noun_ttr': 0.0, 'verb_ttr': 1.0, 'adj_ttr': ...",0.108580,,0.108580,-0.027490,-0.838380,-0.427282
17889,17889,dialog_10672.txt,B:: Uh-huh.\nA:: Small island.\nB:: Yeah.\nA::...,6,"{'noun_ttr': 0.5, 'verb_ttr': 0.0, 'adj_ttr': ...","{'noun_ttr': 1.0, 'verb_ttr': 0.0, 'adj_ttr': ...","{'noun_ttr': 0.0, 'verb_ttr': 0.0, 'adj_ttr': ...",0.337052,0.337052,,-0.212754,-0.059267,-0.441778
17890,17890,dialog_06125.txt,A:: Thanks.\nB:: Thank you.\nA:: Bye.\nB:: Goo...,6,"{'noun_ttr': 0.5, 'verb_ttr': 1.0, 'adj_ttr': ...","{'noun_ttr': 1.0, 'verb_ttr': 0.0, 'adj_ttr': ...","{'noun_ttr': 1.0, 'verb_ttr': 1.0, 'adj_ttr': ...",0.134538,,0.134538,1.724400,-0.442116,0.220894
17891,17891,dialog_13613.txt,A:: Okay.\nB:: Thanks.\nA:: Thank you.\nB:: By...,6,"{'noun_ttr': 0.4444444444444444, 'verb_ttr': 1...","{'noun_ttr': 1.0, 'verb_ttr': 1.0, 'adj_ttr': ...","{'noun_ttr': 0.6666666666666666, 'verb_ttr': 0...",0.134538,0.134538,,1.262220,-0.175444,0.370136


In [227]:
#define function to swap columns
def swap_columns(df, col1, col2):
    col_list = list(df.columns)
    x, y = col_list.index(col1), col_list.index(col2)
    col_list[y], col_list[x] = col_list[x], col_list[y]
    df = df[col_list]
    return df

In [228]:
#swap points and rebounds columns
data_to_annotate = swap_columns(data_to_annotate, 'TAACO_metrics_dialog', 'AOA_dialog')


In [229]:
data_to_annotate = swap_columns(data_to_annotate, 'TAACO_metrics_speaker_A', 'AOA_speaker_A')
data_to_annotate = swap_columns(data_to_annotate, 'TAACO_metrics_speaker_B', 'AOA_speaker_B')

In [230]:
data_to_annotate

Unnamed: 0.1,Unnamed: 0,Filename,text,num_words,AOA_dialog,AOA_speaker_A,AOA_speaker_B,TAACO_metrics_dialog,TAACO_metrics_speaker_A,TAACO_metrics_speaker_B,TAACO_agg_dialog,TAACO_agg_speaker_A,TAACO_agg_speaker_B
0,0,dialog_02341.txt,"B:: And nothing is being done about it. Uh, th...",593,0.162852,0.154903,0.182672,"{'noun_ttr': 0.71, 'verb_ttr': 0.475, 'adj_ttr...","{'noun_ttr': 0.8166666666666667, 'verb_ttr': 0...","{'noun_ttr': 0.8857142857142857, 'verb_ttr': 0...",-0.447716,-0.011954,0.193577
1,1,dialog_02633.txt,B:: I don't know if any of mine will be intere...,542,0.145935,0.147803,0.140744,"{'noun_ttr': 0.6597938144329897, 'verb_ttr': 0...","{'noun_ttr': 0.7246376811594203, 'verb_ttr': 0...","{'noun_ttr': 0.8695652173913043, 'verb_ttr': 0...",-0.228220,0.164085,0.260930
2,2,dialog_08707.txt,A:: I put a stop to some of them as far as the...,492,0.129887,0.135843,0.128013,"{'noun_ttr': 0.68, 'verb_ttr': 0.4634146341463...","{'noun_ttr': 0.8695652173913043, 'verb_ttr': 0...","{'noun_ttr': 0.7446808510638298, 'verb_ttr': 0...",-0.440398,0.341222,0.097244
3,3,dialog_12658.txt,"A:: You know, my neighbors across the street, ...",489,0.133228,0.133575,0.088205,"{'noun_ttr': 0.6216216216216216, 'verb_ttr': 0...","{'noun_ttr': 0.6376811594202898, 'verb_ttr': 0...","{'noun_ttr': 0.0, 'verb_ttr': 1.0, 'adj_ttr': ...",-0.300258,0.165599,-0.201292
4,4,dialog_00283.txt,"A:: Yeah. Have you, do you use a standard, uh,...",460,0.158483,0.163534,0.155163,"{'noun_ttr': 0.7088607594936709, 'verb_ttr': 0...","{'noun_ttr': 0.8235294117647058, 'verb_ttr': 0...","{'noun_ttr': 0.775, 'verb_ttr': 0.479166666666...",-0.172945,0.244539,0.215310
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17888,17888,dialog_01093.txt,"A:: Yeah.\nB:: and,\nA:: Uh-huh.\nB:: I don't ...",7,0.108580,,0.108580,"{'noun_ttr': 0.4, 'verb_ttr': 1.0, 'adj_ttr': ...","{'noun_ttr': 0.0, 'verb_ttr': 0.0, 'adj_ttr': ...","{'noun_ttr': 0.0, 'verb_ttr': 1.0, 'adj_ttr': ...",-0.027490,-0.838380,-0.427282
17889,17889,dialog_10672.txt,B:: Uh-huh.\nA:: Small island.\nB:: Yeah.\nA::...,6,0.337052,0.337052,,"{'noun_ttr': 0.5, 'verb_ttr': 0.0, 'adj_ttr': ...","{'noun_ttr': 1.0, 'verb_ttr': 0.0, 'adj_ttr': ...","{'noun_ttr': 0.0, 'verb_ttr': 0.0, 'adj_ttr': ...",-0.212754,-0.059267,-0.441778
17890,17890,dialog_06125.txt,A:: Thanks.\nB:: Thank you.\nA:: Bye.\nB:: Goo...,6,0.134538,,0.134538,"{'noun_ttr': 0.5, 'verb_ttr': 1.0, 'adj_ttr': ...","{'noun_ttr': 1.0, 'verb_ttr': 0.0, 'adj_ttr': ...","{'noun_ttr': 1.0, 'verb_ttr': 1.0, 'adj_ttr': ...",1.724400,-0.442116,0.220894
17891,17891,dialog_13613.txt,A:: Okay.\nB:: Thanks.\nA:: Thank you.\nB:: By...,6,0.134538,0.134538,,"{'noun_ttr': 0.4444444444444444, 'verb_ttr': 1...","{'noun_ttr': 1.0, 'verb_ttr': 1.0, 'adj_ttr': ...","{'noun_ttr': 0.6666666666666666, 'verb_ttr': 0...",1.262220,-0.175444,0.370136


In [232]:
data_to_annotate.to_csv('switchboard_taaco_results/taaco_vocab_metrics_results_filtered_with_speakers.csv')