In [10]:
import os
import sys
import pandas as pd
import numpy as np
import re
import glob
from os.path import join, exists

In [18]:
sys.path.append('../../')
from src.utils import split_gen, data_cleaning, load_splits, configuration, paths
config = configuration.Config()
np.random.seed(config.SEED)

# Download and Preprocess Switchboard

In [3]:
!mkdir ../../data/switchboard

mkdir: cannot create directory ‘../../data/switchboard’: File exists


In [4]:
!curl -o ../../data/switchboard/ptree_word_alignments.tar.gz https://isip.piconepress.com/projects/switchboard/releases/ptree_word_alignments.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 20.1M  100 20.1M    0     0  11.0M      0  0:00:01  0:00:01 --:--:-- 11.0M


In [5]:
%%capture
!tar -xvzf ../../data/switchboard/ptree_word_alignments.tar.gz -C ../../data/switchboard/;

In [6]:
def clean_swbd_word(w):
    cleaned_w = re.sub("\\[laughter-", "", w)
    cleaned_w = re.sub("\\[vocalized", "", cleaned_w)
    cleaned_w = re.sub("\\[", "", cleaned_w)
    cleaned_w = re.sub("\\]", "", cleaned_w)
    cleaned_w = re.sub("\\-$", "", cleaned_w)
    cleaned_w = re.sub("^\\-", "", cleaned_w)
    cleaned_w = re.sub("\\{", "", cleaned_w)
    cleaned_w = re.sub("\\}", "",cleaned_w)        
    cleaned_w = re.sub("_1", "",cleaned_w)        
    cleaned_w = re.sub("^-$", "",cleaned_w)        
    cleaned_w = re.sub("^.*/$", "",cleaned_w)        
    cleaned_w = cleaned_w.lower()
    return(cleaned_w)
    

def read_penn_tagged_swithboard_file(path, tokens_to_exclude):
    df = pd.read_table(path, sep='\t', header=None)
    df.columns = ['id', 'speaker.utt','start','stop', 'unk','word1','word2']
    
    df = df.loc[~df.word2.isin(tokens_to_exclude)]
    df['utt_id'] = [int(x.split('.')[1]) for x in df['speaker.utt']]
    df['speaker'] = [x.split('.')[0] for x in df['speaker.utt']]
    df['conversation'] = [re.sub('A|B','',  x.split('-')[0]) for x  in df['id']]

    
    # preprocess the words with the function below
    df['cleaned_word'] = [clean_swbd_word(w) for w in df['word2']]
    
    # pull out the start time for everything with the new system
    utts = df.groupby(['utt_id', 'speaker', 'conversation']).cleaned_word.agg(lambda x: ' '.join(x)).reset_index()
    return(utts)
                              

def read_penn_tagged_swbd_pair(a_path, tokens_to_exclude):
    b_path = a_path.replace('A-','B-')
    
    a_df = read_penn_tagged_swithboard_file(a_path, tokens_to_exclude)
    b_df = read_penn_tagged_swithboard_file(b_path, tokens_to_exclude)
        
    combined = pd.concat([a_df, b_df]).sort_values(by=['utt_id'])
    return(combined)


In [8]:
tokens_to_exclude = ["[vocalized-noise]", "[noise]", "[laughter]",
"[silence]", "","+++"]

In [11]:
a_paths = glob.glob('../../data/switchboard/data/alignments/*/*A-*.text')

In [12]:
from joblib import Parallel, delayed
results = Parallel(n_jobs=48)(delayed(read_penn_tagged_swbd_pair)(a_path,tokens_to_exclude) for a_path in a_paths)

In [13]:
all_convos = pd.concat(results)

In [14]:
all_convos

Unnamed: 0,utt_id,speaker,conversation,cleaned_word
0,1,A,sw4660,okay how do you get your news mostly
0,2,@B,sw4660,generally i get most of my news from uh the ra...
1,3,A,sw4660,um-hum um-hum
1,4,B,sw4660,and just fall off on the articles that i like
2,5,A,sw4660,um-hum
...,...,...,...,...
116,239,A,sw3057,well around here we have some you know the chu...
116,240,B,sw3057,um-hum
117,241,A,sw3057,uh take your the children and
117,242,B,sw3057,um-hum


In [15]:
swbd_txt_path = os.path.join(config.project_root, 'output/finetune/switchboard/all', 'switchboard_cleaned.txt')
all_convos[['cleaned_word']].to_csv(swbd_txt_path, header=False, index=False)

In [16]:
all_convos.columns =  ['utt_id','speaker','conversation','gloss']
all_convos['gloss_with_punct'] = [x+'.' for x in all_convos['gloss']]
all_convos['utterance_id'] = range(all_convos.shape[0])

In [22]:
all_convos['transcript_id'] = all_convos['conversation']

## "all" split

In [24]:
switchboard_split_folder = paths.get_directory({
            "task_phase" : 'extract_data',
            "training_split" : 'Switchboard',
            "training_dataset" : 'all',    
            "test_split" : None,
            "test_dataset" : None,
            "model_type" : None,
            "context_width" : None,
            "n_samples" : config.n_across_time,
            "task_name" : None,
            "use_tags" : False
})

split_glosses_df, train_df = split_gen.exec_split_gen(all_convos, switchboard_split_folder, 'train') 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utt_data['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in all_lowercase]


File written to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/extract_data/n=5000/Switchboard_all_no_tags/train.txt
File written to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/extract_data/n=5000/Switchboard_all_no_tags/eval.txt
Writing split glosses to: /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/extract_data/n=5000/Switchboard_all_no_tags/data_pool_with_phases.pkl


In [25]:
all_convos['word_count'] = [len(x.split(' ')) for x in all_convos['gloss']]

In [26]:
np.sum(all_convos['word_count'])
# only 1.5m words -- is this after removing silence tokens, etc.?

1463757