In [1]:
import os
import sys
import pandas as pd
import numpy as np
import re
from os.path import join, exists

In [2]:
sys.path.append('../../')
from src.utils import split_gen, data_cleaning, load_splits, configuration
config = configuration.Config()
np.random.seed(config.SEED)

# Download and Preprocess Switchboard

In [11]:
!mkdir ../../data/switchboard

In [13]:
!curl -o ../../data/switchboard/ptree_word_alignments.tar.gz https://isip.piconepress.com/projects/switchboard/releases/ptree_word_alignments.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 20.1M  100 20.1M    0     0  15.1M      0  0:00:01  0:00:01 --:--:-- 15.1M


In [17]:
%%capture
!tar -xvzf ../../data/switchboard/ptree_word_alignments.tar.gz -C ../../data/switchboard/;

In [96]:
def clean_swbd_word(w):
    cleaned_w = re.sub("\\[laughter-", "", w)
    cleaned_w = re.sub("\\[vocalized", "", cleaned_w)
    cleaned_w = re.sub("\\[", "", cleaned_w)
    cleaned_w = re.sub("\\]", "", cleaned_w)
    cleaned_w = re.sub("\\-$", "", cleaned_w)
    cleaned_w = re.sub("^\\-", "", cleaned_w)
    cleaned_w = re.sub("\\{", "", cleaned_w)
    cleaned_w = re.sub("\\}", "",cleaned_w)        
    cleaned_w = re.sub("_1", "",cleaned_w)        
    cleaned_w = re.sub("^-$", "",cleaned_w)        
    cleaned_w = re.sub("^.*/$", "",cleaned_w)        
    cleaned_w = cleaned_w.lower()
    return(cleaned_w)
    

def read_penn_tagged_swithboard_file(path, tokens_to_exclude):
    df = pd.read_table(path, sep='\t', header=None)
    df.columns = ['id', 'speaker.utt','start','stop', 'unk','word1','word2']
    
    df = df.loc[~df.word2.isin(tokens_to_exclude)]
    df['utt_id'] = [int(x.split('.')[1]) for x in df['speaker.utt']]
    df['speaker'] = [x.split('.')[0] for x in df['speaker.utt']]
    df['conversation'] = [re.sub('A|B','',  x.split('-')[0]) for x  in df['id']]

    
    # preprocess the words with the function below
    df['cleaned_word'] = [clean_swbd_word(w) for w in df['word2']]
    
    # pull out the start time for everything with the new system
    utts = df.groupby(['utt_id', 'speaker', 'conversation']).cleaned_word.agg(lambda x: ' '.join(x)).reset_index()
    return(utts)
                              

def read_penn_tagged_swbd_pair(a_path, tokens_to_exclude):
    b_path = a_path.replace('A-','B-')
    
    a_df = read_penn_tagged_swithboard_file(a_path, tokens_to_exclude)
    b_df = read_penn_tagged_swithboard_file(b_path, tokens_to_exclude)
        
    combined = pd.concat([a_df, b_df]).sort_values(by=['utt_id'])
    return(combined)


In [103]:
tokens_to_exclude = ["[vocalized-noise]", "[noise]", "[laughter]",
"[silence]", "","+++"]

test = read_penn_tagged_swbd_pair(sample_path1, tokens_to_exclude)
test

Unnamed: 0,utt_id,speaker,conversation,cleaned_word
0,1,A,sw2719,do you exercise a lot
0,2,B,sw2719,um it's uh scaled back considerably uh right n...
1,3,A,sw2719,is mowing the lawn a new form of exercise
1,4,B,sw2719,yeah lawn it's a forty year old house so
2,5,A,sw2719,oh my
...,...,...,...,...
62,130,B,sw2719,that's right it only takes one time
65,131,A,sw2719,that's right
63,132,B,sw2719,that is it's a hard lesson to learn
66,133,A,sw2719,there's only one crazy out there you know to m...


In [104]:
a_paths = glob.glob('../../data/switchboard/data/alignments/*/*A-*.text')

In [139]:
from joblib import Parallel, delayed
results = Parallel(n_jobs=48)(delayed(read_penn_tagged_swbd_pair)(a_path,tokens_to_exclude) for a_path in a_paths)

In [140]:
all_convos = pd.concat(results)

In [141]:
all_convos

Unnamed: 0,utt_id,speaker,conversation,cleaned_word
0,1,A,sw4660,okay how do you get your news mostly
0,2,@B,sw4660,generally i get most of my news from uh the ra...
1,3,A,sw4660,um-hum um-hum
1,4,B,sw4660,and just fall off on the articles that i like
2,5,A,sw4660,um-hum
...,...,...,...,...
116,239,A,sw3057,well around here we have some you know the chu...
116,240,B,sw3057,um-hum
117,241,A,sw3057,uh take your the children and
117,242,B,sw3057,um-hum


In [142]:
swbd_txt_path = os.path.join(config.project_root, 'output/finetune/switchboard/all', 'switchboard_cleaned.txt')
all_convos[['cleaned_word']].to_csv(swbd_txt_path, header=False, index=False)

## "all" split

In [143]:
this_split_folder = split_gen.get_split_folder('switchboard', 'all', config.finetune_dir)
this_split_folder

'/home/stephan/notebooks/child-directed-listening/output/finetune/switchboard/all'

In [144]:
all_convos.columns =  ['utt_id','speaker','conversation','gloss']
all_convos['gloss_with_punct'] = [x+'.' for x in all_convos['gloss']]
all_convos['utterance_id'] = range(all_convos.shape[0])

In [145]:
import imp
imp.reload(split_gen)

<module 'src.utils.split_gen' from '/home/stephan/notebooks/child-directed-listening/src/tier_1/../../src/utils/split_gen.py'>

In [146]:
split_glosses_df, train_df = split_gen.exec_split_gen(all_convos, this_split_folder, 'val', phase_label = 'phase_finetune', split_on='conversation')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utt_data['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in all_lowercase]


File written to /home/stephan/notebooks/child-directed-listening/output/finetune/switchboard/all/train.txt
File written to /home/stephan/notebooks/child-directed-listening/output/finetune/switchboard/all/train_no_tags.txt
File written to /home/stephan/notebooks/child-directed-listening/output/finetune/switchboard/all/val.txt
File written to /home/stephan/notebooks/child-directed-listening/output/finetune/switchboard/all/val_no_tags.txt
Writing split glosses to: /home/stephan/notebooks/child-directed-listening/output/finetune/switchboard/all/data_pool_with_phases.pkl


In [148]:
all_convos['word_count'] = [len(x.split(' ')) for x in all_convos['gloss']]

In [149]:
np.sum(all_convos['word_count'])
# only 1.5m words -- is this after removing silence tokens, etc.?

1463757