In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

### Clean VoC and CORAAL Manifests

In [2]:
# Load the CORAAL manifest; the training-text column header is padded with
# spaces in the CSV, so normalise it before use.
coraal = (
    pd.read_csv("../manifests/coraal_manifest.csv")
    .rename(columns={" groundtruth_text_train ": "groundtruth_text_train"})
    .assign(AAVE=1)  # every CORAAL row is labelled AAVE = 1
)
# Rows without a training transcript are dropped.
coraal.dropna(subset=['groundtruth_text_train'], inplace=True)

In [3]:
# Re-root each audio/transcript path under /juice/scr/cpajot, keeping only the
# file name (the text after the last "/") of the original path.
for column, prefix in (
    ('wav_file', '/juice/scr/cpajot/data_processed_coraal/wav/'),
    ('txt_file', '/juice/scr/cpajot/data_processed_coraal/txt/'),
):
    coraal[column] = coraal[column].map(
        lambda path, prefix=prefix: prefix + path.rsplit("/", 1)[-1]
    )

In [4]:
# Preview the first five cleaned CORAAL rows.
coraal.head(n=5)

Unnamed: 0,wav_file,txt_file,groundtruth_text_raw,groundtruth_text_train,duration,AAVE
0,/juice/scr/cpajot/data_processed_coraal/wav/AT...,/juice/scr/cpajot/data_processed_coraal/txt/AT...,"Um, my mom used to be a school teacher, and th...",UM MY MOM USED TO BE A SCHOOL TEACHER AND THEN...,8.221,1
1,/juice/scr/cpajot/data_processed_coraal/wav/AT...,/juice/scr/cpajot/data_processed_coraal/txt/AT...,"And my dad, um, he did- he played soccer and h...",AND MY DAD UM HE DID HE PLAYED SOCCER AND HE U...,10.518,1
2,/juice/scr/cpajot/data_processed_coraal/wav/AT...,/juice/scr/cpajot/data_processed_coraal/txt/AT...,"Well he- he, um, was a seamstress and then whe...",WELL HE HE UM WAS A SEAMSTRESS AND THEN WHEN H...,5.274,1
3,/juice/scr/cpajot/data_processed_coraal/wav/AT...,/juice/scr/cpajot/data_processed_coraal/txt/AT...,<laugh> No. (laughing) A all white like your c...,NO A ALL WHITE LIKE YOUR CLOTHES ALL WHITE CLO...,5.589,1
4,/juice/scr/cpajot/data_processed_coraal/wav/AT...,/juice/scr/cpajot/data_processed_coraal/txt/AT...,"Um, I mean it's different things. I mean it de...",UM I MEAN IT'S DIFFERENT THINGS I MEAN IT DEPE...,10.441,1


In [5]:
# Load the VoC manifest, align its columns with the CORAAL frame and remove
# recordings / segments that are known to be unusable.
voc = pd.read_csv("../manifests/voc_manifest.csv")
# Prefix the stored paths with "../" (the `examples` list below is written in this form).
voc["wav_file"] = "../" + voc["wav_file"]
voc["txt_file"] = "../" + voc["txt_file"]
voc.rename(columns={"groundtruth_text": "groundtruth_text_raw", "cleaned_text": "groundtruth_text_train"}, inplace=True)
voc["AAVE"] = 0  # every VoC row is labelled AAVE = 0
voc.dropna(subset=['groundtruth_text_train'], inplace=True)

# Whole recordings to drop. The id is field 3 of the "_"-split path, i.e. the
# number after "voc_" for paths shaped like the entries in `examples` below.
excluded_recordings = [
    '3',  # Spenser Deardoff (transcript + wav file don't match up)
    '5',  # Eric Heryford (wav file has some silences)
]
recording_id = voc["wav_file"].str.split('_').str.get(3)
voc = voc[~recording_id.isin(excluded_recordings)]

#remove high loss data points
examples = [
    '../data_processed_voc/wav/voc_33_part_3.wav', #Transcript: UM
    '../data_processed_voc/wav/voc_2_part_1.wav', #Transcript: UM
    '../data_processed_voc/wav/voc_12_part_198.wav', # KILL
    '../data_processed_voc/wav/voc_0_part_330.wav', #KILL
    '../data_processed_voc/wav/voc_0_part_328.wav', # Basically silent
    '../data_processed_voc/wav/voc_10_part_414.wav', # Basically silent
    '../data_processed_voc/wav/voc_10_part_404.wav', #Transcript: UM
    '../data_processed_voc/wav/voc_2_part_4.wav', #Transcript: UM YEAH SURE
    '../data_processed_voc/wav/voc_8_part_228.wav', #KILL
    '../data_processed_voc/wav/voc_26_part_207.wav', #Transcript: Incorrect
    '../data_processed_voc/wav/voc_13_part_234.wav', #Transcript: Incorrect
    '../data_processed_voc/wav/voc_12_part_185.wav', #KILL
    '../data_processed_voc/wav/voc_12_part_181.wav', #KILL
    '../data_processed_voc/wav/voc_17_part_93.wav', #KILL
    '../data_processed_voc/wav/voc_10_part_428.wav' #KILL
#     '../data_processed_voc/wav/voc_11_part_80.wav' #Incorrect transcript
]
# Take an explicit copy: `voc` is the result of boolean filtering, and later
# cells assign columns on it, which otherwise raises SettingWithCopyWarning
# on pandas versions without copy-on-write.
voc = voc[~voc["wav_file"].isin(examples)].copy()

In [6]:
# Re-root each audio/transcript path under /juice/scr/cpajot, keeping only the
# file name (the text after the last "/") of the original path.
for column, prefix in (
    ('wav_file', '/juice/scr/cpajot/data_processed_voc/wav/'),
    ('txt_file', '/juice/scr/cpajot/data_processed_voc/txt/'),
):
    voc[column] = voc[column].map(
        lambda path, prefix=prefix: prefix + path.rsplit("/", 1)[-1]
    )

In [7]:
# Preview the first five cleaned VoC rows.
voc.head(n=5)

Unnamed: 0,wav_file,txt_file,groundtruth_text_raw,groundtruth_text_train,duration,AAVE
0,/juice/scr/cpajot/data_processed_voc/wav/voc_0...,/juice/scr/cpajot/data_processed_voc/txt/voc_0...,He's also a historian. He's he is the presiden...,HE'S ALSO A HISTORIAN HE'S HE IS THE PRESIDENT...,6.006,0
1,/juice/scr/cpajot/data_processed_voc/wav/voc_0...,/juice/scr/cpajot/data_processed_voc/txt/voc_0...,Alright. That's good. So you were born in Redd...,ALRIGHT THAT'S GOOD SO YOU WERE BORN IN REDDIN...,6.433,0
2,/juice/scr/cpajot/data_processed_voc/wav/voc_0...,/juice/scr/cpajot/data_processed_voc/txt/voc_0...,Because we had just one hospital and back then...,BECAUSE WE HAD JUST ONE HOSPITAL AND BACK THEN...,8.561,0
3,/juice/scr/cpajot/data_processed_voc/wav/voc_0...,/juice/scr/cpajot/data_processed_voc/txt/voc_0...,actually south Redding uh wasn't uptown but it...,ACTUALLY SOUTH REDDING UH WASN'T UPTOWN BUT IT...,6.121,0
4,/juice/scr/cpajot/data_processed_voc/wav/voc_0...,/juice/scr/cpajot/data_processed_voc/txt/voc_0...,Well Lola A little uh Pine Street School that ...,WELL LOLA A LITTLE UH PINE STREET SCHOOL THAT ...,11.91,0


### Train/Dev/Test Split

In [8]:
# Share of each corpus (relative to the full corpus) held out for the
# dev/validation split and for the test split.
FRACTION_DEV = 0.05
FRACTION_TEST = 0.05

In [9]:
# The dev split is carved out of what remains after the test split, so its
# share is rescaled to stay FRACTION_DEV of the full corpus.
dev_share = FRACTION_DEV / (1 - FRACTION_TEST)

# VoC: hold out test first, then dev from the remainder.
voc_train_val, voc_test = train_test_split(
    voc, test_size=FRACTION_TEST, random_state=33
)
voc_train, voc_val = train_test_split(
    voc_train_val, test_size=dev_share, random_state=32
)

# CORAAL: same procedure with its own seeds.
coraal_train_val, coraal_test = train_test_split(
    coraal, test_size=FRACTION_TEST, random_state=31
)
coraal_train, coraal_val = train_test_split(
    coraal_train_val, test_size=dev_share, random_state=30
)

In [10]:
#To create separate manifests for voc
voc_val = shuffle(voc_val, random_state=29)[['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']]
voc_test = shuffle(voc_test, random_state=28)[['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']]
voc_train = shuffle(voc_train, random_state=27)[['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']]
print(len(voc_val))
print(len(voc_test))
print(len(voc_train))
#WARNING: do not write VOC manifests in this notebook.  Instead, use the create_manifest-slurm_voc notebook
# voc_val.to_csv("../manifests_slurm/voc_val_manifest.csv", index=False, header=False)
# voc_test.to_csv("../manifests_slurm/voc_test_manifest.csv", index=False, header=False)
# voc_train.to_csv("../manifests_slurm/voc_train_manifest.csv", index=False, header=False)

408
408
7327


In [11]:
# Total duration per VoC split (val, test, train): the `duration` sum divided by 3600.
for split in (voc_val, voc_test, voc_train):
    print(split["duration"].sum() / 3600)

1.86784
1.9334258333333334
33.25129694444444


In [12]:
#To create separate manifests for coraal
coraal_val = shuffle(coraal_val, random_state=29)[['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']]
coraal_test = shuffle(coraal_test, random_state=28)[['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']]
coraal_train = shuffle(coraal_train, random_state=27)[['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']]
print(len(coraal_val))
print(len(coraal_test))
print(len(coraal_train))

629
629
11308


In [13]:
# Total duration per CORAAL split (val, test, train): the `duration` sum divided by 3600.
for split in (coraal_val, coraal_test, coraal_train):
    print(split["duration"].sum() / 3600)

2.5724905555555555
2.5373119444444447
45.780607777777774


In [14]:
#select fraction to make size of CORAAL same as size of VOC (define size as total duration of audio)
SUBSET_FRAC = voc_val.duration.sum() / coraal_val.duration.sum()
SUBSET_FRAC

0.7260823546917243

In [15]:
# Down-sample every CORAAL split by SUBSET_FRAC (a fraction of rows, fixed seeds).
# NOTE: each name is rebound to a sample of itself, so running this cell a
# second time shrinks the splits again; re-run from the CORAAL shuffle cell first.
coraal_val = coraal_val.sample(frac=SUBSET_FRAC, random_state=29)
coraal_test = coraal_test.sample(frac=SUBSET_FRAC, random_state=28)
coraal_train = coraal_train.sample(frac=SUBSET_FRAC, random_state=27)

# Duration left in each split (val, test, train): the `duration` sum divided by 3600.
for split in (coraal_val, coraal_test, coraal_train):
    print(split["duration"].sum() / 3600)

1.864185277777778
1.8242063888888889
33.23975305555555


In [16]:
# Write the CORAAL manifests without a header row or index column.
for frame, path in (
    (coraal_val, "../manifests_slurm/coraal_val_manifest.csv"),
    (coraal_test, "../manifests_slurm/coraal_test_manifest.csv"),
    (coraal_train, "../manifests_slurm/coraal_train_manifest.csv"),
):
    frame.to_csv(path, index=False, header=False)

In [17]:
# Stack the CORAAL and VoC splits and shuffle each combined frame with the same seed.
combined_train = pd.concat([coraal_train, voc_train]).pipe(shuffle, random_state=1234)
combined_val = pd.concat([coraal_val, voc_val]).pipe(shuffle, random_state=1234)
combined_test = pd.concat([coraal_test, voc_test]).pipe(shuffle, random_state=1234)

In [18]:
# Total duration per combined split (train, val, test): the `duration` sum divided by 3600.
for split in (combined_train, combined_val, combined_test):
    print(split["duration"].sum() / 3600)

66.49105
3.7320252777777783
3.757632222222222


In [26]:
# Row counts of the combined splits: train, val, test.
for split in (combined_train, combined_val, combined_test):
    print(len(split))

15538
865
865


In [19]:
# Write the combined manifests without a header row or index column.
for frame, path in (
    (combined_train, "../manifests_slurm/combined_train_manifest.csv"),
    (combined_val, "../manifests_slurm/combined_val_manifest.csv"),
    (combined_test, "../manifests_slurm/combined_test_manifest.csv"),
):
    frame.to_csv(path, header=False, index=False)

### For debugging

In [20]:
# 1000-row sample of the CORAAL training split, written as a small debugging manifest.
n_rows = 1000
coraal_small = coraal_train.sample(n=n_rows, random_state=22)
coraal_small.to_csv(
    "../manifests_slurm/coraal_train_{}.csv".format(n_rows),
    index=False,
    header=False,
)

In [21]:
# 100-row sample of the CORAAL training split, written as a tiny debugging manifest.
n_rows = 100
coraal_tiny = coraal_train.sample(n=n_rows, random_state=22)
coraal_tiny.to_csv(
    "../manifests_slurm/coraal_train_{}.csv".format(n_rows),
    index=False,
    header=False,
)

In [22]:
# 100-row sample of the CORAAL validation split, written as a tiny debugging manifest.
n_rows = 100
coraal_val_tiny = coraal_val.sample(n=n_rows, random_state=22)
coraal_val_tiny.to_csv(
    "../manifests_slurm/coraal_val_{}.csv".format(n_rows),
    index=False,
    header=False,
)

In [23]:
# 1000-row sample of the VoC training split, written as a small debugging manifest.
# The rows carry /juice/scr/... paths, so the file is written to
# ../manifests_slurm/ like every other manifest produced by this notebook
# (it previously went to ../manifests/).
n_rows = 1000
voc_small = voc_train.sample(n=n_rows, random_state=21)
voc_small.to_csv(
    "../manifests_slurm/voc_train_{}.csv".format(n_rows),
    index=False,
    header=False,
)

In [24]:
# 100-row sample of the VoC training split, written as a tiny debugging manifest.
# The rows carry /juice/scr/... paths, so the file is written to
# ../manifests_slurm/ like every other manifest produced by this notebook
# (it previously went to ../manifests/).
n_rows = 100
voc_tiny = voc_train.sample(n=n_rows, random_state=21)
voc_tiny.to_csv(
    "../manifests_slurm/voc_train_{}.csv".format(n_rows),
    index=False,
    header=False,
)

In [25]:
# 100-row sample of the VoC validation split, written as a tiny debugging manifest.
n_rows = 100
voc_val_tiny = voc_val.sample(n=n_rows, random_state=21)
voc_val_tiny.to_csv(
    "../manifests_slurm/voc_val_{}.csv".format(n_rows),
    index=False,
    header=False,
)