In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

### Clean VoC and Coraal Manifests

In [33]:
# Load the CORAAL manifest, strip the padding from the training-transcript
# column name, set the AAVE flag column to 1 for every row, and discard rows
# that have no training transcript.
coraal = (
    pd.read_csv("../manifests/coraal_manifest.csv")
    .rename(columns={" groundtruth_text_train ": "groundtruth_text_train"})
    .assign(AAVE=1)
    .dropna(subset=['groundtruth_text_train'])
)

In [34]:
# Preview the first five cleaned CORAAL rows.
coraal.head(n=5)

Unnamed: 0,wav_file,txt_file,groundtruth_text_raw,groundtruth_text_train,duration,AAVE
0,../data_processed/wav/ATL_se0_ag1_f_03_1_part_...,../data_processed/txt/ATL_se0_ag1_f_03_1_part_...,"Um, my mom used to be a school teacher, and th...",UM MY MOM USED TO BE A SCHOOL TEACHER AND THEN...,8.221,1
1,../data_processed/wav/ATL_se0_ag1_f_03_1_part_...,../data_processed/txt/ATL_se0_ag1_f_03_1_part_...,"And my dad, um, he did- he played soccer and h...",AND MY DAD UM HE DID HE PLAYED SOCCER AND HE U...,10.518,1
2,../data_processed/wav/ATL_se0_ag1_f_03_1_part_...,../data_processed/txt/ATL_se0_ag1_f_03_1_part_...,"Well he- he, um, was a seamstress and then whe...",WELL HE HE UM WAS A SEAMSTRESS AND THEN WHEN H...,5.274,1
3,../data_processed/wav/ATL_se0_ag1_f_03_1_part_...,../data_processed/txt/ATL_se0_ag1_f_03_1_part_...,<laugh> No. (laughing) A all white like your c...,NO A ALL WHITE LIKE YOUR CLOTHES ALL WHITE CLO...,5.589,1
4,../data_processed/wav/ATL_se0_ag1_f_03_1_part_...,../data_processed/txt/ATL_se0_ag1_f_03_1_part_...,"Um, I mean it's different things. I mean it de...",UM I MEAN IT'S DIFFERENT THINGS I MEAN IT DEPE...,10.441,1


In [35]:
# Load the VoC manifest and bring it into the same column layout as the CORAAL
# manifest: "../"-prefixed paths, raw/train transcript column names, AAVE flag = 0.
voc = pd.read_csv("../manifests/voc_manifest.csv")
voc["wav_file"] = "../" + voc["wav_file"]
voc["txt_file"] = "../" + voc["txt_file"]
voc = voc.rename(columns={"groundtruth_text": "groundtruth_text_raw", "cleaned_text": "groundtruth_text_train"})
voc["AAVE"] = 0
voc = voc.dropna(subset=['groundtruth_text_train'])

# Drop every segment of the excluded recordings. The id is parsed from the file
# name itself (".../voc_<id>_part_<k>.wav") instead of counting underscores over
# the whole path, so the filter no longer depends on how many underscores the
# directory name happens to contain.
EXCLUDED_VOC_IDS = [
    '3', #Remove Spenser Deardoff (transcript + wav file don't match up)
    '5', #Remove Eric Heryford (wav file has some silences)
]
voc_id = voc["wav_file"].str.extract(r"voc_(\d+)_part_[^/]*$", expand=False)
voc = voc[~voc_id.isin(EXCLUDED_VOC_IDS)]

#remove high loss data points
examples = [
    '../data_processed_voc/wav/voc_33_part_3.wav', #Transcript: UM
    '../data_processed_voc/wav/voc_2_part_1.wav', #Transcript: UM
    '../data_processed_voc/wav/voc_12_part_198.wav', # KILL
    '../data_processed_voc/wav/voc_0_part_330.wav', #KILL
    '../data_processed_voc/wav/voc_0_part_328.wav', # Basically silent
    '../data_processed_voc/wav/voc_10_part_414.wav', # Basically silent
    '../data_processed_voc/wav/voc_10_part_404.wav', #Transcript: UM
    '../data_processed_voc/wav/voc_2_part_4.wav', #Transcript: UM YEAH SURE
    '../data_processed_voc/wav/voc_8_part_228.wav', #KILL
    '../data_processed_voc/wav/voc_26_part_207.wav', #Transcript: Incorrect
    '../data_processed_voc/wav/voc_13_part_234.wav', #Transcript: Incorrect
    '../data_processed_voc/wav/voc_12_part_185.wav' #KILL
]
voc = voc[~voc["wav_file"].isin(examples)]

In [36]:
# Preview the first five cleaned VoC rows.
voc.head(n=5)

Unnamed: 0,wav_file,txt_file,groundtruth_text_raw,groundtruth_text_train,duration,AAVE
0,../data_processed_voc/wav/voc_0_part_1.wav,../data_processed_voc/txt/voc_0_part_1.txt,He's also a historian. He's he is the presiden...,HE'S ALSO A HISTORIAN HE'S HE IS THE PRESIDENT...,6.006,0
1,../data_processed_voc/wav/voc_0_part_2.wav,../data_processed_voc/txt/voc_0_part_2.txt,Alright. That's good. So you were born in Redd...,ALRIGHT THAT'S GOOD SO YOU WERE BORN IN REDDIN...,6.433,0
2,../data_processed_voc/wav/voc_0_part_3.wav,../data_processed_voc/txt/voc_0_part_3.txt,Because we had just one hospital and back then...,BECAUSE WE HAD JUST ONE HOSPITAL AND BACK THEN...,8.561,0
3,../data_processed_voc/wav/voc_0_part_4.wav,../data_processed_voc/txt/voc_0_part_4.txt,actually south Redding uh wasn't uptown but it...,ACTUALLY SOUTH REDDING UH WASN'T UPTOWN BUT IT...,6.121,0
4,../data_processed_voc/wav/voc_0_part_5.wav,../data_processed_voc/txt/voc_0_part_5.txt,Well Lola A little uh Pine Street School that ...,WELL LOLA A LITTLE UH PINE STREET SCHOOL THAT ...,11.91,0


### Train/Dev/Test Split

In [37]:
# Share of each corpus held out for the test split and for the dev split.
FRACTION_TEST, FRACTION_DEV = 0.05, 0.05

In [38]:
# Two-stage split per corpus: take the test share first, then carve the dev
# share out of what is left. Dividing by (1 - FRACTION_TEST) rescales
# FRACTION_DEV so the dev set is that fraction of the whole corpus rather than
# of the remainder.
# NOTE(review): rows are split individually, so parts of the same recording can
# end up in different splits — confirm this is acceptable.
dev_share_of_remainder = FRACTION_DEV / (1 - FRACTION_TEST)

voc_train_val, voc_test = train_test_split(
    voc, test_size=FRACTION_TEST, random_state=33
)
voc_train, voc_val = train_test_split(
    voc_train_val, test_size=dev_share_of_remainder, random_state=32
)

coraal_train_val, coraal_test = train_test_split(
    coraal, test_size=FRACTION_TEST, random_state=31
)
coraal_train, coraal_val = train_test_split(
    coraal_train_val, test_size=dev_share_of_remainder, random_state=30
)

In [39]:
#To create separate manifests for voc
voc_val = shuffle(voc_val, random_state=29)[['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']]
voc_test = shuffle(voc_test, random_state=28)[['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']]
voc_train = shuffle(voc_train, random_state=27)[['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']]
print(len(voc_val))
print(len(voc_test))
print(len(voc_train))
voc_val.to_csv("../manifests/voc_val_manifest.csv", index=False, header=False)
voc_test.to_csv("../manifests/voc_test_manifest.csv", index=False, header=False)
voc_train.to_csv("../manifests/voc_train_manifest.csv", index=False, header=False)

408
408
7342


In [40]:
# Summed `duration` per VoC split divided by 3600 (hours, assuming the column
# holds seconds), in val / test / train order.
for voc_split in (voc_val, voc_test, voc_train):
    print(voc_split.duration.sum()/3600)

1.8595530555555555
1.8892536111111111
33.40277555555556


In [41]:
#To create separate manifests for coraal
coraal_val = shuffle(coraal_val, random_state=29)[['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']]
coraal_test = shuffle(coraal_test, random_state=28)[['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']]
coraal_train = shuffle(coraal_train, random_state=27)[['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']]
print(len(coraal_val))
print(len(coraal_test))
print(len(coraal_train))

629
629
11308


In [42]:
# Summed `duration` per CORAAL split divided by 3600 (hours, assuming the
# column holds seconds), in val / test / train order.
for coraal_split in (coraal_val, coraal_test, coraal_train):
    print(coraal_split.duration.sum()/3600)

2.5724905555555555
2.5373119444444447
45.780607777777774


In [43]:
#select fraction to make size of CORAAL same as size of VOC (define size as total duration of audio)
SUBSET_FRAC = voc_val.duration.sum() / coraal_val.duration.sum()
SUBSET_FRAC

0.7228609844804527

In [44]:
# Keep a SUBSET_FRAC share of the rows of every CORAAL split (fixed seed per
# split), then print the remaining duration per split divided by 3600.
# NOTE: the splits are overwritten with their own subsample, so re-running this
# cell without re-running the cells above shrinks them a second time.
coraal_val = coraal_val.sample(frac=SUBSET_FRAC, random_state=29)
coraal_test = coraal_test.sample(frac=SUBSET_FRAC, random_state=28)
coraal_train = coraal_train.sample(frac=SUBSET_FRAC, random_state=27)

for coraal_split in (coraal_val, coraal_test, coraal_train):
    print(coraal_split.duration.sum()/3600)

1.8563100000000001
1.8168497222222222
33.080753333333334


In [45]:
# Write the CORAAL splits as CSV manifests without header row or index column.
for coraal_split, coraal_manifest_path in (
    (coraal_val, "../manifests/coraal_val_manifest.csv"),
    (coraal_test, "../manifests/coraal_test_manifest.csv"),
    (coraal_train, "../manifests/coraal_train_manifest.csv"),
):
    coraal_split.to_csv(coraal_manifest_path, index=False, header=False)

### Create Combined Manifests

In [46]:
# Combined splits: stack the VoC rows on top of the CORAAL rows, shuffle the
# stacked frame with a fixed seed, and keep the manifest column order.
combined_columns = ['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']

val_stacked = pd.concat([voc_val, coraal_val], axis=0)
test_stacked = pd.concat([voc_test, coraal_test], axis=0)
train_stacked = pd.concat([voc_train, coraal_train], axis=0)

val = shuffle(val_stacked, random_state=29)[combined_columns]
test = shuffle(test_stacked, random_state=28)[combined_columns]
train = shuffle(train_stacked, random_state=27)[combined_columns]

In [47]:
# Write the combined splits as CSV manifests without header row or index column.
for combined_split, combined_manifest_path in (
    (val, "../manifests/val_manifest.csv"),
    (test, "../manifests/test_manifest.csv"),
    (train, "../manifests/train_manifest.csv"),
):
    combined_split.to_csv(combined_manifest_path, index=False, header=False)

### Create Manifests to Specs

In [9]:
# Spec of the small manifest: share of training rows drawn from CORAAL and the
# total number of training rows.
FRACTION_AAVE, N_EXAMPLES = 0.5, 1000

In [10]:
# Draw the mini training rows: a FRACTION_AAVE share of N_EXAMPLES from CORAAL
# and the rest from VoC, each with its own fixed seed.
# The CORAAL count is rounded and the VoC count is the remainder, so the two
# always add up to N_EXAMPLES. Truncating each product with int() can lose a
# row to floating-point error (1000 * (1 - 0.8) truncates to 199).
mini_columns = ['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']
n_coraal_mini = int(round(N_EXAMPLES * FRACTION_AAVE))
n_voc_mini = N_EXAMPLES - n_coraal_mini
coraal_train_mini = coraal_train.sample(n=n_coraal_mini, random_state=26)[mini_columns]
voc_train_mini = voc_train.sample(n=n_voc_mini, random_state=25)[mini_columns]

In [11]:
# Mini training set: stack the CORAAL and VoC samples and shuffle them with a
# fixed seed. Mini validation set: N_EXAMPLES * FRACTION_DEV rows from `val`.
# NOTE(review): val_mini is sampled from the combined `val` frame, so its AAVE
# share does not follow FRACTION_AAVE — confirm this is intended.
mini_manifest_columns = ['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']

train_mini_stacked = pd.concat([coraal_train_mini, voc_train_mini], axis=0)
train_mini = shuffle(train_mini_stacked, random_state=24)[mini_manifest_columns]

n_val_mini = int(N_EXAMPLES * FRACTION_DEV)
val_mini = val.sample(n=n_val_mini, random_state=23)[mini_manifest_columns]

In [12]:
# Write the mini manifests as "<split>_<N_EXAMPLES>_<AAVE %>_<other %>.csv",
# without header row or index column.
# The AAVE percentage is rounded and the other percentage is its complement, so
# the two always sum to 100. Truncating (1 - FRACTION_AAVE) * 100 with int()
# mislabels the file when float error lands just below a whole number
# (0.8 would give "_80_19").
pct_aave = int(round(FRACTION_AAVE * 100))
pct_other = 100 - pct_aave
mini_suffix = str(N_EXAMPLES) + "_" + str(pct_aave) + "_" + str(pct_other) + ".csv"
train_mini.to_csv("../manifests/train_" + mini_suffix, index=False, header=False)
val_mini.to_csv("../manifests/val_" + mini_suffix, index=False, header=False)

In [13]:
# Second spec: same total number of training rows, with a 0.1 share drawn from
# CORAAL.
FRACTION_AAVE, N_EXAMPLES = 0.1, 1000

In [14]:
# Draw the mini training rows: a FRACTION_AAVE share of N_EXAMPLES from CORAAL
# and the rest from VoC, each with its own fixed seed.
# The CORAAL count is rounded and the VoC count is the remainder, so the two
# always add up to N_EXAMPLES. Truncating each product with int() can lose a
# row to floating-point error (1000 * (1 - 0.8) truncates to 199).
mini_columns = ['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']
n_coraal_mini = int(round(N_EXAMPLES * FRACTION_AAVE))
n_voc_mini = N_EXAMPLES - n_coraal_mini
coraal_train_mini = coraal_train.sample(n=n_coraal_mini, random_state=26)[mini_columns]
voc_train_mini = voc_train.sample(n=n_voc_mini, random_state=25)[mini_columns]

In [15]:
# Mini training set: stack the CORAAL and VoC samples and shuffle them with a
# fixed seed. Mini validation set: N_EXAMPLES * FRACTION_DEV rows from `val`.
# NOTE(review): val_mini is sampled from the combined `val` frame, so its AAVE
# share does not follow FRACTION_AAVE — confirm this is intended.
mini_manifest_columns = ['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']

train_mini_stacked = pd.concat([coraal_train_mini, voc_train_mini], axis=0)
train_mini = shuffle(train_mini_stacked, random_state=24)[mini_manifest_columns]

n_val_mini = int(N_EXAMPLES * FRACTION_DEV)
val_mini = val.sample(n=n_val_mini, random_state=23)[mini_manifest_columns]

In [16]:
# Write the mini manifests as "<split>_<N_EXAMPLES>_<AAVE %>_<other %>.csv",
# without header row or index column.
# The AAVE percentage is rounded and the other percentage is its complement, so
# the two always sum to 100. Truncating (1 - FRACTION_AAVE) * 100 with int()
# mislabels the file when float error lands just below a whole number
# (0.8 would give "_80_19").
pct_aave = int(round(FRACTION_AAVE * 100))
pct_other = 100 - pct_aave
mini_suffix = str(N_EXAMPLES) + "_" + str(pct_aave) + "_" + str(pct_other) + ".csv"
train_mini.to_csv("../manifests/train_" + mini_suffix, index=False, header=False)
val_mini.to_csv("../manifests/val_" + mini_suffix, index=False, header=False)

### For debugging

In [17]:
# 100-row CORAAL-only manifest for debugging, sampled from the current
# coraal_train_mini with a fixed seed.
# NOTE: coraal_train_mini is whatever the last "specs" cell left behind; with
# fewer than 100 rows in it, sample(n=100) raises.
coraal_debug_rows = coraal_train_mini.sample(n=100, random_state=22)
coraal_debug_rows.to_csv("../manifests/coraal_train_"+str(100)+".csv", index=False, header=False)

In [18]:
# 100-row VoC-only manifest for debugging, sampled from the current
# voc_train_mini with a fixed seed.
# NOTE: voc_train_mini is whatever the last "specs" cell left behind; with
# fewer than 100 rows in it, sample(n=100) raises.
voc_debug_rows = voc_train_mini.sample(n=100, random_state=21)
voc_debug_rows.to_csv("../manifests/voc_train_"+str(100)+".csv", index=False, header=False)