In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

### Clean VoC and CORAAL Manifests

In [2]:
# Load the CORAAL manifest in one pass:
#   * strip the padding spaces from the " groundtruth_text_train " column name,
#   * set the AAVE flag column to 1 for every row,
#   * drop rows that have no training transcript.
coraal = (
    pd.read_csv("../manifests/coraal_manifest.csv")
    .rename(columns={" groundtruth_text_train ": "groundtruth_text_train"})
    .assign(AAVE=1)
    .dropna(subset=["groundtruth_text_train"])
)

In [3]:
# Load the VoC manifest and rename its transcript columns to the names used
# throughout the rest of the notebook.
voc = pd.read_csv("../manifests/voc_manifest.csv").rename(
    columns={
        "groundtruth_text": "groundtruth_text_raw",
        "cleaned_text": "groundtruth_text_train",
    }
)

# Prefix the audio/transcript paths with "../", set the AAVE flag column to 0
# for every row, and drop rows that have no training transcript.
voc = voc.assign(
    wav_file="../" + voc["wav_file"],
    txt_file="../" + voc["txt_file"],
    AAVE=0,
).dropna(subset=["groundtruth_text_train"])

# The id to filter on is the second "_"-separated token of the wav file name
# (e.g. "voc_0_part_1.wav" -> "0"). It is parsed from the file name only, not
# from the whole path, so the result no longer depends on how many underscores
# the directory names happen to contain.
# NOTE(review): assumes every wav file name follows voc_<id>_part_<k>.wav - confirm.
voc_speaker_id = (
    voc["wav_file"].str.rsplit("/", n=1).str.get(-1).str.split("_").str.get(1)
)
VOC_EXCLUDED_SPEAKERS = [
    "3",  # Remove Spenser Deardoff (transcript + wav file don't match up)
    "5",  # Remove Eric Heryford (wav file has some silences)
]
voc = voc[~voc_speaker_id.isin(VOC_EXCLUDED_SPEAKERS)]

In [4]:
# Preview the first rows of the cleaned VoC manifest.
voc.head()

Unnamed: 0,wav_file,txt_file,groundtruth_text_raw,groundtruth_text_train,duration,AAVE
0,../data_processed_voc/wav/voc_0_part_1.wav,../data_processed_voc/txt/voc_0_part_1.txt,He's also a historian. He's he is the presiden...,HE'S ALSO A HISTORIAN HE'S HE IS THE PRESIDENT...,6.006,0
1,../data_processed_voc/wav/voc_0_part_2.wav,../data_processed_voc/txt/voc_0_part_2.txt,Alright. That's good. So you were born in Redd...,ALRIGHT THAT'S GOOD SO YOU WERE BORN IN REDDIN...,6.433,0
2,../data_processed_voc/wav/voc_0_part_3.wav,../data_processed_voc/txt/voc_0_part_3.txt,Because we had just one hospital and back then...,BECAUSE WE HAD JUST ONE HOSPITAL AND BACK THEN...,8.561,0
3,../data_processed_voc/wav/voc_0_part_4.wav,../data_processed_voc/txt/voc_0_part_4.txt,actually south Redding uh wasn't uptown but it...,ACTUALLY SOUTH REDDING UH WASN'T UPTOWN BUT IT...,6.121,0
4,../data_processed_voc/wav/voc_0_part_5.wav,../data_processed_voc/txt/voc_0_part_5.txt,Well Lola A little uh Pine Street School that ...,WELL LOLA A LITTLE UH PINE STREET SCHOOL THAT ...,11.91,0


### Train/Dev/Test Split

In [5]:
# Share of each corpus held out for the test split.
FRACTION_TEST = 0.05
# Share of each corpus held out for the dev/validation split; also used to
# size the mini validation manifests relative to N_EXAMPLES.
FRACTION_DEV = 0.05

In [6]:
def _split_corpus(frame, test_seed, dev_seed):
    """Split one corpus into train / dev / test parts with fixed seeds.

    The test part is carved off first (FRACTION_TEST of the rows). The dev
    part is then taken from the remainder, with its size rescaled by
    1 / (1 - FRACTION_TEST) so that it is FRACTION_DEV of the original corpus.

    Returns (train_val, train, val, test).
    """
    train_val_part, test_part = train_test_split(
        frame, test_size=FRACTION_TEST, random_state=test_seed
    )
    train_part, val_part = train_test_split(
        train_val_part,
        test_size=FRACTION_DEV / (1 - FRACTION_TEST),
        random_state=dev_seed,
    )
    return train_val_part, train_part, val_part, test_part


# NOTE(review): rows are split individually, so segments cut from the same
# recording (e.g. voc_0_part_1 / voc_0_part_2) can end up in different splits -
# confirm this is acceptable or switch to a grouped split.
voc_train_val, voc_train, voc_val, voc_test = _split_corpus(voc, test_seed=33, dev_seed=32)
coraal_train_val, coraal_train, coraal_val, coraal_test = _split_corpus(coraal, test_seed=31, dev_seed=30)

In [7]:
# Columns written to every combined manifest, in output order.
manifest_columns = ['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']


def _merge_corpora(voc_part, coraal_part, seed):
    """Stack the VoC and CORAAL rows of one split, shuffle them with a fixed
    seed and keep only the manifest columns."""
    stacked = pd.concat([voc_part, coraal_part], axis=0)
    return shuffle(stacked, random_state=seed)[manifest_columns]


val = _merge_corpora(voc_val, coraal_val, seed=29)
test = _merge_corpora(voc_test, coraal_test, seed=28)
train = _merge_corpora(voc_train, coraal_train, seed=27)

In [8]:
# Write the three combined splits as CSV manifests without index or header row.
for split_name, split_frame in (("val", val), ("test", test), ("train", train)):
    split_frame.to_csv("../manifests/" + split_name + "_manifest.csv", index=False, header=False)

### Create Mini Manifests to Spec (Fixed Size and AAVE Fraction)

In [9]:
# Configuration for the first mini manifest.
FRACTION_AAVE = 0.5  # fraction of the mini training rows sampled from coraal_train
N_EXAMPLES = 1000  # total number of rows in the mini training set

In [10]:
manifest_columns = ['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']

# Number of rows to draw from each corpus. The VoC count is the complement of
# the CORAAL count so the two always add up to N_EXAMPLES. Truncating both
# products with int() can lose a row to floating-point error for some
# fractions (e.g. int(1000 * (1 - 0.8)) == 199).
n_coraal_mini = int(round(N_EXAMPLES * FRACTION_AAVE))
n_voc_mini = N_EXAMPLES - n_coraal_mini

# Seeded samples from the per-corpus training splits, reduced to the manifest columns.
coraal_train_mini = coraal_train.sample(n=n_coraal_mini, random_state=26)[manifest_columns]
voc_train_mini = voc_train.sample(n=n_voc_mini, random_state=25)[manifest_columns]

In [11]:
manifest_columns = ['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']

# Mini training set: stack both per-corpus samples and shuffle with a fixed seed.
train_mini = shuffle(
    pd.concat([coraal_train_mini, voc_train_mini], axis=0),
    random_state=24,
)[manifest_columns]

# Mini validation set: a seeded sample of the combined validation split, sized
# as FRACTION_DEV of N_EXAMPLES.
# NOTE(review): this sample does not depend on FRACTION_AAVE, although the
# output file name written later includes it - confirm that is intended.
n_val_mini = int(N_EXAMPLES * FRACTION_DEV)
val_mini = val.sample(n=n_val_mini, random_state=23)[manifest_columns]

In [12]:
# File-name suffix: <N_EXAMPLES>_<percent AAVE>_<percent non-AAVE>.
# Percentages are rounded, not truncated: int(0.29 * 100) is 28 and
# int((1 - 0.8) * 100) is 19 because of floating-point error. The second
# percentage is taken as the complement so the pair always sums to 100.
pct_aave = int(round(FRACTION_AAVE * 100))
pct_other = 100 - pct_aave
mini_suffix = "{}_{}_{}".format(N_EXAMPLES, pct_aave, pct_other)

# Write both mini manifests without index or header row.
train_mini.to_csv("../manifests/train_" + mini_suffix + ".csv", index=False, header=False)
val_mini.to_csv("../manifests/val_" + mini_suffix + ".csv", index=False, header=False)

In [13]:
# Configuration for the second mini manifest. This rebinds the same two names,
# so every later cell sees these values.
FRACTION_AAVE = 0.1  # fraction of the mini training rows sampled from coraal_train
N_EXAMPLES = 1000  # total number of rows in the mini training set

In [14]:
manifest_columns = ['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']

# Number of rows to draw from each corpus. The VoC count is the complement of
# the CORAAL count so the two always add up to N_EXAMPLES. Truncating both
# products with int() can lose a row to floating-point error for some
# fractions (e.g. int(1000 * (1 - 0.8)) == 199).
n_coraal_mini = int(round(N_EXAMPLES * FRACTION_AAVE))
n_voc_mini = N_EXAMPLES - n_coraal_mini

# Seeded samples from the per-corpus training splits, reduced to the manifest columns.
coraal_train_mini = coraal_train.sample(n=n_coraal_mini, random_state=26)[manifest_columns]
voc_train_mini = voc_train.sample(n=n_voc_mini, random_state=25)[manifest_columns]

In [15]:
manifest_columns = ['wav_file', 'txt_file', 'groundtruth_text_train', 'AAVE', 'groundtruth_text_raw', 'duration']

# Mini training set: stack both per-corpus samples and shuffle with a fixed seed.
train_mini = shuffle(
    pd.concat([coraal_train_mini, voc_train_mini], axis=0),
    random_state=24,
)[manifest_columns]

# Mini validation set: a seeded sample of the combined validation split, sized
# as FRACTION_DEV of N_EXAMPLES.
# NOTE(review): this sample does not depend on FRACTION_AAVE, although the
# output file name written later includes it - confirm that is intended.
n_val_mini = int(N_EXAMPLES * FRACTION_DEV)
val_mini = val.sample(n=n_val_mini, random_state=23)[manifest_columns]

In [16]:
# File-name suffix: <N_EXAMPLES>_<percent AAVE>_<percent non-AAVE>.
# Percentages are rounded, not truncated: int(0.29 * 100) is 28 and
# int((1 - 0.8) * 100) is 19 because of floating-point error. The second
# percentage is taken as the complement so the pair always sums to 100.
pct_aave = int(round(FRACTION_AAVE * 100))
pct_other = 100 - pct_aave
mini_suffix = "{}_{}_{}".format(N_EXAMPLES, pct_aave, pct_other)

# Write both mini manifests without index or header row.
train_mini.to_csv("../manifests/train_" + mini_suffix + ".csv", index=False, header=False)
val_mini.to_csv("../manifests/val_" + mini_suffix + ".csv", index=False, header=False)

### Small Single-Corpus Manifests for Debugging

In [17]:
# Small CORAAL-only manifest for quick debugging runs.
# coraal_train_mini holds only as many rows as the most recently run
# FRACTION_AAVE / N_EXAMPLES configuration produced, and DataFrame.sample
# raises ValueError when n exceeds the number of rows (replace=False).
# Cap the request at what is available and name the file after the actual row count.
n_coraal_debug = min(100, len(coraal_train_mini))
coraal_train_mini.sample(n=n_coraal_debug, random_state=22).to_csv("../manifests/coraal_train_"+str(n_coraal_debug)+".csv", index=False, header=False)

In [18]:
# Small VoC-only manifest for quick debugging runs.
# voc_train_mini holds only as many rows as the most recently run
# FRACTION_AAVE / N_EXAMPLES configuration produced, and DataFrame.sample
# raises ValueError when n exceeds the number of rows (replace=False).
# Cap the request at what is available and name the file after the actual row count.
n_voc_debug = min(100, len(voc_train_mini))
voc_train_mini.sample(n=n_voc_debug, random_state=21).to_csv("../manifests/voc_train_"+str(n_voc_debug)+".csv", index=False, header=False)