In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

### Clean VoC and CORAAL Manifests

In [2]:
# Build the CORAAL frame in one chain: read the manifest, rename the
# space-padded transcript column to its trimmed name, set the AAVE flag
# column to 1, and drop rows whose training transcript is missing.
coraal = (
    pd.read_csv("coraal_manifest.csv")
    .rename(columns={" groundtruth_text_train ": "groundtruth_text_train"})
    .assign(AAVE=1)
    .dropna(subset=["groundtruth_text_train"])
)

In [3]:
# Build the VoC frame in one chain: read the manifest, prefix both path
# columns with "../", rename the text columns to the names used by the
# CORAAL frame, set the AAVE flag column to 0, and drop rows whose training
# transcript is missing.
voc = (
    pd.read_csv("voc_manifest.csv")
    .assign(
        wav_file=lambda frame: "../" + frame["wav_file"],
        txt_file=lambda frame: "../" + frame["txt_file"],
    )
    .rename(
        columns={
            "groundtruth_text": "groundtruth_text_raw",
            "cleaned_text": "groundtruth_text_train",
        }
    )
    .assign(AAVE=0)
    .dropna(subset=["groundtruth_text_train"])
)

### Train/Dev/Test Split

In [4]:
# Held-out fractions used by the train/dev/test split below.
FRACTION_TEST = 0.05  # passed as test_size when the test set is split off each corpus
FRACTION_DEV = 0.05  # dev share; divided by (1 - FRACTION_TEST) for the second-stage split

In [5]:
# Two-stage split, done separately per corpus: first hold out the test rows,
# then carve the dev rows out of what remains. Because the second split only
# sees (1 - FRACTION_TEST) of the corpus, its test_size is rescaled so the dev
# set ends up as FRACTION_DEV of the *whole* corpus.
dev_share_of_remainder = FRACTION_DEV / (1 - FRACTION_TEST)

voc_train_val, voc_test = train_test_split(
    voc,
    test_size=FRACTION_TEST,
    random_state=33,
)
voc_train, voc_val = train_test_split(
    voc_train_val,
    test_size=dev_share_of_remainder,
    random_state=32,
)

coraal_train_val, coraal_test = train_test_split(
    coraal,
    test_size=FRACTION_TEST,
    random_state=31,
)
coraal_train, coraal_val = train_test_split(
    coraal_train_val,
    test_size=dev_share_of_remainder,
    random_state=30,
)

In [6]:
# Merge the per-corpus pieces of each split row-wise (pd.concat's default
# axis) and shuffle with a fixed seed so the two corpora are interleaved
# reproducibly.
train = shuffle(
    pd.concat([voc_train, coraal_train]),
    random_state=27,
)
val = shuffle(
    pd.concat([voc_val, coraal_val]),
    random_state=29,
)
test = shuffle(
    pd.concat([voc_test, coraal_test]),
    random_state=28,
)

In [7]:
# Persist the three combined splits; the index is omitted from the CSVs.
for manifest_path, split_frame in {
    "val_manifest.csv": val,
    "test_manifest.csv": test,
    "train_manifest.csv": train,
}.items():
    split_frame.to_csv(manifest_path, index=False)

### Create Mini Manifests to Spec (N_EXAMPLES rows, FRACTION_AAVE from CORAAL)

In [8]:
# Spec for the reduced ("mini") manifests built in the cells below.
FRACTION_AAVE = 0.5  # share of the mini training rows sampled from coraal_train (AAVE == 1)
N_EXAMPLES = 1000  # target row count of the mini training set; also scales the mini val sample

In [9]:
# Row counts for the two corpus slices of the mini training set.
# round() is used instead of int() because the float products are often a
# hair below the intended integer (e.g. 1000 * (1 - 0.9) evaluates just under
# 100) and int() would truncate them, silently dropping a row. The VoC count
# is derived by subtraction so the two slices always total N_EXAMPLES.
n_coraal_mini = round(N_EXAMPLES * FRACTION_AAVE)
n_voc_mini = N_EXAMPLES - n_coraal_mini

# Seeded draws without replacement from each corpus's training split.
coraal_train_mini = coraal_train.sample(n=n_coraal_mini, random_state=26)
voc_train_mini = voc_train.sample(n=n_voc_mini, random_state=25)

In [10]:
# Interleave the two sampled corpus slices into one seeded-shuffled frame.
train_mini = shuffle(
    pd.concat([coraal_train_mini, voc_train_mini], axis=0),
    random_state=24,
)

# Mini validation set: N_EXAMPLES * FRACTION_DEV rows drawn from the combined
# val split. round() rather than int() so a product that lands a hair below
# the intended integer (float representation error) is not truncated down.
# NOTE(review): this sample is taken from the mixed val frame, so its AAVE
# share is whatever the draw yields -- it is not controlled by FRACTION_AAVE,
# even though the output filename carries that ratio. Confirm this is intended.
val_mini = val.sample(n=round(N_EXAMPLES * FRACTION_DEV), random_state=23)

In [13]:
# Filename suffix "<N>_<pct AAVE>_<pct non-AAVE>.csv" shared by both outputs.
# Percentages use round() and a complement instead of int(): int() truncates
# float products, so e.g. int(0.29 * 100) yields 28 and int((1 - 0.9) * 100)
# yields 9, producing labels that neither match the spec nor sum to 100.
pct_aave = round(FRACTION_AAVE * 100)
pct_other = 100 - pct_aave
mini_suffix = f"{N_EXAMPLES}_{pct_aave}_{pct_other}.csv"

# Write the mini train/val manifests without the index column.
train_mini.to_csv("train_" + mini_suffix, index=False)
val_mini.to_csv("val_" + mini_suffix, index=False)