# Data File Processing

#### Ryan Bales (@ryanbales)<br>ryan@balesofdata.com

***

### Import Packages

In [19]:
import os
import shutil

import pandas as pd
import sklearn.model_selection as skms

### Load Training Data

In [20]:
# Read the full training metadata table from disk
df_train_full = pd.read_csv(filepath_or_buffer="data/train.csv")

### Remove Training Examples that haven't been Manually Verified

In [21]:
# Keep only the rows whose 'manually_verified' flag equals 1
is_verified = df_train_full['manually_verified'].eq(1)
df_train_full = df_train_full.loc[is_verified]

In [22]:
# Print (rows, columns), then display the first five remaining rows
print(tuple(df_train_full.shape))
df_train_full.head(n=5)

(3710, 3)


Unnamed: 0,fname,label,manually_verified
1,001ca53d.wav,Saxophone,1
3,0033e230.wav,Glockenspiel,1
4,00353774.wav,Cello,1
6,003da8e5.wav,Knock,1
7,0048fd00.wav,Gunshot_or_gunfire,1


### Split Original Dataset into Data and Labels (X, y)

This is needed so that we can perform a stratified split, which keeps each label's proportion the same in the training and validation sets.

In [23]:
# y: the target column on its own; X: every remaining column
labels = df_train_full['label']
data = df_train_full.drop(columns=['label'])

### Shuffle and Split the Full Training Dataset

In [24]:
# train_test_split with `stratify=` runs a stratified shuffle split in sklearn,
# so every label keeps the same proportion in the training and validation sets.
# `skms` (sklearn.model_selection) is imported in the notebook's import cell.
X_train, X_validation, y_train, y_validation = skms.train_test_split(
    data, labels,
    test_size=0.2, train_size=0.8,
    random_state=42, stratify=labels,
)

# Sanity checks: every example lands in exactly one split, and features and
# labels stay paired within each split.
assert len(X_train) + len(X_validation) == len(data)
assert len(X_train) == len(y_train)
assert len(X_validation) == len(y_validation)

### Merge X and y for the Training Set and Validation Set after Shuffling

In [25]:
# Re-attach each split's labels (aligned on the shared row index) to its
# 'fname' and 'manually_verified' columns; column order is fname,
# manually_verified, label.
kept_columns = ['fname', 'manually_verified']
df_train = X_train[kept_columns].assign(label=y_train)
df_validation = X_validation[kept_columns].assign(label=y_validation)

In [26]:
# Print (rows, columns) of the training split, then display its first five rows
print(tuple(df_train.shape))
df_train.head(n=5)

(2968, 3)


Unnamed: 0,fname,manually_verified,label
6515,b22ea7c6.wav,1,Computer_keyboard
7487,cb7d9924.wav,1,Trumpet
5103,8a45aa41.wav,1,Gunshot_or_gunfire
7919,d6c9f529.wav,1,Tambourine
3531,5f011d45.wav,1,Finger_snapping


In [27]:
# Print (rows, columns) of the validation split, then display its first five rows
print(tuple(df_validation.shape))
df_validation.head(n=5)

(742, 3)


Unnamed: 0,fname,manually_verified,label
5792,9df6bdd3.wav,1,Glockenspiel
4055,6d3722db.wav,1,Gong
1346,2500d6e9.wav,1,Telephone
3536,5f20684f.wav,1,Cough
6810,b9dd574a.wav,1,Acoustic_guitar


### Save Split Datasets to CSV 

In [28]:
# Write the training split to CSV without the index column
df_train.to_csv(path_or_buf="data/train_split.csv", index=False)

In [29]:
# Write the validation split to CSV without the index column
df_validation.to_csv(path_or_buf="data/validation_split.csv", index=False)

### Move Audio Files based on which dataset split they are within

In [30]:
def move_audio_file(row, dataset_split_name, audio_root="data/audio_train"):
    """Move one audio file into ``<audio_root>/<split>/<label>/``.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'fname'`` (file name inside ``audio_root``)
        and ``'label'`` (name of the destination sub-folder), e.g. a row
        yielded by ``DataFrame.iterrows()``.
    dataset_split_name : str
        Name of the split folder, e.g. ``"train"`` or ``"validation"``.
    audio_root : str, optional
        Directory that holds the unsorted audio files and receives the
        split folders. Defaults to ``"data/audio_train"``.

    Raises
    ------
    OSError
        Propagated from ``shutil.move`` when the source file is missing and
        no copy exists at the destination either.
    """
    # Build Source and Dest Paths
    src_path = "{}/{}".format(audio_root, row['fname'])
    dest_folder = "{}/{}/{}".format(audio_root, dataset_split_name, row['label'])
    dest_path = "{}/{}".format(dest_folder, row['fname'])

    # A previous run already moved this file: skip it so the notebook can be
    # re-run from the top without failing on the first file.
    if not os.path.exists(src_path) and os.path.exists(dest_path):
        print("Skipping file {}, already moved to {}".format(src_path, dest_path))
        return

    print("Moving file {} to {}".format(src_path, dest_path))

    # Create Folder if it doesn't already exist (exist_ok avoids the
    # check-then-create race of a separate os.path.exists test)
    os.makedirs(dest_folder, exist_ok=True)

    # Move Audio File
    shutil.move(src_path, dest_path)

### Process Training Set Audio Files

In [31]:
# Move every training-split file into its label folder under "train"
for _, train_row in df_train.iterrows():
    move_audio_file(train_row, "train")

Moving file data/audio_train/b22ea7c6.wav to data/audio_train/train/Computer_keyboard/b22ea7c6.wav
Moving file data/audio_train/cb7d9924.wav to data/audio_train/train/Trumpet/cb7d9924.wav
Moving file data/audio_train/8a45aa41.wav to data/audio_train/train/Gunshot_or_gunfire/8a45aa41.wav
Moving file data/audio_train/d6c9f529.wav to data/audio_train/train/Tambourine/d6c9f529.wav
Moving file data/audio_train/5f011d45.wav to data/audio_train/train/Finger_snapping/5f011d45.wav
Moving file data/audio_train/a139a928.wav to data/audio_train/train/Clarinet/a139a928.wav
Moving file data/audio_train/5ed0b693.wav to data/audio_train/train/Finger_snapping/5ed0b693.wav
Moving file data/audio_train/05149415.wav to data/audio_train/train/Knock/05149415.wav
Moving file data/audio_train/ea964ecc.wav to data/audio_train/train/Double_bass/ea964ecc.wav
Moving file data/audio_train/b021b8f1.wav to data/audio_train/train/Knock/b021b8f1.wav
Moving file data/audio_train/e5c11a2e.wav to data/audio_train/train/C

### Process Validation Set Audio Files

In [32]:
# Move every validation-split file into its label folder under "validation"
for _, validation_row in df_validation.iterrows():
    move_audio_file(validation_row, "validation")

Moving file data/audio_train/9df6bdd3.wav to data/audio_train/validation/Glockenspiel/9df6bdd3.wav
Moving file data/audio_train/6d3722db.wav to data/audio_train/validation/Gong/6d3722db.wav
Moving file data/audio_train/2500d6e9.wav to data/audio_train/validation/Telephone/2500d6e9.wav
Moving file data/audio_train/5f20684f.wav to data/audio_train/validation/Cough/5f20684f.wav
Moving file data/audio_train/b9dd574a.wav to data/audio_train/validation/Acoustic_guitar/b9dd574a.wav
Moving file data/audio_train/e7cd4355.wav to data/audio_train/validation/Finger_snapping/e7cd4355.wav
Moving file data/audio_train/dcd37383.wav to data/audio_train/validation/Saxophone/dcd37383.wav
Moving file data/audio_train/e39f888d.wav to data/audio_train/validation/Tearing/e39f888d.wav
Moving file data/audio_train/85bd7e1b.wav to data/audio_train/validation/Violin_or_fiddle/85bd7e1b.wav
Moving file data/audio_train/b5ab5cdc.wav to data/audio_train/validation/Glockenspiel/b5ab5cdc.wav
Moving file data/audio_tra