In [1]:
import os
import pandas as pd
import numpy as np
import librosa, librosa.display
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import random

import torch 
import torch.nn as nn 
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchvision.transforms import transforms
from sklearn.preprocessing import StandardScaler 

### Identify Relevant Audio Files

In [38]:
meta = pd.read_csv(r'C:\Users\sally\Documents\Fall 2020\CIS 519 - Intro to Machine Learning\Project\audioclassification_meta.csv')
meta

Unnamed: 0,VoxCeleb1 ID,VGGFace1 ID,Gender,Nationality,Set
0,id10001,A.J._Buckley,m,Ireland,dev
1,id10002,A.R._Rahman,m,India,dev
2,id10003,Aamir_Khan,m,India,dev
3,id10004,Aaron_Tveit,m,USA,dev
4,id10005,Aaron_Yoo,m,USA,dev
...,...,...,...,...,...
1206,id11247,Zachary_Levi,m,USA,dev
1207,id11248,Zachary_Quinto,m,USA,dev
1208,id11249,Zack_Snyder,m,USA,dev
1209,id11250,Zoe_Saldana,f,USA,dev


In [39]:
directory = r'C:\Users\sally\Documents\vox1_dev_wav\wav'
filenames = []
for foldername in os.listdir(directory):
    folder_dir = os.path.join(directory,foldername)
    for subfoldername in os.listdir(folder_dir):
        subfolder_dir = os.path.join(folder_dir,subfoldername)
        for filename in os.listdir(subfolder_dir):
            file = os.path.join(subfolder_dir,filename)
            filenames.append((foldername,file))

In [40]:
files = pd.DataFrame(filenames)
files.rename(columns={0:'ID',1:'file'},inplace=True)

In [41]:
wav_df = meta.merge(files,left_on='VoxCeleb1 ID',right_on='ID')[['ID','Gender','Nationality','file']]

In [42]:
wav_df

Unnamed: 0,ID,Gender,Nationality,file
0,id10001,m,Ireland,C:\Users\sally\Documents\vox1_dev_wav\wav\id10...
1,id10001,m,Ireland,C:\Users\sally\Documents\vox1_dev_wav\wav\id10...
2,id10001,m,Ireland,C:\Users\sally\Documents\vox1_dev_wav\wav\id10...
3,id10001,m,Ireland,C:\Users\sally\Documents\vox1_dev_wav\wav\id10...
4,id10001,m,Ireland,C:\Users\sally\Documents\vox1_dev_wav\wav\id10...
...,...,...,...,...
148637,id11251,f,USA,C:\Users\sally\Documents\vox1_dev_wav\wav\id11...
148638,id11251,f,USA,C:\Users\sally\Documents\vox1_dev_wav\wav\id11...
148639,id11251,f,USA,C:\Users\sally\Documents\vox1_dev_wav\wav\id11...
148640,id11251,f,USA,C:\Users\sally\Documents\vox1_dev_wav\wav\id11...


In [43]:
# Obtain list of nationalities and count of gender
nationality_gender_count=pd.DataFrame(wav_df.groupby('Nationality')['Gender'].nunique()).reset_index()
nationality_gender_count=nationality_gender_count[nationality_gender_count['Gender'] > 1]['Nationality']

In [44]:
# Keep only nationalities with samples from both genders
wav_df=wav_df.merge(nationality_gender_count,left_on='Nationality',right_on='Nationality')

### Get Samples

In [45]:
# Get list of nationalities
listNat=wav_df['Nationality'].unique()

In [46]:
# Cap samples at 500 max
columns=['ID', 'Gender', 'Nationality','file']
samples = pd.DataFrame(columns=columns)
for i in range(len(listNat)):
    total = wav_df[wav_df['Nationality'] == listNat[i]].count()['ID']
    num = int(0.5 * total)
    if num < 500:
        samp = wav_df[wav_df['Nationality'] == listNat[i]].sample(num,random_state=42)
    else:
        samp = wav_df[wav_df['Nationality'] == listNat[i]].sample(500,random_state=42)
    samples = samples.append(samp)

In [47]:
samples.groupby('Nationality').nunique()

Unnamed: 0_level_0,ID,Gender,Nationality,file
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Australia,37,2,1,500
Canada,52,2,1,500
Chile,3,2,1,91
China,2,2,1,188
Croatia,3,2,1,144
Denmark,3,2,1,141
Germany,9,2,1,500
India,25,2,1,500
Ireland,15,2,1,500
Italy,8,2,1,274


### Feature Extraction

#### Unused Section: Leave just in case

In [None]:
# Unused
def extract_features(signal, sr, n_fft, hop_length, n_mfcc):    
    # Extract short-time fourier transform
    stft = librosa.core.stft(signal, hop_length=hop_length, n_fft=n_fft)
    
    # Extract log spectrogram
    log_spectrogram = extract_spectrogram(stft)
    
    # Extract MFCC
    MFCC = librosa.feature.mfcc(signal, n_fft=n_fft, hop_length=hop_length, n_mfcc=n_mfcc)
    
    # Extract chromagram
    chromagram = librosa.feature.chroma_stft(signal, sr=sr, hop_length=hop_length)
    
    # Extract harmonics and percussion
    harmonics, percussion = extract_harmonics_percussion(stft)
    
    return log_spectrogram, MFCC, chromagram, harmonics, percussion

In [None]:
def extract_spectrogram(stft):
    spectrogram = np.abs(stft)
    log_spectrogram = librosa.amplitude_to_db(spectrogram) #amplitude as a function of time and frequency
    
    return log_spectrogram

In [None]:
def extract_harmonics_percussion(stft):
    harm, perc = librosa.decompose.hpss(stft)
    harm = librosa.amplitude_to_db(np.abs(harm))
    perc = librosa.amplitude_to_db(np.abs(perc))
    
    return harm, perc

#### Feature Extraction: Helper Functions

In [51]:
# Filter frequency using FFT
def filter_signal(signal):
    # Take the Fourier transform of the data
    F = np.fft.fft(signal)

    # Filter out any with magnitude < 20
    F_filtered = np.array([0.0 if np.abs(x) < 20 else x for x in F])

    # Reconstruct the filtered signal
    filtered_signal = np.fft.ifft(F_filtered)
    filtered_signal = np.array([float(x) for x in filtered_signal])
    
    return filtered_signal

In [52]:
def get_spectrogram(file, sr, n_fft, hop_length):
    # Get signal from file
    signal, sampling_rate = librosa.load(file, sr=sr, duration=3)
    
    # Filter out noise
    filt_signal = filter_signal(signal)
    
    # Extract short-time fourier transform
    stft = librosa.core.stft(filt_signal, hop_length=hop_length, n_fft=n_fft)
    
    # Extract log spectrogram
    log_spectrogram = extract_spectrogram(stft)
    
    return log_spectrogram

In [None]:
def get_MFCC(file, n_mfcc):
    # Get signal from file
    signal, sampling_rate = librosa.load(file)
    
    # Filter out noise
    filt_signal = filter_signal(signal)
    
    # Extract MFCC
    MFCC = librosa.feature.mfcc(filt_signal, sr=sampling_rate, n_mfcc=n_mfcc)
#     MFCC_processed = np.mean(MFCC.T,axis=0)
    
    return MFCC

In [None]:
def get_MFCC_old(file, n_fft, hop_length, n_mfcc):
    # Get signal from file
    signal, sampling_rate = librosa.load(file)
    
    # Filter out noise
    filt_signal = filter_signal(signal)
    
    # Extract MFCC
    MFCC = librosa.feature.mfcc(filt_signal, n_fft=n_fft, hop_length=hop_length, n_mfcc=n_mfcc)
    MFCC_processed = np.mean(MFCC.T,axis=0)
    
    return MFCC_processed

In [None]:
def get_chromagram(file, sr, hop_length):
    # Get signal from file
    signal, sampling_rate = librosa.load(file, sr=sr, duration=3)
    
    # Filter out noise
    filt_signal = filter_signal(signal)
    
    # Extract chromagram
    chromagram = librosa.feature.chroma_stft(filt_signal, sr=sr, hop_length=hop_length)
    
    return chromagram

In [55]:
def get_harmonics_percussion(signal):    
    # Get signal from file
    signal, sampling_rate = librosa.load(file)
    
    # Filter out noise
    filt_signal = filter_signal(signal)
    
    # Extract short-time fourier transform
    stft = librosa.core.stft(filt_signal)
    
    # Extract harmonics and percussion
    harmonics, percussion = librosa.decompose.hpss(stft)
    harmonics = librosa.amplitude_to_db(np.abs(harmonics))
    percussion = librosa.amplitude_to_db(np.abs(percussion))
    
    return harmonics, percussion

#### Feature Extraction: Actual Extraction

In [None]:
# Define features
sr=8000                  # sampling rate
n_fft=2048               # number of samples
hop_length=512           # amount we shift each fourier transfer to the right
n_mfcc=13                # number of MFCCs to extract

In [48]:
# Obtain labels (as strings)
labels = np.array(samples['Nationality'])
gender = np.array(samples['Gender'])

In [None]:
# Extract spectrograms to start
features_spectrogram = samples['file'].apply(lambda x: get_spectrogram(x,sr,n_fft,hop_length))

In [None]:
# Extract MFCCs
features_MFCC = samples['file'].apply(lambda x: get_MFCC(x,n_mfcc))

In [None]:
# Extract chromagrams
features_chromagram = samples['file'].apply(lambda x: get_chromagram(x,sr,hop_length))

In [57]:
# Extract harmonics and percussion
features_harm, features_perc = samples['file'].apply(lambda x: get_harmonics_percussion(x))

  filtered_signal = np.array([float(x) for x in filtered_signal])


ValueError: too many values to unpack (expected 2)

In [None]:
# Cast spectrogram series into array
arr_features_spectrogram = np.array(features_spectrogram)

# Save spectrograms
np.save('features_spectrogram.npy', arr_features_spectrogram)
np.save('labels.npy', labels)

In [None]:
# Cast MFCCs series into array
arr_features_MFCCs = np.array(features_MFCC)

# Save MFCCs
# np.save('features_MFCC.npy', arr_features_MFCCs)
# np.save('labels.npy', labels)

np.save('features_MFCC_13_unflattened.npy', arr_features_MFCCs)
np.save('labels_40.npy', labels)

In [None]:
# Cast chromagram series into array
arr_features_chromagram = np.array(features_chromagram)

# Save MFCCs
np.save('features_chromagram.npy', arr_features_chromagram)
np.save('labels.npy', labels)

In [58]:
# Cast harmonics and percussion series into array
arr_features_harm = np.array(features_harm)
arr_features_perc = np.array(features_perc)

# Save MFCCs
np.save('features_harmonics.npy', arr_features_harm)
np.save('features_percussion.npy', arr_features_perc)
np.save('labels.npy', labels)

NameError: name 'features_harm' is not defined

### Modeling

#### Load and Manipulate Features

In [71]:
# Reload features when needed
# features=np.load('features_spectrogram.npy',allow_pickle=True)
features_MFCC=np.load('features_MFCC_40.npy',allow_pickle=True)
features_chromagram=np.load('features_chromagram.npy',allow_pickle=True)
labels_str=np.load('labels_40.npy',allow_pickle=True)

In [72]:
# Flatten features
features_MFCC_flattened = []
features_chromagram_flattened = []
for i in range(len(features_MFCC)):
    features_MFCC_flattened.append(features_MFCC[i].flatten())
    features_chromagram_flattened.append(features_chromagram[i].flatten())

features_MFCC_flattened = np.array(features_MFCC_flattened)
features_chromagram_flattened = np.array(features_chromagram_flattened)

In [70]:
scaler = StandardScaler()
features_MFCC_flattened=scaler.fit_transform(features_MFCC_flattened)

scaler1 = StandardScaler()
features_chromagram_flattened=scaler1.fit_transform(features_chromagram_flattened)

In [73]:
# Combine features
features = []
for i in range(len(features_MFCC_flattened)):
    combo = np.append(features_chromagram_flattened[i],features_MFCC_flattened[i])
    features.append(combo)
    
features=np.array(features)

In [74]:
# Turn labels into numeric values, create dictionary to map back later 
le = preprocessing.LabelEncoder()
le.fit(labels_str)
labels=le.transform(labels_str)

In [75]:
# Cast array to type float64 instead of object for tensors to work
features = [np.array(list(x),dtype=np.float64) for x in features]
features = np.array(features,dtype=np.float64)

labels = [np.long(x) for x in labels]
labels = np.array(labels,dtype=np.long)

#### Split into train and test sets: NORMAL

In [31]:
# Get train and test datasets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=142)

#### Split into train and test sets: ZERO-SHOT

In [76]:
features_male = []
labels_male = []
features_female = []
labels_female = []

for i in range(len(gender)):
    if gender[i] == 'm':
        features_male.append(features[i])
        labels_male.append(labels[i])
    else:
        features_female.append(features[i])
        labels_female.append(labels[i])
        
features_male = np.array(features_male)
features_female = np.array(features_female)
labels_male = np.array(labels_male)
labels_female = np.array(labels_female)

In [77]:
labels_male

array([8, 8, 8, ..., 5, 5, 5])

In [61]:
len(labels_female)

2672

In [78]:
# Get train and test datasets
X_train, X_val, y_train, y_val = train_test_split(features_male, labels_male, test_size=0.3, random_state=42)
_, X_test, _, y_test = train_test_split(features_female, labels_female, test_size=0.3, random_state=142)

#### Modeling: Helper Functions

In [63]:
def compute_loss_and_accuracy(network, data_loader):
    total_loss = 0
    num_correct = 0
    num_instances = 0

    cross_entropy_loss = torch.nn.CrossEntropyLoss()

    for X, y in data_loader:
        with torch.no_grad():
            y_pred = network(X)
            total_loss += cross_entropy_loss(y_pred,y).item() * X.size(0)

        for i in range(len(y_pred)):
            predicted = torch.argmax(y_pred[i])
            actual = y[i]

            if predicted == actual:
                num_correct += 1
                
        num_instances += X.size(0)
  
    accuracy = num_correct / num_instances
    average_loss = total_loss / num_instances

    return accuracy, average_loss

In [64]:
def run_experiment(network, train_data_loader, valid_data_loader, test_data_loader, optimizer):
    train_losses = []
    valid_accs = []

    cross_entropy_loss = torch.nn.CrossEntropyLoss()

    for epoch in range(100):
        print('Epoch: ' + str(epoch))
        total_loss = 0.0
        num_instances = 0

        for X, y in train_data_loader:
            optimizer.zero_grad()
            y_pred = network(X)

            loss = cross_entropy_loss(y_pred,y)
            total_loss+=loss.item() * X.size(0)
            loss.backward()

            optimizer.step()

            num_instances += X.size(0)

        train_loss = total_loss / num_instances
        train_acc, _ = compute_loss_and_accuracy(network, train_data_loader)
        valid_acc, _ = compute_loss_and_accuracy(network, valid_data_loader)
        print("Train accuracy: ",train_acc)
        print("Valid accuracy: ",valid_acc)

        train_losses.append(train_loss)
        valid_accs.append(valid_acc)
    test_acc, _ = compute_loss_and_accuracy(network, test_data_loader)
    print("Test accuracy: ",test_acc)
    
    return train_losses, valid_accs

#### MFCC Modeling

In [65]:
class Sequential(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.network=nn.Sequential(
            nn.Linear(604,256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 20),
            nn.Tanh()
        )
    def forward(self, X):
        return self.network(X)

In [79]:
# Initialize tensors - Sequential
X_train_tensor = torch.tensor(X_train, dtype=torch.float)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

X_val_tensor = torch.tensor(X_val, dtype=torch.float)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

X_test_tensor = torch.tensor(X_test, dtype=torch.float)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)


train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_data_loader = DataLoader(train_dataset, batch_size=64,shuffle=True)

valid_dataset = TensorDataset(X_val_tensor, y_val_tensor)
valid_data_loader = DataLoader(valid_dataset, batch_size=64)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_data_loader = DataLoader(test_dataset, batch_size=64)

In [None]:
network_mfcc = Sequential()
sgd = torch.optim.Adam(network_mfcc.parameters(), lr=0.0001)

train_losses, valid_accs = run_experiment(network_mfcc, train_data_loader, valid_data_loader, test_data_loader, sgd)

Epoch: 0
Train accuracy:  0.07978339350180505
Valid accuracy:  0.08501683501683502
Epoch: 1
Train accuracy:  0.05812274368231047
Valid accuracy:  0.06060606060606061
Epoch: 2
Train accuracy:  0.11263537906137185
Valid accuracy:  0.10101010101010101
Epoch: 3
Train accuracy:  0.14693140794223827
Valid accuracy:  0.12205387205387205
Epoch: 4
Train accuracy:  0.1523465703971119
Valid accuracy:  0.12457912457912458
Epoch: 5
Train accuracy:  0.1548736462093863
Valid accuracy:  0.13131313131313133
Epoch: 6
Train accuracy:  0.16462093862815885
Valid accuracy:  0.12794612794612795
Epoch: 7
Train accuracy:  0.17833935018050542
Valid accuracy:  0.15572390572390574
Epoch: 8
Train accuracy:  0.17942238267148014
Valid accuracy:  0.15151515151515152
Epoch: 9
Train accuracy:  0.1888086642599278
Valid accuracy:  0.15993265993265993
Epoch: 10
Train accuracy:  0.19422382671480146
Valid accuracy:  0.1590909090909091
Epoch: 11
Train accuracy:  0.20541516245487365
Valid accuracy:  0.1675084175084175
Epoch: 

#### Spectogram: IGNORE UNUSED

In [None]:
class Convolutional(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(in_channels=1,out_channels=10,kernel_size=2,stride=1,padding=0)
        self.pool1 = torch.nn.MaxPool2d(kernel_size=2,stride=2)
        
        self.conv2 = torch.nn.Conv2d(in_channels=10, out_channels=32, kernel_size=2, stride=1, padding=0)
        self.pool2 = torch.nn.MaxPool2d(kernel_size=2,stride=2)
        
        self.fc1 = torch.nn.Linear(in_features=32*2*11,out_features=100)
        self.drop1 = torch.nn.Dropout(0.2)
        self.fc2 = torch.nn.Linear(in_features=100,out_features=50)
        self.drop2 = torch.nn.Dropout(0.2)
        self.fc3 = torch.nn.Linear(in_features=50,out_features=20)


    def forward(self, X):
        batch_size = 64
        X = self.conv1(X)
        X = self.pool1(X)
        X = self.conv2(X)
        X = self.pool2(X)
        X = X.relu()
        X = X.view(batch_size, -1)
        X = self.fc1(X)
        X - self.drop1(X)
        X = X.relu()
        X = self.fc2(X)
        X - self.drop2(X)
        X = X.relu()
        X = self.fc3(X)

        return X

In [None]:
# class Sequential(torch.nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.network=nn.Sequential(
#             nn.Linear(40,128),
#             nn.ReLU(),
#             nn.Dropout(0.5),
#             nn.Linear(128, 256),
#             nn.ReLU(),
#             nn.Dropout(0.5),
#             nn.Linear(256, 512),
#             nn.ReLU(),
#             nn.Dropout(0.5),
#             nn.Linear(512,64),
#             nn.ReLU(),
#             nn.Dropout(0.5),
#             nn.Linear(64,20),
#             nn.Tanh()
#         )
#     def forward(self, X):
#         return self.network(X)

In [None]:
# Initialize tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float)
X_train_tensor = X_train_tensor.reshape([X_train.shape[0],1,X_train.shape[1],X_train.shape[2]])
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

X_test_tensor = torch.tensor(X_test, dtype=torch.float)
X_test_tensor = X_test_tensor.reshape([X_test.shape[0],1,X_test.shape[1],X_test.shape[2]])
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_data_loader = DataLoader(train_dataset, batch_size=64,shuffle=True)

valid_dataset = TensorDataset(X_test_tensor, y_test_tensor)
valid_data_loader = DataLoader(valid_dataset, batch_size=64)

In [None]:
network = Convolutional()
sgd = torch.optim.SGD(network.parameters(), lr=0.001)

train_losses, valid_accs = run_experiment(network, train_data_loader, valid_data_loader, sgd)

In [None]:
# Randomly drop some to make batch sizes even
# drop = random.sample(range(0,X_train.shape[0]),4608)
# X_train=X_train[list(drop)]
# y_train=y_train[list(drop)]

# drop = random.sample(range(0,X_test.shape[0]),1984)
# X_test=X_test[list(drop)]
# y_test=y_test[list(drop)]