In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import random

import librosa
import librosa.display # must be explicitly imported
import IPython.display as ipd  # for playing .wav files in nb

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Silence two known, noisy warnings. `message` is a regular expression that is
# matched against the start of the warning text.
import warnings
warnings.filterwarnings("ignore", message="Numerical issues were encountered ")
warnings.filterwarnings("ignore", message="lbfgs failed to converge ")

In [3]:
%%capture
from tqdm.notebook import tqdm
from time import sleep
tqdm().pandas()

In [None]:
# Import CSV (one row per recorded utterance)
# Source: http://www.openslr.org/83/
df = pd.read_csv('line_index_all.csv', names=['lineID', 'filename', 'transcript'])

# Derive the speaker / source / label codes from fixed-width filename prefixes,
# using vectorised string accessors instead of row-wise list comprehensions.
df['filename'] = df['filename'].str.strip()  # Remove whitespace
df['speaker'] = df['filename'].str[:9]       # e.g. 'wef_12484'
df['source'] = df['filename'].str[:3]        # three-letter code indicating speaker dialect+sex (& directory)
df['label'] = df['source'].str[:2]           # two-letter code indicating dialect (label)
df

In [None]:
# Dataframes (for iterating): one subset of the line index per three-letter
# source code (dialect + sex).
(df_wef, df_wem,    # Welsh English Female / Male
 df_mif, df_mim,    # Midlands English Female / Male
 df_nof, df_nom,    # Northern English Female / Male
 df_scf, df_scm,    # Scottish English Female / Male
 df_sof, df_som) = (  # Southern English Female / Male
    df[df.source == code]
    for code in ('wef', 'wem', 'mif', 'mim', 'nof', 'nom', 'scf', 'scm', 'sof', 'som')
)

# Dictionaries (for referencing): subset, human-readable dialect label and
# the directory that holds the subset's .wav files.
dict_wef = dict(df=df_wef, label='Welsh', path='welsh_english_female/')
dict_wem = dict(df=df_wem, label='Welsh', path='welsh_english_male/')
dict_mif = dict(df=df_mif, label='Midlands', path='midlands_english_female/')
dict_mim = dict(df=df_mim, label='Midlands', path='midlands_english_male/')
dict_nof = dict(df=df_nof, label='Northern', path='northern_english_female/')
dict_nom = dict(df=df_nom, label='Northern', path='northern_english_male/')
dict_scf = dict(df=df_scf, label='Scottish', path='scottish_english_female/')
dict_scm = dict(df=df_scm, label='Scottish', path='scottish_english_male/')
dict_sof = dict(df=df_sof, label='Southern', path='southern_english_female/')
dict_som = dict(df=df_som, label='Southern', path='southern_english_male/')

In [None]:
# Speaker list (for shuffling later): unique speaker IDs per dialect,
# female speakers first, then male speakers.

speaker_list_we = [*df_wef.speaker.unique(), *df_wem.speaker.unique()]
speaker_list_mi = [*df_mif.speaker.unique(), *df_mim.speaker.unique()]
speaker_list_no = [*df_nof.speaker.unique(), *df_nom.speaker.unique()]
speaker_list_sc = [*df_scf.speaker.unique(), *df_scm.speaker.unique()]
speaker_list_so = [*df_sof.speaker.unique(), *df_som.speaker.unique()]

In [None]:
# Generate column names: 'mfcc_01' ... 'mfcc_16' (zero-padded so they sort
# naturally), plus matching names for the delta and delta-delta features.

colnames_mfcc16 = [f'mfcc_{number:02d}' for number in range(1, 17)]  # n=16 MFCCs
colnames_mfcc16_del = [f'{base}_del' for base in colnames_mfcc16]
colnames_mfcc16_del2 = [f'{base}_del2' for base in colnames_mfcc16]

# Prepare class list (in alphabetical order)
labels = ['Midlands', 'Northern', 'Scottish', 'Southern', 'Welsh']

In [None]:
# For getting MFCC data (in different folders)

def get_mfccs(dictionary):
    """Append per-recording mean MFCC, delta and delta-delta features to a frame.

    Parameters
    ----------
    dictionary : dict
        Must provide ``'df'`` (a DataFrame with a ``filename`` column) and
        ``'path'`` (directory prefix, including the trailing slash, under
        which ``<filename>.wav`` is found). Other keys are ignored.

    Returns
    -------
    pandas.DataFrame
        The input frame with three groups of columns appended, named by the
        module-level ``colnames_mfcc16``, ``colnames_mfcc16_del`` and
        ``colnames_mfcc16_del2`` lists. Each value is one coefficient averaged
        over all frames of the recording.
    """
    df = dictionary['df']
    path = dictionary['path']

    all_mfccs = []
    all_mfccs_delta = []
    all_mfccs_delta2 = []

    for filename in df['filename']:

        # 1. Generate path
        full_path = path + str(filename) + '.wav'

        # 2. Import audio file
        y, sr = librosa.load(full_path, sr=22050)

        # 3. Calculate MFCCs and their first/second-order deltas
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=16)
        mfccs_delta = librosa.feature.delta(mfccs)
        mfccs_delta2 = librosa.feature.delta(mfccs, order=2)

        # 4. Get mean of each coefficient over the recording
        all_mfccs.append([coef.mean() for coef in mfccs])
        all_mfccs_delta.append([coef.mean() for coef in mfccs_delta])
        # Store the delta-delta means here; appending the first-order means
        # again would make every *_del2 column a copy of its *_del column.
        all_mfccs_delta2.append([coef.mean() for coef in mfccs_delta2])

    # 5. Convert list of lists to array, then df.
    # The reshape keeps a (0, n_columns) shape when the input frame is empty.
    n_cols = len(colnames_mfcc16)
    a_mfccs = np.array(all_mfccs).reshape(-1, n_cols)
    a_mfccs_delta = np.array(all_mfccs_delta).reshape(-1, n_cols)
    a_mfccs_delta2 = np.array(all_mfccs_delta2).reshape(-1, n_cols)

    df_mfccs = pd.DataFrame(a_mfccs, index=df.index, columns=colnames_mfcc16)
    df_mfccs_delta = pd.DataFrame(a_mfccs_delta, index=df.index, columns=colnames_mfcc16_del)
    df_mfccs_delta2 = pd.DataFrame(a_mfccs_delta2, index=df.index, columns=colnames_mfcc16_del2)

    # 6. Join with original df (aligned on the original index)
    return pd.concat([df, df_mfccs, df_mfccs_delta, df_mfccs_delta2], axis=1)

In [None]:
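# 3.1 Calculate MFCC data
# Run/pickle folders in individual cells in case of error/timeout
# NOTE: the filenames written below (e.g. "df_wef_mfcc16s.pkl") do not match the ones
# read back in the unpickling cell (e.g. "df_mfcc16s_wef.pkl"); align them before re-running.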

# df_wef_plus = get_mfccs(dict_wef)
# df_wef_plus.to_pickle("df_wef_mfcc16s.pkl") 

# df_wem_plus = get_mfccs(dict_wem)
# df_wem_plus.to_pickle("df_wem_mfcc16s.pkl")

# df_mif_plus = get_mfccs(dict_mif)
# df_mif_plus.to_pickle("df_mif_mfcc16s.pkl")

# df_mim_plus = get_mfccs(dict_mim)
# df_mim_plus.to_pickle("df_mim_mfcc16s.pkl")

# df_nof_plus = get_mfccs(dict_nof)
# df_nof_plus.to_pickle("df_nof_mfcc16s.pkl")

# df_nom_plus = get_mfccs(dict_nom)
# df_nom_plus.to_pickle("df_nom_mfcc16s.pkl")

# df_scf_plus = get_mfccs(dict_scf)
# df_scf_plus.to_pickle("df_scf_mfcc16s.pkl")

# df_scm_plus = get_mfccs(dict_scm)
# df_scm_plus.to_pickle("df_scm_mfcc16s.pkl")

# df_sof_plus = get_mfccs(dict_sof)
# df_sof_plus.to_pickle("df_sof_mfcc16s.pkl")

# df_som_plus = get_mfccs(dict_som)
# df_som_plus.to_pickle("df_som_mfcc16s.pkl")

In [4]:
# Unpickling (available 22 Oct)
# Cached per-source MFCC frames; the files are listed in the same order as the
# variables they are unpacked into.
mfcc_pickle_files = [
    "df_mfcc16s_wef.pkl", "df_mfcc16s_wem.pkl",
    "df_mfcc16s_mif.pkl", "df_mfcc16s_mim.pkl",
    "df_mfcc16s_nof.pkl", "df_mfcc16s_nom.pkl",
    "df_mfcc16s_scf.pkl", "df_mfcc16s_scm.pkl",
    "df_mfcc16s_sof.pkl", "df_mfcc16s_som.pkl",
]
(df_wef_plus, df_wem_plus,
 df_mif_plus, df_mim_plus,
 df_nof_plus, df_nom_plus,
 df_scf_plus, df_scm_plus,
 df_sof_plus, df_som_plus) = map(pd.read_pickle, mfcc_pickle_files)

## Join Formant Data Here

In [7]:
# Load the cached per-source formant tables.
df_formants_mif = pd.read_pickle("df_formants_mif.pkl")
df_formants_mim = pd.read_pickle("df_formants_mim.pkl")
df_formants_nof = pd.read_pickle("df_formants_nof.pkl")
df_formants_nom = pd.read_pickle("df_formants_nom.pkl")
df_formants_wef = pd.read_pickle("df_formants_wef.pkl")
df_formants_wem = pd.read_pickle("df_formants_wem.pkl")
df_formants_scf = pd.read_pickle("df_formants_scf.pkl")
df_formants_scm = pd.read_pickle("df_formants_scm.pkl")
df_formants_sof = pd.read_pickle("df_formants_sof.pkl")
df_formants_som = pd.read_pickle("df_formants_som.pkl")

# Left-join on filename so every MFCC row is kept even when a recording has
# no formant data.
join_opts = dict(how='left', on='filename')
df_mif_new = df_mif_plus.merge(df_formants_mif, **join_opts)
df_mim_new = df_mim_plus.merge(df_formants_mim, **join_opts)
df_nof_new = df_nof_plus.merge(df_formants_nof, **join_opts)
df_nom_new = df_nom_plus.merge(df_formants_nom, **join_opts)
df_wef_new = df_wef_plus.merge(df_formants_wef, **join_opts)
df_wem_new = df_wem_plus.merge(df_formants_wem, **join_opts)
df_scf_new = df_scf_plus.merge(df_formants_scf, **join_opts)
df_scm_new = df_scm_plus.merge(df_formants_scm, **join_opts)
df_sof_new = df_sof_plus.merge(df_formants_sof, **join_opts)
df_som_new = df_som_plus.merge(df_formants_som, **join_opts)

In [8]:
# Column labels of the formant table, taken from one representative frame.
# NOTE(review): the merge key 'filename' is presumably among these labels;
# confirm and drop it before using this as a feature list.
colnames_formants = df_formants_mif.columns # or any of the other 9 df_formants

In [13]:
# Join dialects: stack the ten per-source frames row-wise.
# NOTE(review): ignore_index is not passed, so row labels are carried over
# from each per-source frame rather than renumbered.
per_source_frames = [
    df_wef_new, df_wem_new, df_mif_new, df_mim_new,
    df_nof_new, df_nom_new, df_scf_new, df_scm_new,
    df_sof_new, df_som_new,
]
df_full = pd.concat(per_source_frames, axis=0)

# Create gender column (for splitting; optional): third letter of the source code
gender_codes = [source_code[2] for source_code in df_full.source]
df_full.insert(5, 'gender', gender_codes)

# Save df
df_full.to_pickle("df_mfcc16s_and_formants.pkl")

In [10]:
# Preview the combined MFCC + formant frame
df_full

Unnamed: 0,lineID,filename,transcript,speaker,source,label,mfcc_01,mfcc_02,mfcc_03,mfcc_04,...,stdevF0Hz,HNR,f1_mean,f2_mean,f3_mean,f4_mean,f1_median,f2_median,f3_median,f4_median
0,EN1223,wef_12484_01482829612,The sun provides energy,wef_12484,wef,we,-562.580688,24.975183,0.755791,6.428880,...,35.282550,13.250868,529.795441,1637.269021,2548.379542,3541.352741,417.659197,1596.514763,2557.510779,3406.419096
1,BI0113,wef_12484_01345932698,Zoe invited you to an event,wef_12484,wef,we,-514.508240,15.943781,-1.808712,3.045948,...,40.844751,13.191537,546.224534,1558.481714,2681.426868,3729.430666,502.368223,1452.792429,2810.206103,3582.478817
2,EN0971,wef_12484_00999757777,This is the cinematic superhero showdown you'...,wef_12484,wef,we,-405.116608,32.621223,-6.160967,6.723876,...,42.070621,10.352992,542.049284,1808.089588,2688.133074,3706.177346,509.912337,1780.273535,2758.173458,3727.909596
3,EN0026,wef_12484_00036278823,That quick beige fox jumped in the air over e...,wef_12484,wef,we,-483.667175,24.377972,-4.140918,1.188637,...,45.779007,9.964671,493.555471,1490.730341,2671.403633,3703.034379,453.064034,1287.211165,2788.178771,3570.442999
4,EN1472,wef_12484_00458512623,The song was accompanied by two music videos ...,wef_12484,wef,we,-443.509949,35.082932,2.530008,9.773186,...,36.453331,12.757771,500.340203,1744.409480,2694.012575,3729.724127,429.443855,1712.022396,2803.271911,3774.079076
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4326,EN0042,som_06592_00422956963,With all three components in place every vill...,som_06592,som,so,-332.575592,96.253632,-19.428852,23.528687,...,44.157808,7.196177,469.159100,1501.592852,2249.981369,3385.485037,443.552473,1497.377339,2205.968224,3348.351560
4327,EN1121,som_06136_01223762368,Removing birch bark from live trees is harmfu...,som_06136,som,so,-387.752228,64.548615,-7.041558,7.452435,...,42.034814,9.997102,403.423064,1457.487616,2523.495276,3196.894440,336.509621,1386.380536,2582.487143,3185.067299
4328,LN0024,som_03349_00420644955,The fastest way to Canning Town from Prince R...,som_03349,som,so,-385.490540,64.015572,6.825639,1.314303,...,16.565639,5.437319,472.811506,1305.030800,2283.955380,3223.510609,491.211943,1283.808064,2324.563606,3280.827999
4329,EN0037,som_03397_02006793154,The value of a pleasure or pain considered by...,som_03397,som,so,-357.888031,57.314758,10.759276,13.919282,...,37.719492,8.657012,416.923480,1623.392803,2545.589289,3514.070977,374.027193,1570.797725,2559.576775,3487.527229


## Model Development

In [15]:
# Load the combined MFCC + formant dataframe from its pickle
df_full = pd.read_pickle("df_mfcc16s_and_formants.pkl")
df_full.head(20)

Unnamed: 0,lineID,filename,transcript,speaker,source,gender,label,mfcc_01,mfcc_02,mfcc_03,...,stdevF0Hz,HNR,f1_mean,f2_mean,f3_mean,f4_mean,f1_median,f2_median,f3_median,f4_median
0,EN1223,wef_12484_01482829612,The sun provides energy,wef_12484,wef,f,we,-562.580688,24.975183,0.755791,...,35.28255,13.250868,529.795441,1637.269021,2548.379542,3541.352741,417.659197,1596.514763,2557.510779,3406.419096
1,BI0113,wef_12484_01345932698,Zoe invited you to an event,wef_12484,wef,f,we,-514.50824,15.943781,-1.808712,...,40.844751,13.191537,546.224534,1558.481714,2681.426868,3729.430666,502.368223,1452.792429,2810.206103,3582.478817
2,EN0971,wef_12484_00999757777,This is the cinematic superhero showdown you'...,wef_12484,wef,f,we,-405.116608,32.621223,-6.160967,...,42.070621,10.352992,542.049284,1808.089588,2688.133074,3706.177346,509.912337,1780.273535,2758.173458,3727.909596
3,EN0026,wef_12484_00036278823,That quick beige fox jumped in the air over e...,wef_12484,wef,f,we,-483.667175,24.377972,-4.140918,...,45.779007,9.964671,493.555471,1490.730341,2671.403633,3703.034379,453.064034,1287.211165,2788.178771,3570.442999
4,EN1472,wef_12484_00458512623,The song was accompanied by two music videos ...,wef_12484,wef,f,we,-443.509949,35.082932,2.530008,...,36.453331,12.757771,500.340203,1744.40948,2694.012575,3729.724127,429.443855,1712.022396,2803.271911,3774.079076
5,EN0005,wef_12484_00445516961,People look but no one ever finds it,wef_12484,wef,f,we,-513.143066,25.488348,-5.102324,...,34.213347,15.095316,532.271532,1568.467093,2749.893469,3657.997672,519.8132,1644.019299,2857.527498,3701.813757
6,EN1147,wef_12484_00262279005,The prints sharply contrast agricultural labo...,wef_12484,wef,f,we,-403.142761,37.218693,-4.58058,...,37.353219,11.051552,545.538458,1615.554156,2653.821615,3673.981722,471.236321,1704.268543,2783.194839,3637.039236
7,EN0002,wef_12484_00495324118,The rainbow is a division of white light into...,wef_12484,wef,f,we,-446.475189,32.38781,-1.474239,...,40.391881,11.918533,515.819649,1720.094366,2631.215946,3652.570516,441.420812,1732.978589,2793.259606,3494.125412
8,EN1324,wef_12484_01838258949,One of the evidenced passages is referenced w...,wef_12484,wef,f,we,-391.445831,34.998974,-4.933591,...,40.483473,10.082171,517.97295,1588.97421,2606.115761,3632.442898,440.703041,1595.863541,2685.533061,3665.699624
9,EN0223,wef_12484_01703308889,At least one root language is required,wef_12484,wef,f,we,-499.259735,26.725002,-1.514112,...,36.57891,15.042919,532.537254,1392.387831,2532.647481,3617.548236,445.506915,1353.237193,2620.437845,3481.332555


In [None]:
# Label for OVR comparisons: one 0/1 indicator column per dialect code

ovr_columns = {
    'is_scottish': 'sc',
    'is_welsh': 'we',
    'is_midlands': 'mi',
    'is_northern': 'no',
    'is_southern': 'so',
}
for column, code in ovr_columns.items():
    df_full[column] = np.where(df_full['label'] == code, 1, 0)

 1. **Dataset must be split according to speaker ID**. Randomly splitting entries would put most speakers in both the train and test datasets. MFCC features would then appear to perform extremely well, because MFCCs are highly correlated between utterances of any given speaker; but that amounts to voice matching rather than dialect classification, and out-of-sample data/speakers would be poorly identified.

In [None]:
SEED = 42
TRAIN_FRACTION = 0.8  # 80:20 split, applied within each dialect

# Seed both generators: `random.shuffle` below draws from the stdlib `random`
# module, which `np.random.seed` alone does not affect.
np.random.seed(SEED)
random.seed(SEED)


def _split_speakers(speakers, train_fraction=TRAIN_FRACTION):
    """Return (train, test) speaker lists from a shuffled copy of `speakers`.

    A copy is shuffled so the input list is left untouched and re-running the
    cell gives the same split.
    """
    shuffled = list(speakers)
    random.shuffle(shuffled)
    cut = int(len(shuffled) * train_fraction)
    return shuffled[:cut], shuffled[cut:]


speakers_we_train, speakers_we_test = _split_speakers(speaker_list_we)
speakers_mi_train, speakers_mi_test = _split_speakers(speaker_list_mi)
speakers_no_train, speakers_no_test = _split_speakers(speaker_list_no)
speakers_so_train, speakers_so_test = _split_speakers(speaker_list_so)
speakers_sc_train, speakers_sc_test = _split_speakers(speaker_list_sc)

speakers_test = speakers_we_test+speakers_mi_test+speakers_no_test+speakers_so_test+speakers_sc_test
speakers_train = speakers_we_train+speakers_mi_train+speakers_no_train+speakers_so_train+speakers_sc_train

# The whole point of this split is that no speaker is seen in both sets.
assert not set(speakers_train) & set(speakers_test), "speaker present in both train and test"

df_train = df_full[df_full.speaker.isin(speakers_train)]
df_test = df_full[df_full.speaker.isin(speakers_test)]

print(f'Entries (speakers) in Train dataset: {df_train.shape[0]} ({len(speakers_train)})')
print(f'Entries (speakers) in Test dataset:   {df_test.shape[0]} ({len(speakers_test)})')

2. Determining the best k for KNN across all five classes:

In [None]:
# Features and one-vs-rest targets (e.g. y_sc = Scottish v. rest)
X = df_train[colnames_mfcc16]
y_sc = df_train['is_scottish']
y_we = df_train['is_welsh']
y_mi = df_train['is_midlands']
y_no = df_train['is_northern']
y_so = df_train['is_southern']

k_range = list(range(1,31))

# Evaluation order matches the lists filled below: sc, so, no, we, mi.
ovr_targets = {'sc': y_sc, 'so': y_so, 'no': y_no, 'we': y_we, 'mi': y_mi}
k_scores = {code: [] for code in ovr_targets}

# NOTE(review): cv=10 is given no `groups`, so utterances of one speaker can
# presumably land in both the fitting and validation folds; confirm whether a
# speaker-grouped splitter is wanted here.
for k in tqdm(k_range):
    knn = KNeighborsClassifier(n_neighbors=k)
    for code, target in ovr_targets.items():
        scores = cross_val_score(knn, X, target, cv=10, scoring='f1')
        k_scores[code].append(scores.mean())

# Mean cross-validated F1 per k, one list per dialect
k_scores_sc = k_scores['sc']
k_scores_so = k_scores['so']
k_scores_no = k_scores['no']
k_scores_we = k_scores['we']
k_scores_mi = k_scores['mi']

In [None]:
# Mean cross-validated F1 against k for each one-vs-rest target.
fig, ax = plt.subplots()

# Pass k_range explicitly: without x values matplotlib plots against the list
# index (0-29), shifting every point one to the left of its actual k (1-30).
ax.plot(k_range, k_scores_sc, color = 'green', label = 'Scottish')
ax.plot(k_range, k_scores_we, color = 'blue', label = 'Welsh')
ax.plot(k_range, k_scores_no, color = 'red', label = 'Northern')
ax.plot(k_range, k_scores_mi, color = 'yellow', label = 'Midlands')
ax.plot(k_range, k_scores_so, color = 'black', label = 'Southern')
ax.legend(loc = 'upper right')
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('Cross-Validated F1 Score')
plt.show()
In [None]:
# Check performance of k selected in cell above

knn = KNeighborsClassifier(n_neighbors=8)
X = df_train[colnames_mfcc16]
y_sc, y_we, y_mi, y_no, y_so = (
    df_train[column]
    for column in ('is_scottish', 'is_welsh', 'is_midlands', 'is_northern', 'is_southern')
)

# 10-fold cross-validated F1 for each one-vs-rest target
scores_sc, scores_we, scores_mi, scores_no, scores_so = (
    cross_val_score(knn, X, target, cv=10, scoring='f1')
    for target in (y_sc, y_we, y_mi, y_no, y_so)
)

print(f'CV scores for Scottish: Mean={np.mean(scores_sc)}, \n List: {scores_sc}')
print(f'CV scores for Welsh:    Mean={np.mean(scores_we)}, \n List: {scores_we}')
print(f'CV scores for Midlands: Mean={np.mean(scores_mi)}, \n List: {scores_mi}')
print(f'CV scores for Northern: Mean={np.mean(scores_no)}, \n List: {scores_no}')
print(f'CV scores for Southern: Mean={np.mean(scores_so)}, \n List: {scores_so}')

# Unweighted average of the five per-class means
mean_CV = np.mean([
    fold_scores.mean()
    for fold_scores in (scores_sc, scores_we, scores_mi, scores_so, scores_no)
])

print(f'\n Mean CV across all five classes: {mean_CV}')

3. Delta-MFCCs and delta-delta-MFCCs **do not** add (much) predictive value, and even reduce F1 for some classes.

In [None]:
# Multi-class KNN on the speaker-held-out split.
# Swap in colnames_mfcc16_del or colnames_mfcc16_del2 to compare feature sets.
feature_cols = colnames_mfcc16
X_train, y_train = df_train[feature_cols], df_train['label']
X_test, y_test = df_test[feature_cols], df_test['label']

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# The tick labels rely on `labels` being in the same sorted order as the
# two-letter codes in y_test (mi, no, sc, so, we).
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues',
            xticklabels=labels, yticklabels=labels, ax=ax)
print(classification_report(y_test, y_pred, target_names=labels))

4. Remove Midlands due to the small number of entries available: just one female and two male speakers in the test dataset at 80:20! Reducing to four classes should make the classification problem simpler. In the same vein, perhaps only ~1/3 of the Southern entries should be included, or class weights should be penalized less severely: there is a clear bias towards Southern as the majority class. For example:

In [None]:
# KNN (k=5) on the current train/test split
knn5 = KNeighborsClassifier(n_neighbors=5)
y_pred = knn5.fit(X_train, y_train).predict(X_test)

# Confusion matrix heatmap and per-class precision/recall/F1
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues',
            xticklabels=labels, yticklabels=labels, ax=ax)
report = classification_report(y_test, y_pred, target_names=labels)
print(report)

Continued in STREER.04Classification.ModelDevelopment.ipynb

### Reference: Early model performance
#### MFCCs only; Midlands class included

In [None]:
# Features: the 16 mean MFCCs; target: two-letter dialect code
feature_cols = colnames_mfcc16
X_train, X_test = df_train[feature_cols], df_test[feature_cols]
y_train, y_test = df_train['label'], df_test['label']

In [None]:
# Baseline 1: KNN with k=5
knn5 = KNeighborsClassifier(n_neighbors=5)
y_pred = knn5.fit(X_train, y_train).predict(X_test)

# Confusion matrix heatmap and per-class report
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues',
            xticklabels=labels, yticklabels=labels, ax=ax)
report = classification_report(y_test, y_pred, target_names=labels)
print(report)

In [None]:
# Baseline 2: logistic regression (liblinear solver, default class weights)
logit = LogisticRegression(solver='liblinear')
y_pred = logit.fit(X_train, y_train).predict(X_test)

# Confusion matrix heatmap and per-class report
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues',
            xticklabels=labels, yticklabels=labels, ax=ax)
report = classification_report(y_test, y_pred, target_names=labels)
print(report)

In [None]:
# Baseline 3: logistic regression with balanced class weights
logit = LogisticRegression(solver='liblinear', class_weight='balanced')
y_pred = logit.fit(X_train, y_train).predict(X_test)

# Confusion matrix heatmap and per-class report
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues',
            xticklabels=labels, yticklabels=labels, ax=ax)
report = classification_report(y_test, y_pred, target_names=labels)
print(report)

In [None]:
# Baseline 4: decision tree, depth capped at 16.
# random_state is fixed so the reported scores are reproducible on re-run
# instead of depending on the state of the global NumPy generator.
dt = DecisionTreeClassifier(max_depth=16, random_state=42)

dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

# Check predictions
fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap='Blues',
            fmt='g', xticklabels=labels, yticklabels=labels, ax=ax)
print(classification_report(y_test, y_pred, target_names=labels))

In [None]:
# Baseline 5: random forest (100 trees, balanced class weights).
# random_state is fixed so the reported scores are reproducible on re-run
# instead of depending on the state of the global NumPy generator.
rf100 = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

rf100.fit(X_train, y_train)
y_pred = rf100.predict(X_test)

# Check predictions
fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap='Blues',
            fmt='g', xticklabels=labels, yticklabels=labels, ax=ax)
print(classification_report(y_test, y_pred, target_names=labels))

### Reference: Multi-class ROC-AUC

In [None]:
def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"):
    """Compute a one-vs-rest ROC AUC for every class found in `actual_class`.

    Parameters
    ----------
    actual_class : iterable
        True class labels.
    pred_class : iterable
        Predicted class labels (hard predictions, not scores), in the same
        order as `actual_class`.
    average : str, default "macro"
        Passed through to `roc_auc_score`.

    Returns
    -------
    dict
        Maps each class in `actual_class` to its one-vs-rest ROC AUC.
    """
    # Materialise once so one-shot iterables survive both the set() call and
    # the per-class passes below.
    actual_class = list(actual_class)
    pred_class = list(pred_class)

    roc_auc_dict = {}
    for per_class in set(actual_class):
        # Mark the current class as 1 and everything else as 0. Comparing with
        # the class itself keeps a predicted label that never occurs in
        # `actual_class` from counting as a positive for every class.
        new_actual_class = [1 if x == per_class else 0 for x in actual_class]
        new_pred_class = [1 if x == per_class else 0 for x in pred_class]

        roc_auc_dict[per_class] = roc_auc_score(new_actual_class, new_pred_class, average = average)

    return roc_auc_dict

In [None]:
# Cross-validated ROC AUC on shuffled rows.
# NOTE(review): `y` is not assigned in any earlier cell of this notebook (only
# y_train, y_test and the one-vs-rest y_* targets are), so this cell fails on a
# fresh Restart & Run All. scoring="roc_auc" presumably needs a binary target,
# so one of the is_* columns was probably intended; confirm before relying on it.
from sklearn.utils import shuffle
X_s, y_s = shuffle(X, y)
cross_val_score(knn, X_s, y_s, cv=3, scoring="roc_auc")

In [None]:
# Summary metrics for the current `y_pred`: weighted F1, then the per-class
# one-vs-rest ROC AUC dictionary.
print(f1_score(y_test, y_pred, average='weighted'))
print(roc_auc_score_multiclass(y_test, y_pred, average='weighted'))

### Reference: Audio processing of a single file

In [None]:
# 1. Generate path (directory + recording ID + extension)
path = 'midlands_english_female/' + 'mif_02484_00047480027' + '.wav'

# 2. Import audio file: y is the sample array, sr the sampling rate requested below
y, sr = librosa.load(path, sr=22050)

In [None]:
# 3. Calculate MFCCs
# 3.1 Import wav file (same recording as the previous cell) and show an audio player

path = 'midlands_english_female/mif_02484_00047480027.wav'
y, sr = librosa.load(path, sr=22050)  # 'data are stored at 48 kHz' in publication; loaded here at 22.05 kHz
ipd.Audio(path)

In [None]:
# 3.2 Plot speech waveform

plt.figure(figsize=(14, 5))
# Newer librosa releases replace `waveplot` with `waveshow`; use whichever
# this installation provides so the cell runs on both.
plot_waveform = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
plot_waveform(y, sr=sr);

In [None]:
# 3.3 Calculate MFCCs (librosa's default number of coefficients)

mfccs = librosa.feature.mfcc(y=y, sr=sr)
#mfccs = preprocessing.scale(mfccs, axis=1)  # for use in MVP fig only!

In [None]:
# 3.4 Plot spectrogram

D = librosa.stft(y)  # STFT of y
# Magnitude in dB relative to the loudest bin
S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)

fig, ax = plt.subplots()
img = librosa.display.specshow(S_db, x_axis='time', y_axis='linear', ax=ax)
ax.set(title='Spectrogram')
fig.colorbar(img, ax=ax, format="%+2.f dB")
# Last expression: show an audio player for the same file alongside the figure
ipd.Audio(path)

In [None]:
# 3.5 Plot MFCC spectra

fig, ax = plt.subplots()
# No y_axis='linear' here: that option labels the rows as frequencies in Hz,
# but each row of `mfccs` is a cepstral coefficient, not a frequency bin.
img = librosa.display.specshow(mfccs, x_axis='time', ax=ax)
fig.colorbar(img, ax=ax)
ax.set(title='MFCCs (unscaled)', ylabel='MFCC coefficient');