Import packages

In [55]:
import pandas as pd
import numpy as np
import seaborn as sns
import dython as dy
from os.path import join
import scipy.io.wavfile as wav
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import csv

Spectrogram feature-extraction function

In [52]:
def spect(arr, fs, n_sub_r,n_sub_c):
    """Turn each signal into a fixed-length vector of block-averaged spectrogram values.

    A spectrogram is computed for every signal. Its rows (frequency axis) are
    split into ``n_sub_r`` groups, each group's columns (time axis) are split
    into ``n_sub_c`` groups, and the mean of every resulting block becomes one
    feature.

    Parameters
    ----------
    arr : sequence of array-like
        One sample sequence per recording.
    fs : sequence of numbers
        Sampling frequency of each recording, index-aligned with ``arr``.
    n_sub_r : int
        Number of blocks along the spectrogram's row axis.
    n_sub_c : int
        Number of blocks along the spectrogram's column axis.

    Returns
    -------
    list of list
        One list of ``n_sub_r * n_sub_c`` block means per recording. A block
        that ends up empty (fewer spectrogram columns than ``n_sub_c``) has a
        NaN mean.

    Notes
    -----
    The spectrogram is computed with ``matplotlib.mlab.specgram``, the numerical
    routine that ``plt.specgram`` delegates to, so no image is drawn onto the
    current figure for each recording. The returned values are on a linear
    scale: the ``scale='dB'`` option of ``plt.specgram`` only affects the
    rendered image, never the returned matrix.
    """
    # Local import keeps the function self-contained; matplotlib is already a
    # dependency of this notebook.
    from matplotlib import mlab

    spect_list = []
    for i in range(len(arr)):
        # NFFT / noverlap are the defaults plt.specgram applies when none are given.
        spectrum, _freqs, _times = mlab.specgram(x=arr[i], Fs=fs[i], NFFT=256, noverlap=128)
        row_blocks = np.array_split(spectrum, n_sub_r, axis=0)
        features = []
        for row_block in row_blocks:
            for block in np.array_split(row_block, n_sub_c, axis=1):
                features.append(block.mean())
        spect_list.append(features)
    print(np.array(spect_list).shape)
    return spect_list

In [53]:
def data_import(path,par='dev'):
    """Load the WAV recordings listed in a metadata CSV.

    Parameters
    ----------
    path : str
        CSV file with a ``path`` column holding one WAV file name per row and,
        when ``par == 'dev'``, ``action`` and ``object`` columns as well.
    par : str
        ``'dev'`` to also build labels; any other value loads audio only.

    Returns
    -------
    tuple
        ``(X, y, rate_list)`` when ``par == 'dev'``, otherwise ``(X, rate_list)``.
        ``X`` holds the sample arrays, ``rate_list`` the sampling rates and ``y``
        the concatenation ``action + object`` for every row.
    """
    table = pd.read_csv(path)

    X, rate_list = [], []
    for filename in table['path']:
        rate, samples = wav.read(filename)
        rate_list.append(rate)
        X.append(samples)

    if par != 'dev':
        return X, rate_list

    # The label is the action and object strings joined with no separator.
    y = [action + obj for action, obj in zip(table['action'], table['object'])]
    return X, y, rate_list

Import data

In [None]:
# Load the development metadata and give the verbose questionnaire columns short names.
df_dev = pd.read_csv('development.csv')
df_dev = df_dev.rename(columns={'Self-reported fluency level ': 'fluency', 'First Language spoken': 'firstLan', 'Current language used for work/school': 'currentLan'})

# Store the speaker attributes as pandas categoricals so their integer codes
# are available through `.cat.codes` below.
speaker_cols = ['fluency', 'firstLan', 'currentLan', 'gender', 'ageRange']
df_dev = df_dev.astype({col: 'category' for col in speaker_cols})

# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would add a stray "None" line).
df_dev.info()
for col in ['fluency', 'firstLan', 'currentLan', 'ageRange']:
    print(df_dev[col].unique())


# Pairwise correlation between the category codes of the language-related attributes.
# NOTE(review): category codes carry an arbitrary ordering, so Pearson correlation on
# them is only a rough indicator of association between nominal variables.
print(df_dev['firstLan'].cat.codes.corr(df_dev['currentLan'].cat.codes))
print(df_dev['fluency'].cat.codes.corr(df_dev['firstLan'].cat.codes))
print(df_dev['fluency'].cat.codes.corr(df_dev['currentLan'].cat.codes))

df_corr = df_dev[['firstLan','fluency','currentLan','ageRange','gender']]
df_corr.info()
# Build a new frame of category codes instead of assigning columns on a slice of
# df_dev, which raises SettingWithCopyWarning.
df_corr = df_corr.apply(lambda col: col.cat.codes)

sns.heatmap(df_corr.corr(),vmin=-1, vmax=1, cmap='BrBG', annot=True)


In [54]:
# Read every recording referenced by the two metadata files. The development
# split also yields labels; the evaluation split has none.
X_dev_list, y_dev_list, rate_dev = data_import('development.csv', par='dev')
X_ev_list, rate_ev = data_import('evaluation.csv', par='ev')

# Number of recordings in each split, one per line.
print(len(X_dev_list), len(X_ev_list), sep='\n')

9854
1455


Pre-processing

In [None]:
# Pooling grid applied to every spectrogram: rows x columns blocks per recording.
n_sub_row = 10
n_sub_col = 5

spectX_dev_list = spect(arr=X_dev_list, fs=rate_dev, n_sub_r=n_sub_row, n_sub_c=n_sub_col)
spectX_ev_list = spect(arr=X_ev_list, fs=rate_ev, n_sub_r=n_sub_row, n_sub_c=n_sub_col)

# Labels as a single-column frame.
y_dev = pd.DataFrame(np.asarray(y_dev_list))

# Missing block means are replaced by 0, then sklearn's `normalize` rescales
# every sample (row) with its default settings.
X_dev = normalize(pd.DataFrame(spectX_dev_list).fillna(0))
X_ev = normalize(pd.DataFrame(spectX_ev_list).fillna(0))


In [None]:
# Hold out 20% of the development data as a final test split. The fixed seed keeps
# the split, and every score derived from it, identical across kernel restarts.
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X_dev, y_dev, test_size=0.2, random_state=42)

Grid-search

In [None]:
# Hyper-parameter grid explored for the random forest.
params = {
    "n_estimators": [50, 100, 200, 500],
    "criterion": ["gini", "entropy"]
}

n = 5
kf = KFold(n)
# Flatten the single-column label frame once instead of on every fold.
y_train_valid_flat = y_train_valid.values.ravel()

for config in ParameterGrid(params):
    sum1 = 0
    sum2 = 0
    for train_indices, validation_indices in kf.split(X_train_valid):
        X_train = X_train_valid[train_indices]
        X_valid = X_train_valid[validation_indices]
        y_train = y_train_valid_flat[train_indices]
        y_valid = y_train_valid_flat[validation_indices]
        # A fixed seed makes configurations comparable (differences come from the
        # hyper-parameters, not from forest randomness); n_jobs=-1 trains the trees
        # on all cores.
        rf = RandomForestClassifier(**config, random_state=42, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_prev = rf.predict(X_valid)
        score1 = f1_score(y_true=y_valid, y_pred=y_prev,average='macro')
        score2 = accuracy_score(y_true=y_valid, y_pred=y_prev)
        sum1 += score1
        sum2 += score2
    # Mean macro-F1, mean accuracy and the configuration that produced them.
    print(float(sum1 / n), float(sum2 / n), " -> ", config)


Test

In [None]:
# Refit the selected configuration on the full train+validation data and score it on
# the held-out test split. The seed makes the reported scores reproducible.
rf = RandomForestClassifier(n_estimators=500, criterion='entropy', random_state=42, n_jobs=-1)
rf.fit(X_train_valid, y_train_valid.values.ravel())
y_prev = rf.predict(X_test)
score1 = f1_score(y_true=y_test, y_pred=y_prev, average='macro')
score2 = accuracy_score(y_true=y_test, y_pred=y_prev)
# Show the results: macro-F1 first, accuracy second.
print(score1, score2)

Evaluation

In [None]:
# Final model: trained on the whole development set and used to label the evaluation
# set. The seed makes the generated predictions reproducible.
rf = RandomForestClassifier(n_estimators=500, criterion='entropy', random_state=42, n_jobs=-1)
rf.fit(X_dev, y_dev.values.ravel())
y_prev_ev = rf.predict(X_ev)

Generate results

In [None]:
# Write the submission file: a header row, then one (Id, Predicted) row per
# evaluation recording, with Id counting up from 0 in prediction order.
with open('res.csv', mode='w', newline='') as file:
    file_writer = csv.writer(file)
    file_writer.writerow(['Id', 'Predicted'])
    file_writer.writerows(enumerate(y_prev_ev))