In [None]:
import pandas as pd
import numpy as np

import librosa
import librosa.display

from keras import layers
from keras.models import Sequential

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report

import os
import warnings

warnings.filterwarnings('ignore')

from IPython.display import HTML, display
import time

In [None]:
def progress(value, max=100):
    """Build an HTML ``<progress>`` bar followed by the current value as text.

    Args:
        value: Current progress; used for the bar's ``value`` attribute and
            also rendered as plain text next to the bar.
        max: Value at which the bar is full (defaults to 100). The name
            shadows the builtin but is kept so keyword callers keep working.

    Returns:
        An ``IPython.display.HTML`` object wrapping the generated markup.
    """
    # HTML attributes are separated by whitespace only; the comma that used to
    # follow the ``max`` attribute was invalid markup.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
            {value}
        </progress>
        <a>{value}</a>
    """.format(value=value, max=max))

In [None]:
# Load the speaker metadata indexed by speaker id, discard the columns at
# positions 8-11, and replace every missing value with the literal string 'NaN'.
data = (
    pd.read_csv(
        '/kaggle/input/speech-accent-archive/speakers_all.csv',
        index_col='speakerid',
    )
    .pipe(lambda frame: frame.drop(columns=frame.columns[8:12]))
    .fillna('NaN')
)

In [None]:
# Number of speaker rows in the metadata table.
len(data.index)

In [None]:
%%time
DataSet = pd.DataFrame()
tmp = {}
progressBar = display(progress(0, 100), display_id=True)

for index, row in data.iterrows():
    if os.path.isfile('/kaggle/input/speech-accent-archive/recordings/recordings/' + row['filename'] +'.mp3') == False:
        print('file ' + str(row['filename']) + '.mp3 doesnt exists')
        data = data.drop([index])
        continue

    tmp['filename'] = row['filename']
    tmp['country'] = row['country']

    y, sr = librosa.load('/kaggle/input/speech-accent-archive/recordings/recordings/' + row['filename'] +'.mp3')
    tmp['rms'] = np.mean(librosa.feature.rms(y=y))
    tmp['chroma_stft'] = np.mean(librosa.feature.chroma_stft(y=y, sr=sr))
    tmp['spec_cent'] = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    tmp['spec_bw'] = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    tmp['rolloff'] = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
    tmp['zcr'] = np.mean(librosa.feature.zero_crossing_rate(y))
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    i = 0
    for e in mfcc:
        tmp['mfcc'+str(i)] = np.mean(e)
        i += 1
    DataSet = DataSet.append([tmp])
    progressBar.update(progress(DataSet.shape[0]/data.shape[0]*100, 100))

In [None]:
# Persist the extracted features so later cells can reload them from disk.
# index=False: the row index is not a feature, and writing it would make it
# come back from read_csv as an extra 'Unnamed: 0' column.
DataSet.to_csv('/kaggle/working/dataset.csv', index=False)

In [None]:
# Reload the feature table from the CSV in the Kaggle working directory.
data_to_fit = pd.read_csv(filepath_or_buffer='/kaggle/working/dataset.csv')

In [None]:
# Balance the classes: keep only countries with enough recordings, then
# randomly discard part of the over-represented ones.
MIN_RECORDINGS_PER_COUNTRY = 35
# country -> fraction of that country's rows to throw away
UNDERSAMPLE_FRACTIONS = {
    'usa': .90,
    'china': .45,
    'india': .4,
    'canada': .1,
    'south korea': .1,
    'uk': .4,
}
# Fixed seed so a fresh Restart & Run All discards the same rows every time.
UNDERSAMPLE_SEED = 42

freq = data_to_fit['country'].value_counts()
frequent_values = freq[freq >= MIN_RECORDINGS_PER_COUNTRY].index
data_to_fit = data_to_fit[data_to_fit['country'].isin(frequent_values)]

for country, drop_fraction in UNDERSAMPLE_FRACTIONS.items():
    discarded = data_to_fit[data_to_fit['country'] == country].sample(
        frac=drop_fraction, random_state=UNDERSAMPLE_SEED
    )
    data_to_fit = data_to_fit.drop(discarded.index)

# Resulting class sizes.
data_to_fit['country'].value_counts()

In [None]:
# Swap the country names for integer class codes; `le` keeps the mapping so
# the codes can be translated back to names later.
le = preprocessing.LabelEncoder()
country_names = data_to_fit['country'].astype(str)
data_to_fit['country'] = le.fit(country_names).transform(country_names)

In [None]:
# Distinct integer class codes present after encoding.
data_to_fit['country'].unique()

In [None]:
# Translate the integer codes back to country names as a sanity check on the
# encoding.
joja = pd.Series(le.inverse_transform(data_to_fit['country']))

# Display the distinct names. The list literal that used to follow this call
# was evaluated as a subscript on the returned array and raised IndexError.
joja.unique()

In [None]:
# Hold out 20% of the rows for evaluation.
SPLIT_SEED = 42  # fixed so the split is identical on every run

# Everything except the label and the file name is a model input. A stray
# 'Unnamed: 0' column (a saved row index read back from the CSV) is not a
# feature, so drop it if it is present.
feature_frame = (
    data_to_fit
    .drop(columns=['country', 'filename'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

Xtrn, Xtst, Ytrn, Ytst = train_test_split(
    feature_frame,
    data_to_fit['country'],
    test_size=0.2,
    # Keep the class proportions the same in the train and test parts.
    stratify=data_to_fit['country'],
    random_state=SPLIT_SEED,
)

In [None]:
%%time
model = Sequential()
model.add(layers.Dense(256, activation='relu', input_shape=(Xtrn.shape[1],)))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
classifier = model.fit(Xtrn,
                    Ytrn,
                    epochs=1000,
                    batch_size=128)

In [None]:
# Per-class precision / recall / F1 on the held-out rows.
# Sequential.predict_classes no longer exists in current Keras, so take the
# arg-max of the predicted class probabilities instead.
predicted_codes = np.argmax(model.predict(Xtst), axis=1)

# Class names come from the encoder so each report row is labelled with the
# country its integer code really stands for; a hand-written list can drift
# out of the encoder's order. Passing `labels` keeps the rows aligned with
# those names even when a class has no test rows.
class_codes = np.arange(len(le.classes_))
print(classification_report(Ytst,
                            predicted_codes,
                            labels=class_codes,
                            target_names=le.classes_))