In [1]:
import librosa
import numpy as np

indir = 'chunks/'  # directory of audio chunks; voice-activity detection was applied beforehand
from os import listdir
from os.path import isfile, join
mypath = 'chunks/'
# Collect the names of the regular files in the chunk directory (sub-directories are dropped).
flist = list(filter(lambda entry: isfile(join(indir, entry)), listdir(indir)))
# Sanity check: number of chunks found and the name of the first one.
print(len(flist), flist[0])

2289 DR1-MCPM0-SA1-00.wav


In [2]:
def get_attributes(fname):
    """Parse the metadata encoded in a chunk file name.

    The part of ``fname`` in front of the first dot is split on dashes,
    e.g. ``DR1-MCPM0-SA1-00.wav`` -> ``DR1``, ``MCPM0``, ``SA1``, ``00``.

    Returns:
        tuple: ``(dialect, gender, speaker_id, sentence_type)`` where
        ``dialect`` is the first field, ``speaker_id`` the second field,
        ``gender`` the first character of the speaker field and
        ``sentence_type`` the first two characters of the third field.

    Raises:
        IndexError: if the name has fewer than three dash-separated fields.
    """
    stem, _, _ = fname.partition('.')
    fields = stem.split('-')
    speaker_id = fields[1]
    return fields[0], speaker_id[0], speaker_id, fields[2][:2]

In [3]:
# speaker_id -> list of (39, n_frames) feature matrices, one per audio chunk
train = {}
test = {}

for fname in flist:
    input_path = join(indir, fname)
    # sr=None keeps the file's native sampling rate (librosa would otherwise resample to ~22 kHz)
    y, sr = librosa.load(input_path, sr=None)

    # Scale the maximum absolute amplitude to 1. An all-zero chunk is left
    # untouched: dividing by a zero peak would produce NaNs that later poison
    # the speaker-level mean/std.
    peak = np.max(np.abs(y))
    processed_data = y / peak if peak > 0 else y

    # TODO: calc VAD (already done)

    # 25 ms analysis window with a 10 ms hop, in samples. Computed once with
    # integer arithmetic so the MFCC and energy tracks are guaranteed to use
    # identical framing (and therefore have the same number of frames).
    win_length = (25 * sr) // 1000
    hop_length = (10 * sr) // 1000

    # https://groups.google.com/forum/#!topic/librosa/V4Z1HpTKn8Q
    mfcc = librosa.feature.mfcc(y=processed_data, sr=sr, n_mfcc=13,
                                n_fft=win_length, hop_length=hop_length)
    # Replace the 0th cepstral coefficient with the frame-level RMS energy.
    # NOTE(review): newer librosa releases expose this as
    # `librosa.feature.rms(y=..., frame_length=...)` - adjust when upgrading.
    mfcc[0] = librosa.feature.rmse(processed_data, hop_length=hop_length, n_fft=win_length)
    mfcc_delta = librosa.feature.delta(mfcc)
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
    # 13 static + 13 delta + 13 delta-delta rows
    features = np.vstack([mfcc, mfcc_delta, mfcc_delta2])

    # split train test: 'SA' sentences form the test set, everything else is training data
    dialect, gender, speaker_id, sentence_type = get_attributes(fname)
    if sentence_type == 'SA':
        test.setdefault(speaker_id, []).append(features)
    else:
        train.setdefault(speaker_id, []).append(features)

In [4]:
# Alphabetically ordered speaker ids, taken from the speakers present in the test split.
ids = sorted(test.keys())
print(ids)

# Map each speaker id to its 0-based position in `ids`; this is the class label.
# TODO: for MATLAB set i+1 (i.e 1 to 200)
idx = {speaker: position for position, speaker in enumerate(ids)}
print(idx)

['MADC0', 'MAEB0', 'MAKB0', 'MAKR0', 'MAPV0', 'MARC0', 'MARW0', 'MBEF0', 'MBGT0', 'MBJV0', 'MBMA0', 'MBWP0', 'MCAL0', 'MCDC0', 'MCDD0', 'MCDR0', 'MCEF0', 'MCEW0', 'MCHL0', 'MCLM0', 'MCPM0', 'MCSS0', 'MCTM0', 'MDAC0', 'MDAS0', 'MDBB1', 'MDBP0', 'MDCD0', 'MDDC0', 'MDEF0', 'MDEM0', 'MDHL0', 'MDHS0', 'MDJM0', 'MDLB0', 'MDLC0', 'MDLC2', 'MDLH0', 'MDMA0', 'MDMT0', 'MDNS0', 'MDPK0', 'MDPS0', 'MDSJ0', 'MDSS0', 'MDSS1', 'MDTB0', 'MDWD0', 'MDWH0', 'MDWM0', 'MEDR0', 'MEFG0', 'MEGJ0', 'MESG0', 'MEWM0', 'MFER0', 'MFMC0', 'MFRM0', 'MFWK0', 'MGAF0', 'MGAG0', 'MGES0', 'MGJC0', 'MGRL0', 'MGRP0', 'MGSH0', 'MGXP0', 'MHIT0', 'MHJB0', 'MHMG0', 'MHMR0', 'MHRM0', 'MILB0', 'MJAC0', 'MJAE0', 'MJBG0', 'MJDA0', 'MJDC0', 'MJDE0', 'MJEB0', 'MJEB1', 'MJEE0', 'MJHI0', 'MJJB0', 'MJJJ0', 'MJKR0', 'MJLB0', 'MJLG1', 'MJLS0', 'MJMA0', 'MJMD0', 'MJMM0', 'MJPM0', 'MJPM1', 'MJRH0', 'MJRH1', 'MJRP0', 'MJSR0', 'MJWS0', 'MJWT0', 'MJXL0', 'MKAH0', 'MKAJ0', 'MKAM0', 'MKDT0', 'MKJO0', 'MKLS0', 'MKLS1', 'MKLW0', 'MKXL0', 'MLBC0', 

In [5]:
def concat(x, win_size=10, hop_size=3):
    """Stack sliding windows of consecutive frames into flat context vectors.

    Args:
        x: 2-D array of shape ``(n_features, n_frames)``.
        win_size: number of consecutive frames per window.
        hop_size: number of frames between the starts of successive windows.

    Returns:
        2-D array of shape ``(n_windows, n_features * win_size)``. Each row
        holds one window flattened frame by frame (all features of the first
        frame, then all features of the second frame, ...). Only windows that
        fit entirely inside ``x`` are produced; when ``x`` is too short for a
        single window the result has shape ``(0, n_features * win_size)``.
    """
    n_features, n_frames = x.shape
    windows = [
        x[:, start:start + win_size].T.flatten()
        for start in range(0, n_frames, hop_size)
        if start + win_size <= n_frames  # drop the trailing partial windows
    ]
    if not windows:
        # Keep the result 2-D so callers can rely on a fixed row width.
        return np.empty((0, n_features * win_size), dtype=x.dtype)
    return np.array(windows)

In [6]:
X_test = []
Y_test = []

from sklearn import preprocessing

# Test set with per-utterance mean/variance normalization (MVN): every
# coefficient track is standardized over the frames of its own utterance.
for speaker_key, utterances in test.items():
    label = idx[speaker_key]
    for utterance in utterances:
        # scale() standardizes columns, hence the transposes around it
        normalized = preprocessing.scale(utterance.T).T
        windows = concat(normalized)
        # one sample per context window, all labelled with the utterance's speaker
        X_test.extend(windows)
        Y_test.extend([label] * len(windows))

X_test = np.array(X_test)
Y_test = np.array(Y_test)

In [7]:
X_train = []
Y_train = []

# Training set with speaker-level mean/variance normalization (SMVN): the
# statistics are pooled over every training frame of a speaker.
for speaker_key, utterances in train.items():
    label = idx[speaker_key]

    # Pool all frames of this speaker, one frame per row, and take the
    # per-coefficient mean and standard deviation.
    pooled = np.array([frame for utterance in utterances for frame in utterance.T])
    spk_mean = pooled.mean(axis=0)
    spk_std = pooled.std(axis=0)

    # Normalize each utterance with the speaker statistics, then cut it
    # into context windows.
    for utterance in utterances:
        normalized = ((utterance.T - spk_mean) / spk_std).T
        windows = concat(normalized)
        X_train.extend(windows)
        Y_train.extend([label] * len(windows))

X_train = np.array(X_train)
Y_train = np.array(Y_train)

In [8]:
# Report (train, test) shapes: first the feature matrices, then the label vectors.
for train_part, test_part in ((X_train, X_test), (Y_train, Y_test)):
    print(train_part.shape, test_part.shape)

(137520, 390) (36091, 390)
(137520,) (36091,)


In [9]:
# shuffle training data
# A fixed random_state makes the permutation (and therefore the SGD training
# run that follows) reproducible across kernel restarts; samples and labels
# are permuted together so they stay aligned.
from sklearn.utils import shuffle
X_train, Y_train = shuffle(X_train, Y_train, random_state=1)

In [11]:
from sklearn.neural_network import MLPClassifier

# Single hidden layer of 200 units trained with SGD and an adaptive learning
# rate. An earlier configuration used hidden_layer_sizes=(100, 100),
# max_iter=400 and the default learning-rate settings.
mlp_settings = dict(
    hidden_layer_sizes=(200,),
    max_iter=1000,
    alpha=1e-4,
    solver='sgd',
    verbose=10,
    tol=1e-4,
    random_state=1,
    learning_rate_init=.01,
    learning_rate='adaptive',
    warm_start=True,
)
mlp = MLPClassifier(**mlp_settings)

mlp.fit(X_train, Y_train)

# Frame-level accuracy on both splits.
train_score = mlp.score(X_train, Y_train)
print("Training set score: %f" % train_score)
test_score = mlp.score(X_test, Y_test)
print("Test set score: %f" % test_score)

Iteration 1, loss = 5.33479681
Iteration 2, loss = 5.02862733
Iteration 3, loss = 4.76368955
Iteration 4, loss = 4.55939396
Iteration 5, loss = 4.38443331
Iteration 6, loss = 4.23295709
Iteration 7, loss = 4.10110612
Iteration 8, loss = 3.98623488
Iteration 9, loss = 3.88781922
Iteration 10, loss = 3.79958155
Iteration 11, loss = 3.72265799
Iteration 12, loss = 3.65547699
Iteration 13, loss = 3.59621347
Iteration 14, loss = 3.54229991
Iteration 15, loss = 3.49344509
Iteration 16, loss = 3.45086983
Iteration 17, loss = 3.41007972
Iteration 18, loss = 3.37302821
Iteration 19, loss = 3.33829433
Iteration 20, loss = 3.30578820
Iteration 21, loss = 3.27558155
Iteration 22, loss = 3.24771529
Iteration 23, loss = 3.22157143
Iteration 24, loss = 3.19363001
Iteration 25, loss = 3.16971427
Iteration 26, loss = 3.14725079
Iteration 27, loss = 3.12507760
Iteration 28, loss = 3.10336494
Iteration 29, loss = 3.08484877
Iteration 30, loss = 3.06571020
Iteration 31, loss = 3.04344551
Iteration 32, los

Iteration 253, loss = 0.64151141
Iteration 254, loss = 0.63448638
Iteration 255, loss = 0.62910988
Iteration 256, loss = 0.62652049
Iteration 257, loss = 0.61842762
Iteration 258, loss = 0.61333833
Iteration 259, loss = 0.60690125
Iteration 260, loss = 0.60313495
Iteration 261, loss = 0.59728118
Iteration 262, loss = 0.59204846
Iteration 263, loss = 0.58566073
Iteration 264, loss = 0.58086823
Iteration 265, loss = 0.57616199
Iteration 266, loss = 0.57135042
Iteration 267, loss = 0.56530148
Iteration 268, loss = 0.56007698
Iteration 269, loss = 0.55609134
Iteration 270, loss = 0.55149760
Iteration 271, loss = 0.54760554
Iteration 272, loss = 0.54102459
Iteration 273, loss = 0.53677334
Iteration 274, loss = 0.53243936
Iteration 275, loss = 0.52863505
Iteration 276, loss = 0.52289378
Iteration 277, loss = 0.51753811
Iteration 278, loss = 0.51535562
Iteration 279, loss = 0.51028957
Iteration 280, loss = 0.50741809
Iteration 281, loss = 0.50178407
Iteration 282, loss = 0.49587421
Iteration 

Iteration 498, loss = 0.08396361
Iteration 499, loss = 0.08303653
Iteration 500, loss = 0.08254243
Iteration 501, loss = 0.08217942
Iteration 502, loss = 0.08188091
Iteration 503, loss = 0.08169943
Iteration 504, loss = 0.08154065
Iteration 505, loss = 0.08144026
Iteration 506, loss = 0.08125897
Iteration 507, loss = 0.08113643
Iteration 508, loss = 0.08107832
Iteration 509, loss = 0.08096414
Iteration 510, loss = 0.08088313
Iteration 511, loss = 0.08066396
Iteration 512, loss = 0.08053013
Iteration 513, loss = 0.08049206
Iteration 514, loss = 0.08039910
Iteration 515, loss = 0.08032497
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Setting learning rate to 0.000400
Iteration 516, loss = 0.07732643
Iteration 517, loss = 0.07692767
Iteration 518, loss = 0.07685091
Iteration 519, loss = 0.07680810
Iteration 520, loss = 0.07680314
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Setting learning rate to 0.000080
Iteration 5

In [12]:
def _as_saveable(arrays):
    """Turn a list of arrays into a single array that np.save accepts.

    Equally shaped arrays are stacked into one numeric array. Arrays of
    differing shapes (e.g. the per-layer weight matrices of an MLP) are packed
    into a 1-D object array with one slot per input array, because recent
    NumPy versions refuse to build an array from a ragged sequence implicitly.
    """
    arrays = list(arrays)
    if len({np.shape(arr) for arr in arrays}) <= 1:
        return np.array(arrays)
    packed = np.empty(len(arrays), dtype=object)
    for slot, arr in enumerate(arrays):
        packed[slot] = arr
    return packed


# Persist the trained network parameters. Object arrays are pickled inside the
# .npy file, so load those with np.load(..., allow_pickle=True).
np.save('coefs_smvn_train_mvn_test', _as_saveable(mlp.coefs_))
np.save('intercepts_smvn_train_mvn_test', _as_saveable(mlp.intercepts_))

In [None]:
# Keep references to the current test arrays under new names (plain aliases, no copy).
X_test2, Y_test2 = X_test, Y_test

In [25]:
X_test = []
Y_test = []

# Rebuild the test set with speaker-level normalization (SMVN): each test
# utterance is normalized with the mean/std pooled over the TRAINING frames
# of the same speaker.
# NOTE(review): the statistics are selected by the utterance's true speaker,
# i.e. by the label being predicted - scores on this set use label information.
for speaker_id_str, train_utterances in train.items():
    speaker_id = idx[speaker_id_str]

    # speaker-level mean and std over all training frames (one frame per row)
    pooled = np.array([frame for utterance in train_utterances for frame in utterance.T])
    spk_mean = pooled.mean(axis=0)
    spk_std = pooled.std(axis=0)

    # normalize this speaker's test utterances and cut them into context windows
    for utterance in test[speaker_id_str]:
        normalized = ((utterance.T - spk_mean) / spk_std).T
        windows = concat(normalized)
        X_test.extend(windows)
        Y_test.extend([speaker_id] * len(windows))

X_test = np.array(X_test)
Y_test = np.array(Y_test)

In [26]:
# Frame-level accuracy of the trained MLP on the current X_test / Y_test.
frame_accuracy = mlp.score(X_test, Y_test)
print("Test set score: %f" % frame_accuracy)

Test set score: 0.707323


In [28]:
# segment acc
# Utterance-level accuracy: every context window of a test utterance is
# classified and the most frequent prediction becomes the utterance's label.
from scipy import stats

y_true = []
y_pred = []

# NOTE(review): each test utterance is normalized with the training statistics
# of its true speaker, i.e. the label being predicted is used to pick the
# normalization.
for speaker_id_str, feature_list in train.items():
    speaker_id = idx[speaker_id_str]

    # calc speaker level mean and std over all training frames (one frame per row)
    data = np.array([frame for features in feature_list for frame in features.T])
    mean = data.mean(axis=0)
    std = data.std(axis=0)

    # speaker level normalize, then majority vote over the windows
    for features in test[speaker_id_str]:
        features = ((features.T - mean)/std).T
        frames = concat(features)
        votes = mlp.predict(frames)
        # stats.mode returns a length-1 array on older SciPy and a scalar on
        # newer releases; atleast_1d makes the indexing valid for both.
        pred = np.atleast_1d(stats.mode(votes).mode)[0]
        y_true.append(speaker_id)
        y_pred.append(pred)
print(sum(np.array(y_true) == np.array(y_pred))/len(y_true))

1.0
