In [1]:
import librosa
import numpy as np

indir = 'chunks/'  # directory of utterance chunks (VAD already applied)
from os import listdir
from os.path import isfile, join
mypath = 'chunks/'

# Keep only regular files from the input directory (sub-directories are ignored).
flist = []
for entry in listdir(indir):
    if isfile(join(indir, entry)):
        flist.append(entry)

# Sanity check: number of files found and the first file name.
print(len(flist), flist[0])

2289 DR1-MCPM0-SA1-00.wav


In [2]:
def get_attributes(fname):
    """Parse a chunk file name into its metadata fields.

    The part of ``fname`` before the first '.' is split on '-'; the first
    three fields are interpreted as dialect, speaker id and sentence name,
    e.g. 'DR1-MCPM0-SA1-00.wav'.

    Returns:
        A tuple ``(dialect, gender, speaker_id, sentence_type)`` where
        ``gender`` is the first character of the speaker id and
        ``sentence_type`` is the first two characters of the sentence name.

    Raises:
        IndexError: if the name has fewer than three '-'-separated fields
            or an empty speaker field.
    """
    stem = fname.partition('.')[0]
    fields = stem.split('-')
    speaker_id = fields[1]
    return fields[0], speaker_id[0], speaker_id, fields[2][:2]

In [3]:
# Quick check of the file-name parser on a known chunk name.
example_attributes = get_attributes('DR1-MCPM0-SA1-00.wav')
print(example_attributes)

('DR1', 'M', 'MCPM0', 'SA')


In [15]:
# Per-speaker lists of feature matrices, keyed by speaker id.
train = {}
test = {}

for fname in flist:
    input_path = join(indir, fname)
    # sr=None keeps the file's original sampling rate (otherwise it is converted to ~22K)
    y, sr = librosa.load(input_path, sr=None)

    if y.size == 0:
        # Nothing to analyse; max() of an empty signal would raise below.
        print('skipping empty file:', fname)
        continue

    # Scale the maximum absolute amplitude to 1. An all-zero signal is left
    # untouched, since dividing by a zero peak would fill it with NaN.
    peak = np.max(np.abs(y))
    processed_data = y / peak if peak > 0 else y

    # TODO: calc VAD

    # 25 ms analysis window with a 10 ms hop. Both feature calls share the same
    # integer values so their frame counts always agree (mixing integer and
    # floating-point arithmetic could differ by one sample for some rates).
    n_fft = (25*sr)//1000
    hop_length = (10*sr)//1000

    # https://groups.google.com/forum/#!topic/librosa/V4Z1HpTKn8Q
    mfcc = librosa.feature.mfcc(y=processed_data, sr=sr, n_mfcc=13, n_fft=n_fft, hop_length=hop_length)
    # Replace the 0th cepstral coefficient with the frame RMS energy.
    # NOTE(review): newer librosa releases appear to expose this as `rms` with a
    # different keyword for the frame size -- confirm against the installed version.
    mfcc[0] = librosa.feature.rmse(processed_data, hop_length=hop_length, n_fft=n_fft)
    mfcc_delta = librosa.feature.delta(mfcc)
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
    # Stack static, delta and delta-delta coefficients row-wise.
    features = np.vstack([mfcc, mfcc_delta, mfcc_delta2])

    # Split train/test: 'SA' sentences form the test set, everything else trains.
    _, _, speaker_id, sentence_type = get_attributes(fname)
    if sentence_type == 'SA':
        test.setdefault(speaker_id, []).append(features)
    else:
        train.setdefault(speaker_id, []).append(features)

In [16]:
# Speaker ids in alphabetical order.
ids = sorted(test.keys())
print(ids)

# Map each speaker id to its 0-based position in the sorted list.
# TODO: for MATLAB use position+1 (i.e 1 to 200)
idx = {speaker: position for position, speaker in enumerate(ids)}
print(idx)

['MADC0', 'MAEB0', 'MAKB0', 'MAKR0', 'MAPV0', 'MARC0', 'MARW0', 'MBEF0', 'MBGT0', 'MBJV0', 'MBMA0', 'MBWP0', 'MCAL0', 'MCDC0', 'MCDD0', 'MCDR0', 'MCEF0', 'MCEW0', 'MCHL0', 'MCLM0', 'MCPM0', 'MCSS0', 'MCTM0', 'MDAC0', 'MDAS0', 'MDBB1', 'MDBP0', 'MDCD0', 'MDDC0', 'MDEF0', 'MDEM0', 'MDHL0', 'MDHS0', 'MDJM0', 'MDLB0', 'MDLC0', 'MDLC2', 'MDLH0', 'MDMA0', 'MDMT0', 'MDNS0', 'MDPK0', 'MDPS0', 'MDSJ0', 'MDSS0', 'MDSS1', 'MDTB0', 'MDWD0', 'MDWH0', 'MDWM0', 'MEDR0', 'MEFG0', 'MEGJ0', 'MESG0', 'MEWM0', 'MFER0', 'MFMC0', 'MFRM0', 'MFWK0', 'MGAF0', 'MGAG0', 'MGES0', 'MGJC0', 'MGRL0', 'MGRP0', 'MGSH0', 'MGXP0', 'MHIT0', 'MHJB0', 'MHMG0', 'MHMR0', 'MHRM0', 'MILB0', 'MJAC0', 'MJAE0', 'MJBG0', 'MJDA0', 'MJDC0', 'MJDE0', 'MJEB0', 'MJEB1', 'MJEE0', 'MJHI0', 'MJJB0', 'MJJJ0', 'MJKR0', 'MJLB0', 'MJLG1', 'MJLS0', 'MJMA0', 'MJMD0', 'MJMM0', 'MJPM0', 'MJPM1', 'MJRH0', 'MJRH1', 'MJRP0', 'MJSR0', 'MJWS0', 'MJWT0', 'MJXL0', 'MKAH0', 'MKAJ0', 'MKAM0', 'MKDT0', 'MKJO0', 'MKLS0', 'MKLS1', 'MKLW0', 'MKXL0', 'MLBC0', 

In [17]:
def concat(x, win_size=10, hop_size=3):
    """Slice a feature matrix into overlapping windows of stacked columns.

    Args:
        x: 2-D array whose columns are consecutive frames.
        win_size: number of consecutive columns per window.
        hop_size: column step between the starts of successive windows.

    Returns:
        A 2-D array with one row per window. Each row holds the window's
        columns laid end to end (column 0 first), i.e. ``rows * win_size``
        values. Only windows that fit entirely inside ``x`` are produced;
        when none fits the result has shape ``(0, rows * win_size)`` so
        callers can still treat it as a 2-D array.
    """
    n_rows, n_cols = x.shape
    windows = [
        x[:, start:start + win_size].T.flatten()
        for start in range(0, n_cols, hop_size)
        if start + win_size <= n_cols
    ]
    if not windows:
        # np.array([]) would be 1-D; keep the column count explicit instead.
        return np.empty((0, n_rows * win_size), dtype=x.dtype)
    return np.array(windows)

In [18]:
X_test = []
Y_test = []

# mvn of test: every utterance is normalised with its own mean and standard
# deviation, each a single scalar taken over the whole feature matrix.
for speaker_id_str, feature_list in test.items():
    speaker_id = idx[speaker_id_str]
    for features in feature_list:
        normalised = (features - features.mean())/features.std()
        frames = concat(normalised)
        # One sample per window, all labelled with the speaker's class index.
        X_test.extend(frames)
        Y_test.extend([speaker_id] * len(frames))

X_test = np.array(X_test)
Y_test = np.array(Y_test)

In [19]:
X_train = []
Y_train = []

# smvn of train: normalise with statistics pooled over all of a speaker's
# training utterances.
for speaker_id_str, feature_list in train.items():
    speaker_id = idx[speaker_id_str]

    # Speaker-level mean and std: stack every frame (one row per frame) of
    # every utterance, then reduce to one scalar mean and one scalar std.
    data = np.concatenate([features.T for features in feature_list], axis=0)
    mean = data.mean()
    std = data.std()

    # Apply the speaker-level statistics to each utterance and window it.
    for features in feature_list:
        frames = concat((features - mean)/std)
        X_train.extend(frames)
        Y_train.extend([speaker_id] * len(frames))

X_train = np.array(X_train)
Y_train = np.array(Y_train)

In [20]:
# Sample-matrix shapes: (number of windows, values per window).
train_shape, test_shape = X_train.shape, X_test.shape
print(train_shape, test_shape)

(137520, 390) (36091, 390)


In [21]:
# Label-vector shapes; lengths should match the row counts of X_train / X_test.
train_label_shape, test_label_shape = Y_train.shape, Y_test.shape
print(train_label_shape, test_label_shape)

(137520,) (36091,)


In [22]:
# Shuffle training samples and labels with the same permutation.
# A fixed random_state makes the permutation -- and therefore the scores of
# every training cell that follows -- reproducible across kernel restarts.
from sklearn.utils import shuffle
X_train, Y_train = shuffle(X_train, Y_train, random_state=0)

In [23]:
#import scipy.io as spio
#spio.savemat('dataset.mat', dict(X=X_train, y=Y_train, X_test=X_test, Y_test=Y_test))

In [46]:
from sklearn.neural_network import MLPClassifier

# Earlier configuration, kept for reference:
#   MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
#                 solver='sgd', verbose=10, tol=1e-4, random_state=1)

# Single hidden layer of 200 units trained with plain SGD.
mlp = MLPClassifier(
    solver='sgd',
    hidden_layer_sizes=(200,),
    learning_rate_init=.01,
    alpha=1e-4,
    max_iter=250,
    tol=1e-4,
    random_state=1,
    verbose=10,
)

mlp.fit(X_train, Y_train)

# Window-level accuracy on the training and the held-out test samples.
train_score = mlp.score(X_train, Y_train)
print("Training set score: %f" % train_score)
test_score = mlp.score(X_test, Y_test)
print("Test set score: %f" % test_score)

Iteration 1, loss = 4.80127068
Iteration 2, loss = 3.98026314
Iteration 3, loss = 3.48712526
Iteration 4, loss = 3.18047052
Iteration 5, loss = 2.95978835
Iteration 6, loss = 2.78143188
Iteration 7, loss = 2.63496096
Iteration 8, loss = 2.50725815
Iteration 9, loss = 2.39880453
Iteration 10, loss = 2.30204915
Iteration 11, loss = 2.21552836
Iteration 12, loss = 2.13971171
Iteration 13, loss = 2.06951789
Iteration 14, loss = 2.00561584
Iteration 15, loss = 1.94839300
Iteration 16, loss = 1.89491911
Iteration 17, loss = 1.84475175
Iteration 18, loss = 1.79838973
Iteration 19, loss = 1.75584822
Iteration 20, loss = 1.71518395
Iteration 21, loss = 1.67821373
Iteration 22, loss = 1.64476982
Iteration 23, loss = 1.61129456
Iteration 24, loss = 1.58070895
Iteration 25, loss = 1.55154822
Iteration 26, loss = 1.52238422
Iteration 27, loss = 1.49775597
Iteration 28, loss = 1.47268344
Iteration 29, loss = 1.44909761
Iteration 30, loss = 1.42657174
Iteration 31, loss = 1.40567778
Iteration 32, los



Training set score: 0.855432
Test set score: 0.234518


In [59]:
# Continue training from the weights of `mlp` with a smaller learning rate.
# A fresh estimator is created and its fitted-state attributes are filled in
# by hand; with warm_start=True, fit() then resumes from these values.
mlp2 = MLPClassifier(hidden_layer_sizes=(200,), max_iter=50, alpha=1e-4,
                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
                    learning_rate_init=.005, warm_start=True)

# _initialize: counters and network layout, taken from the previous model
mlp2.n_iter_ = 0
mlp2.t_ = 0
mlp2.n_outputs_ = mlp.n_outputs_

# Number of layers
mlp2.n_layers_ = mlp.n_layers_

# Output for multi class
mlp2.out_activation_ = mlp.out_activation_

# Seed coefficient and intercept layers from the previous model. The arrays
# are copied rather than shared: assigning the lists directly would make both
# estimators point at the same arrays, so training mlp2 would also change the
# weights seen through `mlp` (the SGD update presumably works in place --
# verify for the installed scikit-learn).
mlp2.coefs_ = [weights.copy() for weights in mlp.coefs_]
mlp2.intercepts_ = [bias.copy() for bias in mlp.intercepts_]

# Fresh loss history and convergence bookkeeping for this stage
mlp2.loss_curve_ = []
mlp2._no_improvement_count = 0
if mlp2.early_stopping:
    mlp2.validation_scores_ = []
    mlp2.best_validation_score_ = -np.inf
else:
    mlp2.best_loss_ = np.inf

mlp2.fit(X_train, Y_train)
print("Training set score: %f" % mlp2.score(X_train, Y_train))
print("Test set score: %f" % mlp2.score(X_test, Y_test))

Iteration 1, loss = 0.42084375
Iteration 2, loss = 0.41735282
Iteration 3, loss = 0.41710122
Iteration 4, loss = 0.41580144
Iteration 5, loss = 0.41462085
Iteration 6, loss = 0.41449071
Iteration 7, loss = 0.41279013
Iteration 8, loss = 0.41316098
Iteration 9, loss = 0.41211435
Iteration 10, loss = 0.41102799
Iteration 11, loss = 0.41045000
Iteration 12, loss = 0.40972863
Iteration 13, loss = 0.40917248
Iteration 14, loss = 0.40845507
Iteration 15, loss = 0.40751720
Iteration 16, loss = 0.40703911
Iteration 17, loss = 0.40587769
Iteration 18, loss = 0.40584536
Iteration 19, loss = 0.40546943
Iteration 20, loss = 0.40436155
Iteration 21, loss = 0.40371829
Iteration 22, loss = 0.40340978
Iteration 23, loss = 0.40292178
Iteration 24, loss = 0.40235717
Iteration 25, loss = 0.40191976
Iteration 26, loss = 0.40017925
Iteration 27, loss = 0.40077131
Iteration 28, loss = 0.39950126
Iteration 29, loss = 0.39855091
Iteration 30, loss = 0.39896559
Iteration 31, loss = 0.39883502
Iteration 32, los



Training set score: 0.895797
Test set score: 0.237455


In [60]:
# The model trained in the previous stage becomes the starting point.
mlp = mlp2

# Continue training from the weights of `mlp`, now with an adaptive learning
# rate schedule. A fresh estimator is created and its fitted-state attributes
# are filled in by hand; with warm_start=True, fit() resumes from these values.
mlp2 = MLPClassifier(hidden_layer_sizes=(200,), max_iter=50, alpha=1e-4,
                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
                    learning_rate_init=.005, warm_start=True, learning_rate='adaptive')

# _initialize: counters and network layout, taken from the previous model
mlp2.n_iter_ = 0
mlp2.t_ = 0
mlp2.n_outputs_ = mlp.n_outputs_

# Number of layers
mlp2.n_layers_ = mlp.n_layers_

# Output for multi class
mlp2.out_activation_ = mlp.out_activation_

# Seed coefficient and intercept layers from the previous model. The arrays
# are copied rather than shared: assigning the lists directly would make both
# estimators point at the same arrays, so training mlp2 would also change the
# weights seen through `mlp` (the SGD update presumably works in place --
# verify for the installed scikit-learn).
mlp2.coefs_ = [weights.copy() for weights in mlp.coefs_]
mlp2.intercepts_ = [bias.copy() for bias in mlp.intercepts_]

# Fresh loss history and convergence bookkeeping for this stage
mlp2.loss_curve_ = []
mlp2._no_improvement_count = 0
if mlp2.early_stopping:
    mlp2.validation_scores_ = []
    mlp2.best_validation_score_ = -np.inf
else:
    mlp2.best_loss_ = np.inf

mlp2.fit(X_train, Y_train)
print("Training set score: %f" % mlp2.score(X_train, Y_train))
print("Test set score: %f" % mlp2.score(X_test, Y_test))

Iteration 1, loss = 0.38710545
Iteration 2, loss = 0.38660115
Iteration 3, loss = 0.38718149
Iteration 4, loss = 0.38643883
Iteration 5, loss = 0.38527870
Iteration 6, loss = 0.38561249
Iteration 7, loss = 0.38436126
Iteration 8, loss = 0.38478164
Iteration 9, loss = 0.38437101
Iteration 10, loss = 0.38372001
Iteration 11, loss = 0.38309909
Iteration 12, loss = 0.38263756
Iteration 13, loss = 0.38233116
Iteration 14, loss = 0.38170075
Iteration 15, loss = 0.38102729
Iteration 16, loss = 0.38066330
Iteration 17, loss = 0.37970144
Iteration 18, loss = 0.37987970
Iteration 19, loss = 0.37953173
Iteration 20, loss = 0.37848023
Iteration 21, loss = 0.37801483
Iteration 22, loss = 0.37789394
Iteration 23, loss = 0.37761946
Iteration 24, loss = 0.37703934
Iteration 25, loss = 0.37677159
Iteration 26, loss = 0.37496969
Iteration 27, loss = 0.37581677
Iteration 28, loss = 0.37454357
Iteration 29, loss = 0.37367511
Iteration 30, loss = 0.37406914
Iteration 31, loss = 0.37440652
Iteration 32, los



Training set score: 0.902501
Test set score: 0.236208


In [61]:
# The model trained in the previous stage becomes the starting point.
mlp = mlp2

# Another 50-iteration stage with the same settings (adaptive learning rate,
# initial rate .005). A fresh estimator is created and its fitted-state
# attributes are filled in by hand; with warm_start=True, fit() resumes from
# these values.
mlp2 = MLPClassifier(hidden_layer_sizes=(200,), max_iter=50, alpha=1e-4,
                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
                    learning_rate_init=.005, warm_start=True, learning_rate='adaptive')

# _initialize: counters and network layout, taken from the previous model
mlp2.n_iter_ = 0
mlp2.t_ = 0
mlp2.n_outputs_ = mlp.n_outputs_

# Number of layers
mlp2.n_layers_ = mlp.n_layers_

# Output for multi class
mlp2.out_activation_ = mlp.out_activation_

# Seed coefficient and intercept layers from the previous model. The arrays
# are copied rather than shared: assigning the lists directly would make both
# estimators point at the same arrays, so training mlp2 would also change the
# weights seen through `mlp` (the SGD update presumably works in place --
# verify for the installed scikit-learn).
mlp2.coefs_ = [weights.copy() for weights in mlp.coefs_]
mlp2.intercepts_ = [bias.copy() for bias in mlp.intercepts_]

# Fresh loss history and convergence bookkeeping for this stage
mlp2.loss_curve_ = []
mlp2._no_improvement_count = 0
if mlp2.early_stopping:
    mlp2.validation_scores_ = []
    mlp2.best_validation_score_ = -np.inf
else:
    mlp2.best_loss_ = np.inf

mlp2.fit(X_train, Y_train)
print("Training set score: %f" % mlp2.score(X_train, Y_train))
print("Test set score: %f" % mlp2.score(X_test, Y_test))

Iteration 1, loss = 0.36426878
Iteration 2, loss = 0.36380004
Iteration 3, loss = 0.36429859
Iteration 4, loss = 0.36354746
Iteration 5, loss = 0.36247448
Iteration 6, loss = 0.36287318
Iteration 7, loss = 0.36173778
Iteration 8, loss = 0.36203876
Iteration 9, loss = 0.36174762
Iteration 10, loss = 0.36136617
Iteration 11, loss = 0.36059942
Iteration 12, loss = 0.36029864
Iteration 13, loss = 0.36022216
Iteration 14, loss = 0.35946373
Iteration 15, loss = 0.35896693
Iteration 16, loss = 0.35858001
Iteration 17, loss = 0.35767423
Iteration 18, loss = 0.35805066
Iteration 19, loss = 0.35756295
Iteration 20, loss = 0.35672987
Iteration 21, loss = 0.35616548
Iteration 22, loss = 0.35623397
Iteration 23, loss = 0.35595185
Iteration 24, loss = 0.35545152
Iteration 25, loss = 0.35524096
Iteration 26, loss = 0.35351674
Iteration 27, loss = 0.35446743
Iteration 28, loss = 0.35316783
Iteration 29, loss = 0.35239027
Iteration 30, loss = 0.35269911
Iteration 31, loss = 0.35324319
Iteration 32, los



Training set score: 0.924840
Test set score: 0.234324


In [62]:
# The model trained in the previous stage becomes the starting point.
mlp = mlp2

# Fine-tuning stage: weaker L2 penalty (alpha=1e-5) and a smaller initial
# learning rate (.001), still with the adaptive schedule. A fresh estimator is
# created and its fitted-state attributes are filled in by hand; with
# warm_start=True, fit() resumes from these values.
mlp2 = MLPClassifier(hidden_layer_sizes=(200,), max_iter=50, alpha=1e-5,
                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
                    learning_rate_init=.001, warm_start=True, learning_rate='adaptive')

# _initialize: counters and network layout, taken from the previous model
mlp2.n_iter_ = 0
mlp2.t_ = 0
mlp2.n_outputs_ = mlp.n_outputs_

# Number of layers
mlp2.n_layers_ = mlp.n_layers_

# Output for multi class
mlp2.out_activation_ = mlp.out_activation_

# Seed coefficient and intercept layers from the previous model. The arrays
# are copied rather than shared: assigning the lists directly would make both
# estimators point at the same arrays, so training mlp2 would also change the
# weights seen through `mlp` (the SGD update presumably works in place --
# verify for the installed scikit-learn).
mlp2.coefs_ = [weights.copy() for weights in mlp.coefs_]
mlp2.intercepts_ = [bias.copy() for bias in mlp.intercepts_]

# Fresh loss history and convergence bookkeeping for this stage
mlp2.loss_curve_ = []
mlp2._no_improvement_count = 0
if mlp2.early_stopping:
    mlp2.validation_scores_ = []
    mlp2.best_validation_score_ = -np.inf
else:
    mlp2.best_loss_ = np.inf

mlp2.fit(X_train, Y_train)
print("Training set score: %f" % mlp2.score(X_train, Y_train))
print("Test set score: %f" % mlp2.score(X_test, Y_test))

Iteration 1, loss = 0.29884172
Iteration 2, loss = 0.29871739
Iteration 3, loss = 0.29892879
Iteration 4, loss = 0.29863202
Iteration 5, loss = 0.29831857
Iteration 6, loss = 0.29841417
Iteration 7, loss = 0.29830743
Iteration 8, loss = 0.29849440
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Setting learning rate to 0.000200
Iteration 9, loss = 0.29061435
Iteration 10, loss = 0.28961098
Iteration 11, loss = 0.28951599
Iteration 12, loss = 0.28939727
Iteration 13, loss = 0.28950599
Iteration 14, loss = 0.28938731
Iteration 15, loss = 0.28936947
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Setting learning rate to 0.000040
Iteration 16, loss = 0.28775280
Iteration 17, loss = 0.28749348
Iteration 18, loss = 0.28745869
Iteration 19, loss = 0.28744719
Iteration 20, loss = 0.28742563
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Setting learning rate to 0.000008
Iteration 21, loss = 0.2

In [36]:
# Switch the existing estimator to warm-start mode and raise its iteration cap,
# then show the full parameter set for confirmation.
resume_params = {'warm_start': True, 'max_iter': 300}
mlp.set_params(**resume_params)
print(mlp.get_params())

{'nesterovs_momentum': True, 'tol': 0.0001, 'hidden_layer_sizes': (200,), 'random_state': 1, 'power_t': 0.5, 'shuffle': True, 'verbose': 10, 'epsilon': 1e-08, 'activation': 'relu', 'batch_size': 'auto', 'alpha': 0.0001, 'beta_1': 0.9, 'max_iter': 300, 'early_stopping': False, 'beta_2': 0.999, 'momentum': 0.9, 'learning_rate_init': 0.005, 'learning_rate': 'constant', 'solver': 'sgd', 'warm_start': True, 'validation_fraction': 0.1}


In [45]:
# Fit `mlp` again with its current parameters and report both accuracies.
mlp.fit(X_train, Y_train)
train_score = mlp.score(X_train, Y_train)
print("Training set score: %f" % train_score)
test_score = mlp.score(X_test, Y_test)
print("Test set score: %f" % test_score)

Iteration 257, loss = 0.65834327
Training set score: 0.800945
Test set score: 0.249009


In [72]:
# Persist the trained weights of mlp2.
# coefs_ holds one weight matrix per layer and, with 390 inputs and 200 hidden
# units, the matrices differ in shape. They are packed into a 1-D object array
# explicitly, because newer NumPy releases refuse to build an array from such
# a ragged list implicitly. Reload with np.load('coefs.npy', allow_pickle=True).
coefs_packed = np.empty(len(mlp2.coefs_), dtype=object)
for layer, weights in enumerate(mlp2.coefs_):
    coefs_packed[layer] = weights
np.save('coefs', coefs_packed)

# NOTE(review): saved unchanged; this relies on every layer's bias vector having
# the same length (200 hidden units and 200 classes here) -- revisit if the
# architecture changes.
np.save('intercepts', mlp2.intercepts_)

In [47]:
# Print the fitted state of `mlp`, one attribute after another
# (coefs_ are the weights, intercepts_ the biases).
for attr_name in ('classes_', 'loss_', 'coefs_', 'intercepts_',
                  'n_iter_', 'n_layers_', 'n_outputs_', 'out_activation_'):
    print(getattr(mlp, attr_name))

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199]
0.49418907089
[array([[-0.11212256,  0.16968417, -0.09096973, ..., -0.16198049,
         0.06640543,  0.84388988],
       [ 0.09123717,  0.07599923, -0.02663023, ..., -0.05712243,
       

In [30]:
# Accuracy of `mlp` on the current test samples.
test_score = mlp.score(X_test, Y_test)
print("Test set score: %f" % test_score)

Test set score: 0.117730


In [28]:
import scipy.io as spio

# Export the train/test matrices and labels to a MATLAB .mat file.
dataset = {'X': X_train, 'y': Y_train, 'X_test': X_test, 'Y_test': Y_test}
spio.savemat('dataset.mat', dataset)

In [67]:
# Keep the previously built test set around before rebuilding it.
X_test_old = X_test
Y_test_old = Y_test
X_test = []
Y_test = []

# Rebuild the test set with speaker-level normalisation. The statistics pool
# each speaker's training AND test frames; the training matrices themselves
# are left as they are.
for speaker_id_str, feature_list in train.items():
    speaker_id = idx[speaker_id_str]
    test_feature_list = test[speaker_id_str]

    # Speaker-level mean and std over all frames (one row per frame),
    # training utterances first, then test utterances.
    data = np.concatenate(
        [features.T for features in feature_list]
        + [features.T for features in test_feature_list],
        axis=0)
    mean = data.mean()
    std = data.std()

    # Normalise and window only the speaker's test utterances.
    for features in test_feature_list:
        frames = concat((features - mean)/std)
        X_test.extend(frames)
        Y_test.extend([speaker_id] * len(frames))

X_test = np.array(X_test)
Y_test = np.array(Y_test)

In [71]:
# Accuracy of `mlp2` on the current test samples.
test_score = mlp2.score(X_test, Y_test)
print("Test set score: %f" % test_score)

Test set score: 0.561193


In [66]:
X_test = []
Y_test = []

# global mvn: one mean and one std computed over every frame of every speaker
# (training and test utterances alike).
all_frames = []
for speaker_id_str, feature_list in train.items():
    # Per speaker: training utterances first, then that speaker's test utterances.
    all_frames.extend(features.T for features in feature_list)
    all_frames.extend(features.T for features in test[speaker_id_str])

data = np.concatenate(all_frames, axis=0)
mean = data.mean()
std = data.std()

# Normalise and window the test utterances with the global statistics.
for speaker_id_str, feature_list in test.items():
    speaker_id = idx[speaker_id_str]
    for features in feature_list:
        frames = concat((features - mean)/std)
        X_test.extend(frames)
        Y_test.extend([speaker_id] * len(frames))

X_test = np.array(X_test)
Y_test = np.array(Y_test)
print("Test set score: %f" % mlp2.score(X_test, Y_test))

Test set score: 0.042033
