In [1]:
import librosa
import numpy as np

indir = 'chunks/' # already VAD
from os import listdir
from os.path import isfile, join
mypath = 'chunks/'
flist = [f for f in listdir(indir) if isfile(join(indir, f))]
print(len(flist), flist[0])

2289 DR1-MCPM0-SA1-00.wav


In [2]:
def get_attributes(fname):
    attr = fname.split('.')[0].split('-')
    dialect = attr[0]
    gender = attr[1][0]
    speaker_id = attr[1]
    sentence_type = attr[2][:2]
    return dialect, gender, speaker_id, sentence_type

In [3]:
train = {}
test = {}

for fname in flist:
    input_path = indir + fname
    y, sr = librosa.load(input_path, sr=None) # set sr=None for orig file sr otherwise it is converted to ~22K

    # scaling the maximum of absolute amplitude to 1
    processed_data = y/max(abs(y))
    
    # TODO: calc VAD (already done)
    
    # https://groups.google.com/forum/#!topic/librosa/V4Z1HpTKn8Q
    mfcc = librosa.feature.mfcc(y=processed_data, sr=sr, n_mfcc=13, n_fft=(25*sr)//1000, hop_length=(10*sr)//1000)
    mfcc[0] = librosa.feature.rmse(processed_data, hop_length=int(0.010*sr), n_fft=int(0.025*sr)) 
    mfcc_delta = librosa.feature.delta(mfcc)
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
    features = np.vstack([mfcc, mfcc_delta, mfcc_delta2]) 
    
    # split train test
    dialect, gender, speaker_id, sentence_type = get_attributes(fname)
    if sentence_type == 'SA':
        test.setdefault(speaker_id, []).append(features)
    else:
        train.setdefault(speaker_id, []).append(features)

In [4]:
ids = list(test.keys())
ids.sort()
print(ids)

idx = {}
for i in range(len(ids)):
    idx[ids[i]] = i # TODO: for MATLAB set i+1 (i.e 1 to 200)
print(idx)

['MADC0', 'MAEB0', 'MAKB0', 'MAKR0', 'MAPV0', 'MARC0', 'MARW0', 'MBEF0', 'MBGT0', 'MBJV0', 'MBMA0', 'MBWP0', 'MCAL0', 'MCDC0', 'MCDD0', 'MCDR0', 'MCEF0', 'MCEW0', 'MCHL0', 'MCLM0', 'MCPM0', 'MCSS0', 'MCTM0', 'MDAC0', 'MDAS0', 'MDBB1', 'MDBP0', 'MDCD0', 'MDDC0', 'MDEF0', 'MDEM0', 'MDHL0', 'MDHS0', 'MDJM0', 'MDLB0', 'MDLC0', 'MDLC2', 'MDLH0', 'MDMA0', 'MDMT0', 'MDNS0', 'MDPK0', 'MDPS0', 'MDSJ0', 'MDSS0', 'MDSS1', 'MDTB0', 'MDWD0', 'MDWH0', 'MDWM0', 'MEDR0', 'MEFG0', 'MEGJ0', 'MESG0', 'MEWM0', 'MFER0', 'MFMC0', 'MFRM0', 'MFWK0', 'MGAF0', 'MGAG0', 'MGES0', 'MGJC0', 'MGRL0', 'MGRP0', 'MGSH0', 'MGXP0', 'MHIT0', 'MHJB0', 'MHMG0', 'MHMR0', 'MHRM0', 'MILB0', 'MJAC0', 'MJAE0', 'MJBG0', 'MJDA0', 'MJDC0', 'MJDE0', 'MJEB0', 'MJEB1', 'MJEE0', 'MJHI0', 'MJJB0', 'MJJJ0', 'MJKR0', 'MJLB0', 'MJLG1', 'MJLS0', 'MJMA0', 'MJMD0', 'MJMM0', 'MJPM0', 'MJPM1', 'MJRH0', 'MJRH1', 'MJRP0', 'MJSR0', 'MJWS0', 'MJWT0', 'MJXL0', 'MKAH0', 'MKAJ0', 'MKAM0', 'MKDT0', 'MKJO0', 'MKLS0', 'MKLS1', 'MKLW0', 'MKXL0', 'MLBC0', 

In [5]:
def concat(x, win_size=10, hop_size=3):
    r, c = x.shape
    y = []
    for i in range(0, c, hop_size):
        if i + win_size > c:
            break
        y.append(x[:, i:i + win_size].T.flatten())
    return np.array(y)

In [6]:
X_test = []
Y_test = []
X_train = []
Y_train = []

# smvn of train + test
for speaker_id_str, feature_list in train.items():
    speaker_id = idx[speaker_id_str]
    
    # calc speaker level mean and std
    data = []
    for features in feature_list:
        frames = features.T
        for frame in frames:
            data.append(frame)
    # test data
    for features in test[speaker_id_str]:
        frames = features.T
        for frame in frames:
            data.append(frame)
    
    data = np.array(data)
    mean = data.mean()
    std = data.std()
    
    # speaker level normalize
    for features in feature_list:
        features = (features - mean)/std
        frames = concat(features)
        for frame in frames:
            X_train.append(frame)
            Y_train.append(speaker_id)
            
    # test
    for features in test[speaker_id_str]:
        features = (features - mean)/std
        frames = concat(features)
        for frame in frames:
            X_test.append(frame)
            Y_test.append(speaker_id)
            
X_train = np.array(X_train)
Y_train = np.array(Y_train)
X_test = np.array(X_test)
Y_test = np.array(Y_test)

In [7]:
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)

(137520, 390) (36091, 390)
(137520,) (36091,)


In [8]:
# shuffle training data
from sklearn.utils import shuffle
X_train, Y_train = shuffle(X_train, Y_train)

In [9]:
from sklearn.neural_network import MLPClassifier
# mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
#                     solver='sgd', verbose=10, tol=1e-4, random_state=1)
mlp = MLPClassifier(hidden_layer_sizes=(200,), max_iter=1000, alpha=1e-4,
                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
                    learning_rate_init=.01, learning_rate='adaptive',
                    warm_start=True)

mlp.fit(X_train, Y_train)
print("Training set score: %f" % mlp.score(X_train, Y_train))
print("Test set score: %f" % mlp.score(X_test, Y_test))

Iteration 1, loss = 4.79987856
Iteration 2, loss = 3.97951609
Iteration 3, loss = 3.49010571
Iteration 4, loss = 3.19037954
Iteration 5, loss = 2.97333647
Iteration 6, loss = 2.79801911
Iteration 7, loss = 2.65466405
Iteration 8, loss = 2.52934152
Iteration 9, loss = 2.42031341
Iteration 10, loss = 2.32323049
Iteration 11, loss = 2.23694064
Iteration 12, loss = 2.15878638
Iteration 13, loss = 2.08918504
Iteration 14, loss = 2.02254103
Iteration 15, loss = 1.96315661
Iteration 16, loss = 1.90854563
Iteration 17, loss = 1.85891213
Iteration 18, loss = 1.81251427
Iteration 19, loss = 1.76697304
Iteration 20, loss = 1.72508257
Iteration 21, loss = 1.68809449
Iteration 22, loss = 1.65125448
Iteration 23, loss = 1.61834834
Iteration 24, loss = 1.58774619
Iteration 25, loss = 1.55654125
Iteration 26, loss = 1.52949976
Iteration 27, loss = 1.50327693
Iteration 28, loss = 1.47636920
Iteration 29, loss = 1.45327254
Iteration 30, loss = 1.42730093
Iteration 31, loss = 1.40852136
Iteration 32, los

Iteration 253, loss = 0.48323043
Iteration 254, loss = 0.48205790
Iteration 255, loss = 0.48144245
Iteration 256, loss = 0.48064412
Iteration 257, loss = 0.47887409
Iteration 258, loss = 0.47836350
Iteration 259, loss = 0.47660211
Iteration 260, loss = 0.47503518
Iteration 261, loss = 0.47466223
Iteration 262, loss = 0.47414510
Iteration 263, loss = 0.47354456
Iteration 264, loss = 0.47033050
Iteration 265, loss = 0.47184144
Iteration 266, loss = 0.46841501
Iteration 267, loss = 0.46784961
Iteration 268, loss = 0.46650926
Iteration 269, loss = 0.46481784
Iteration 270, loss = 0.46439410
Iteration 271, loss = 0.46371653
Iteration 272, loss = 0.46222881
Iteration 273, loss = 0.46092767
Iteration 274, loss = 0.46068179
Iteration 275, loss = 0.45951787
Iteration 276, loss = 0.45850728
Iteration 277, loss = 0.45678454
Iteration 278, loss = 0.45499009
Iteration 279, loss = 0.45622813
Iteration 280, loss = 0.45689919
Iteration 281, loss = 0.45274231
Iteration 282, loss = 0.45036986
Iteration 

In [10]:
np.save('coefs_smvn_train_test', mlp.coefs_)
np.save('intercepts_smvn_train_test', mlp.intercepts_)

In [13]:
X_test2 = X_test
Y_test2 = Y_test

X_test = []
Y_test = []

from sklearn import preprocessing

# mvn of test
for speaker_id, feature_list in test.items():
    speaker_id = idx[speaker_id]
    for features in feature_list:
        features = preprocessing.scale(features.T).T
        frames = concat(features)
        for frame in frames:
            X_test.append(frame)
            Y_test.append(speaker_id)
            
X_test = np.array(X_test)
Y_test = np.array(Y_test)

In [14]:
print("Test set score: %f" % mlp.score(X_test, Y_test))

Test set score: 0.037572
