In [1]:
import os

from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav
from random import randint

import tensorflow as tf
import numpy as numpy

In [2]:
def get_feature_vectors(dataset_type):
    
    #set parameters for training and testing
    if (dataset_type == "train"):
        directory = os.path.join(os.getcwd(), 'data_thuyg20_sre/enroll')
        no_of_frames = 400
    elif (dataset_type == "test"):    
        directory = os.path.join(os.getcwd(), 'data_thuyg20_sre/test')
        no_of_frames = 40
    
    dataset = numpy.empty([0, 40])
    
    for file in os.listdir(directory):
        
        # filter speakers
        names = ['F101', 'F102', 'F103', 'F104', 'F105', 'M101', 'M102', 'M103', 'M104']

        if any(name in file for name in names):
            
            # extract mfcc vectors
            (rate,sig) = wav.read(os.path.join(directory, file))
            mfcc_feat = mfcc(sig,rate)
            d_mfcc_feat = delta(mfcc_feat, 2)
            dd_mfcc_feat = delta(d_mfcc_feat, 2)
            
            #fbank_feat = logfbank(sig,rate)
            mfcc_vectors = mfcc_feat[11:11+no_of_frames,:]
            dmfcc_vectors = d_mfcc_feat[11:11+no_of_frames,:]
            ddmfcc_vectors = dd_mfcc_feat[11:11+no_of_frames,:]
            
            feature_vectors = numpy.hstack((mfcc_vectors, dmfcc_vectors, ddmfcc_vectors))
            #print(feature_vectors.shape)
            
            # get speaker index from filename
            speaker_index = file.split("_")[0]
            if speaker_index[0] == 'M':
                speaker_index = 5 + int(speaker_index[3:])
            else:
                speaker_index = int(speaker_index[3:])

            #append speaker index to feature vectors
            np_speaker_index = numpy.array([speaker_index])
            temp = numpy.tile(np_speaker_index[numpy.newaxis,:], (feature_vectors.shape[0],1))
            concatenated_feature_vector = numpy.concatenate((feature_vectors,temp), axis=1)
            
            #print(dataset.shape)
            #print(concatenated_feature_vector.shape)
            # append file's data to dataset
            dataset = numpy.concatenate((dataset, concatenated_feature_vector), axis=0)
            
            

    return dataset

In [3]:
def get_feature_vectors_male(dataset_type):
    
    #set parameters for training and testing
    if (dataset_type == "train"):
        directory = os.path.join(os.getcwd(), 'data_thuyg20_sre/enroll')
        no_of_frames = 400
    elif (dataset_type == "test"):    
        directory = os.path.join(os.getcwd(), 'data_thuyg20_sre/test')
        no_of_frames = 40
    
    dataset = numpy.empty([0, 40])
    
    for file in os.listdir(directory):
        
        # filter speakers
        names = ['M101', 'M102', 'M103', 'M104', 'M105', 'M106', 'M107', 'M108', 'M109']

        if any(name in file for name in names):
            
            # extract mfcc vectors
            (rate,sig) = wav.read(os.path.join(directory, file))
            mfcc_feat = mfcc(sig,rate)
            d_mfcc_feat = delta(mfcc_feat, 2)
            dd_mfcc_feat = delta(d_mfcc_feat, 2)
            
            #fbank_feat = logfbank(sig,rate)
            mfcc_vectors = mfcc_feat[11:11+no_of_frames,:]
            dmfcc_vectors = d_mfcc_feat[11:11+no_of_frames,:]
            ddmfcc_vectors = dd_mfcc_feat[11:11+no_of_frames,:]
            
            feature_vectors = numpy.hstack((mfcc_vectors, dmfcc_vectors, ddmfcc_vectors))
            #print(feature_vectors.shape)
            
            # get speaker index from filename
            speaker_index = file.split("_")[0]
            speaker_index = int(speaker_index[2:])

            #append speaker index to feature vectors
            np_speaker_index = numpy.array([speaker_index])
            temp = numpy.tile(np_speaker_index[numpy.newaxis,:], (feature_vectors.shape[0],1))
            concatenated_feature_vector = numpy.concatenate((feature_vectors,temp), axis=1)
            
            #print(dataset.shape)
            #print(concatenated_feature_vector.shape)
            # append file's data to dataset
            dataset = numpy.concatenate((dataset, concatenated_feature_vector), axis=0)
            
            

    return dataset

In [4]:
def get_feature_vectors_female(dataset_type):
    
    #set parameters for training and testing
    if (dataset_type == "train"):
        directory = os.path.join(os.getcwd(), 'data_thuyg20_sre/enroll')
        no_of_frames = 400
    elif (dataset_type == "test"):    
        directory = os.path.join(os.getcwd(), 'data_thuyg20_sre/test')
        no_of_frames = 40
    
    dataset = numpy.empty([0, 40])
    
    for file in os.listdir(directory):
        
        # filter speakers
        names = ['F101', 'F102', 'F103', 'F104', 'F105', 'F106', 'F107', 'F108', 'F109']

        if any(name in file for name in names):
            
            # extract mfcc vectors
            (rate,sig) = wav.read(os.path.join(directory, file))
            mfcc_feat = mfcc(sig,rate)
            d_mfcc_feat = delta(mfcc_feat, 2)
            dd_mfcc_feat = delta(d_mfcc_feat, 2)
            
            #fbank_feat = logfbank(sig,rate)
            mfcc_vectors = mfcc_feat[11:11+no_of_frames,:]
            dmfcc_vectors = d_mfcc_feat[11:11+no_of_frames,:]
            ddmfcc_vectors = dd_mfcc_feat[11:11+no_of_frames,:]
            
            feature_vectors = numpy.hstack((mfcc_vectors, dmfcc_vectors, ddmfcc_vectors))
            #print(feature_vectors.shape)
            
            # get speaker index from filename
            speaker_index = file.split("_")[0]
            speaker_index = int(speaker_index[2:])

            #append speaker index to feature vectors
            np_speaker_index = numpy.array([speaker_index])
            temp = numpy.tile(np_speaker_index[numpy.newaxis,:], (feature_vectors.shape[0],1))
            concatenated_feature_vector = numpy.concatenate((feature_vectors,temp), axis=1)
            
            #print(dataset.shape)
            #print(concatenated_feature_vector.shape)
            # append file's data to dataset
            dataset = numpy.concatenate((dataset, concatenated_feature_vector), axis=0)
            
            

    return dataset

In [5]:
# from numpy import genfromtxt
my_data = get_feature_vectors("train")

In [6]:
# print(my_data)
print(my_data.shape)

(3600, 40)


In [7]:
Y = numpy.copy(my_data[:, 39:])
print(Y.shape)

(3600, 1)


In [8]:
X = numpy.copy(my_data[:, :39])
print(X.shape)
mean = X.mean(0, keepdims=True)

print(mean.shape)
std_deviation = numpy.std(X, axis=0, keepdims=True)
print(std_deviation.shape)

normalized_X = (X - mean) / std_deviation
print(normalized_X.shape)

(3600, 39)
(1, 39)
(1, 39)
(3600, 39)


In [9]:
test_model = get_feature_vectors("test")
print(test_model.shape)

test_X = numpy.copy(test_model[:, :39])
print(test_X.shape)

normalized_test_X = (test_X - mean) / std_deviation
print(normalized_test_X.shape)

test_Y = numpy.copy(test_model[:, 39:])
print(test_Y.shape)

(3600, 40)
(3600, 39)
(3600, 39)
(3600, 1)


In [10]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution1D, MaxPooling1D
from keras.utils import np_utils

from matplotlib import pyplot as plt

Using TensorFlow backend.


In [11]:
print(normalized_X.shape)
test_X = test_X.reshape(test_X.shape[0], 39, 1)
normalized_X = normalized_X.reshape(normalized_X.shape[0], 39, 1)
normalized_test_X = normalized_test_X.reshape(normalized_test_X.shape[0], 39, 1)


one_hot_labels = np_utils.to_categorical(Y, num_classes=10)
print(one_hot_labels.shape)
test_labels = np_utils.to_categorical(test_Y, num_classes=10)

(3600, 39)
(3600, 10)


In [26]:
model = Sequential()

model.add(Convolution1D(32, 13, activation='tanh', input_shape=(39,1)))
model.add(Convolution1D(32, 1, activation='tanh'))

model.add(MaxPooling1D(pool_size=(1)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(60, activation='tanh'))
model.add(Dropout(0.25))
model.add(Dense(10, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(normalized_X, one_hot_labels, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa0cdcb35f8>

In [27]:
print(model.test_on_batch(test_X, test_labels, sample_weight=None))
print(model.metrics_names)
predictions = model.predict(test_X)

b = [sum(predictions[current: current+40]) for current in range(0, len(predictions), 40)]
predicted_Y = []
for row in b:
    predicted_Y.append(row.argmax(axis=0))
    
# print(predicted_Y)
# print(test_Y[::40].T)

#for t, p in zip(test_Y[::40].T[0], predicted_Y):
#    print (int(t), p)

diff = predicted_Y - test_Y[::40].T[0]

numerator = sum(x == 0 for x in diff)
denominator = len(predicted_Y)

print("{} of {}".format(numerator, denominator))

print("Accuracy: {}".format(numerator/denominator))

[1.8617187, 0.45666668]
['loss', 'acc']
57 of 90
Accuracy: 0.6333333333333333
