## Environment Setup

In [1]:
! git clone https://github.com/srivarshan-s/Speaker-Recognition.git
% cd Speaker-Recognition

Cloning into 'Speaker-Recognition'...
remote: Enumerating objects: 5333, done.[K
remote: Counting objects: 100% (5333/5333), done.[K
remote: Compressing objects: 100% (5297/5297), done.[K
remote: Total 5333 (delta 41), reused 5320 (delta 32), pack-reused 0[K
Receiving objects: 100% (5333/5333), 47.11 MiB | 21.42 MiB/s, done.
Resolving deltas: 100% (41/41), done.
/content/Speaker-Recognition


## Importing Libraries

In [2]:
import glob
import numpy as np
import random
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Dropout, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [4]:
import os
os. environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

## Loading Data

In [5]:
SEED = 2017
DATA_DIR = 'data/' 

In [6]:
files = glob.glob(DATA_DIR + "*.wav")
X_train, X_val = train_test_split(files, test_size=0.2, random_state=SEED)

In [7]:
print('Training examples: {}'.format(len(X_train)))
print('Validation examples: {}'.format(len(X_val)))

Training examples: 1920
Validation examples: 480


In [8]:
labels = []

for i in range(len(X_train)):
    label = X_train[i].split('/')[-1].split('_')[0]
    
    if label not in labels:
        labels.append(label)

In [9]:
print(labels)

['2', '8', '0', '9', '3', '5', '1', '7', '4', '6']


## Data Preprocessing

In [10]:
label_binarizer = LabelBinarizer()
label_binarizer.fit(list(set(labels)))

LabelBinarizer()

In [11]:
def one_hot_encode(label):
    return label_binarizer.transform(label)

In [12]:
n_features = 20
max_length = 80
n_classes = len(labels)

In [13]:
def batch_generator(data, batch_size=16):
    while 1:
        random.shuffle(data)
        X, y = [], []
        for i in range(batch_size):
            wav = data[i]
            wave, sr = librosa.load(wav, mono=True)
            label = wav.split('/')[-1].split('_')[0]
            y.append(label)
            mfcc = librosa.feature.mfcc(wave, sr)
            mfcc = np.pad(mfcc, ((0,0), (0, max_length-len(mfcc[0]))), mode='constant', constant_values=0) 
            X.append(np.array(mfcc))
        yield np.array(X), np.array(one_hot_encode(y))

## Model Training

In [14]:
learning_rate = 0.001
batch_size = 64
n_epochs = 50
dropout = 0.5

In [15]:
input_shape = (n_features, max_length)
steps_per_epoch = 50

In [16]:
model = Sequential()
model.add(LSTM(256, return_sequences=True, input_shape=input_shape,
dropout=dropout))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(dropout))
model.add(Dense(n_classes, activation='softmax'))

In [17]:
opt = Adam(learning_rate=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 20, 256)           345088    
                                                                 
 flatten (Flatten)           (None, 5120)              0         
                                                                 
 dense (Dense)               (None, 128)               655488    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 10)                1290      
                                                                 
Total params: 1,001,866
Trainable params: 1,001,866
Non-trainable params: 0
_________________________________________________________________


In [18]:
callbacks = [ModelCheckpoint('checkpoints/number_recognition_best_model_{epoch:02d}.hdf5', save_best_only=True),
            EarlyStopping(monitor='val_accuracy', patience=2)]

In [19]:
history = model.fit_generator(
    generator=batch_generator(X_train, batch_size),
    steps_per_epoch=steps_per_epoch,
    epochs=n_epochs,
    verbose=1,
    validation_data=batch_generator(X_val, 32),
    validation_steps=5,
    callbacks=callbacks
)

  


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50


## Load the Model from Checkpoints

In [20]:
model = Sequential()
model.add(LSTM(256, return_sequences=True, input_shape=input_shape,
dropout=dropout))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(dropout))
model.add(Dense(n_classes, activation='softmax'))

opt = Adam(learning_rate=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [22]:
model.load_weights('checkpoints/number_recognition_best_model_09.hdf5')

## Perform Number Recognition

In [23]:
wave, sr = librosa.load('data/0_Agnes_100.wav', mono=True)
mfcc = librosa.feature.mfcc(wave, sr)
mfcc = np.pad(mfcc, ((0,0), (0, max_length-len(mfcc[0]))), mode='constant', constant_values=0)

In [24]:
model_input = np.array(mfcc)
model_input = model_input.reshape(1, 20, 80)

In [25]:
model_output = model.predict(model_input)

In [26]:
print(model_output)

[[9.9357867e-01 2.2778002e-05 2.3994346e-05 4.2695906e-06 4.0498990e-06
  3.0698409e-06 5.0659627e-03 3.1897347e-04 1.3947790e-04 8.3869579e-04]]


In [27]:
pred = label_binarizer.inverse_transform(model_output)

In [28]:
print(pred)

['0']


## Perform Recognition on Own Voice

In [29]:
wave, sr = librosa.load('own_data/0_own.wav', mono=True)
mfcc = librosa.feature.mfcc(wave, sr)
# mfcc = np.pad(mfcc, ((0,0), (0, max_length-len(mfcc[0]))), mode='constant', constant_values=0)

In [30]:
model_input = np.array(mfcc)
model_input = model_input.tolist()

new_model_input = []

for i in model_input:
    new_model_input.append(i[:80])

model_input = np.array(new_model_input)

model_input = model_input.reshape(1, 20, 80)

In [31]:
model_output = model.predict(model_input)

In [32]:
print(model_output)

[[0.2870426  0.00590626 0.02756211 0.14137095 0.07293818 0.03185944
  0.00254114 0.35451174 0.00244273 0.07382479]]


In [33]:
pred = label_binarizer.inverse_transform(model_output)

In [34]:
print(pred)

['7']
