## Environment Setup

In [1]:
# Fetch the project repository and make it the working directory so the
# relative paths used below (e.g. `data/`) resolve.
# The magics are written without a space after the escape character (`!git`, `%cd`):
# newer IPython releases may not parse `% cd` as the `cd` line magic.
!git clone https://github.com/srivarshan-s/Speaker-Recognition.git
%cd Speaker-Recognition

Cloning into 'Speaker-Recognition'...
remote: Enumerating objects: 5315, done.[K
remote: Counting objects: 100% (5315/5315), done.[K
remote: Compressing objects: 100% (5284/5284), done.[K
remote: Total 5315 (delta 33), reused 5306 (delta 28), pack-reused 0[K
Receiving objects: 100% (5315/5315), 36.36 MiB | 12.97 MiB/s, done.
Resolving deltas: 100% (33/33), done.
/content/Speaker-Recognition


## Importing Libraries

In [2]:
import glob
import numpy as np
import random
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Dropout, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [4]:
import os
# Ask TensorFlow's native (C++) logging to print errors only.
# NOTE(review): this assignment runs after `import tensorflow` in the previous
# cell; the variable most likely has to be set before that import to have any
# effect — confirm, and consider moving this cell above the TensorFlow import.
os. environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

## Loading Data

In [5]:
# Seed shared by every source of randomness in the notebook.
SEED = 2017
# Directory (relative to the cloned repository) holding the .wav recordings.
DATA_DIR = 'data/'

# Seed the Python and NumPy global RNGs too: previously only `train_test_split`
# received SEED, so batch sampling (which uses `random`) changed on every run.
# TensorFlow's weight initialisation is not seeded here.
random.seed(SEED)
np.random.seed(SEED)

In [6]:
# glob gives no ordering guarantee (it depends on the file system), so sort the
# paths first; otherwise the seeded split below can still differ between machines.
files = sorted(glob.glob(DATA_DIR + "*.wav"))
# 80/20 train/validation split, reproducible through SEED.
X_train, X_val = train_test_split(files, test_size=0.2, random_state=SEED)

In [7]:
# Report how many recordings ended up in each split.
n_train, n_val = len(X_train), len(X_val)
print('Training examples: {}'.format(n_train))
print('Validation examples: {}'.format(n_val))

Training examples: 1920
Validation examples: 480


In [8]:
# The speaker name is the second underscore-separated field of the file name
# (e.g. `data/0_Agnes_100.wav` -> `Agnes`). Keep each name once, in the order it
# first appears in the training files; dict keys preserve insertion order.
speakers_in_order = (path.split('/')[-1].split('_')[1] for path in X_train)
labels = list(dict.fromkeys(speakers_in_order))

In [9]:
# Show the distinct speaker names found in the training files.
print(labels)

['Junior', 'Daniel', 'Steffi', 'Victoria', 'Tom', 'Samantha', 'Princess', 'Ralph', 'Alex', 'Kathy', 'Agnes', 'Bruce', 'Albert', 'Fred', 'Vicki']


## Data Preprocessing

In [10]:
# Fit a binarizer on the distinct speaker names. It is used both to one-hot
# encode training targets and to map model outputs back to a speaker name.
label_binarizer = LabelBinarizer()
label_binarizer.fit(list(set(labels)))

LabelBinarizer()

In [11]:
def one_hot_encode(label):
    """Encode speaker labels with the module-level fitted `label_binarizer`.

    Args:
        label: labels to encode, passed unchanged to
            `label_binarizer.transform` (`batch_generator` passes a list of
            speaker-name strings).

    Returns:
        The result of `label_binarizer.transform(label)`.
    """
    encoded = label_binarizer.transform(label)
    return encoded

In [12]:
# First axis of the model input; presumably the number of MFCC coefficients per
# frame (librosa's default `n_mfcc` is 20) — verify if the mfcc call changes.
n_features = 20
# Number of MFCC frames each clip is zero-padded to (second axis of the input).
max_length = 80
# One output class per distinct speaker label.
n_classes = len(labels)

In [13]:
def batch_generator(data, batch_size=16):
    """Endlessly yield random (features, one-hot labels) batches for Keras.

    Args:
        data: list of .wav file paths whose base name looks like
            `0_Agnes_100.wav`; the second underscore-separated field is taken
            as the speaker label. The list is not modified.
        batch_size: number of files drawn per batch (capped at `len(data)`).

    Yields:
        Tuple `(X, y)` where `X` has shape `(batch, n_features, max_length)`
        (MFCCs truncated or zero-padded along the frame axis) and `y` is
        `one_hot_encode` applied to the batch's labels, as a NumPy array.

    Raises:
        ValueError: on the first `next()` if `data` is empty.
    """
    if len(data) == 0:
        raise ValueError('batch_generator needs at least one audio file')

    # Never ask for more files than exist (previously an IndexError).
    n_draw = min(batch_size, len(data))

    while True:
        # Drawing without replacement is equivalent to "shuffle, then take the
        # first n", but leaves the caller's list (X_train / X_val) untouched.
        batch_files = random.sample(data, n_draw)
        X, y = [], []
        for wav in batch_files:
            wave, sr = librosa.load(wav, mono=True)
            y.append(os.path.basename(wav).split('_')[1])
            # Keyword arguments: librosa >= 0.10 rejects positional audio/sr.
            mfcc = librosa.feature.mfcc(y=wave, sr=sr, n_mfcc=n_features)
            # Truncate long clips first so the pad width can never be negative.
            mfcc = mfcc[:, :max_length]
            mfcc = np.pad(mfcc, ((0, 0), (0, max_length - mfcc.shape[1])),
                          mode='constant', constant_values=0)
            X.append(mfcc)
        yield np.array(X), np.array(one_hot_encode(y))

## Model Training

In [14]:
# Step size handed to the Adam optimiser.
learning_rate = 0.001
# Number of files per training batch drawn by `batch_generator`.
batch_size = 64
# Upper bound on training epochs; the EarlyStopping callback may stop sooner.
n_epochs = 50
# Dropout rate used for both the LSTM layer and the Dropout layer in the dense head.
dropout = 0.5

In [15]:
# Per-sample input shape fed to the first layer: (n_features, max_length).
# NOTE(review): Keras recurrent layers read the first axis as timesteps, so the
# LSTM steps over the n_features rows with max_length values each (the summary
# shows output shape (None, 20, 256)) — confirm this orientation is intended.
input_shape = (n_features, max_length)
# Number of generator batches that make up one training epoch.
steps_per_epoch = 50

In [16]:
# Network: LSTM over the MFCC matrix (keeping every step's output), flattened
# into a dense ReLU layer with dropout, then a softmax over the speaker classes.
model = Sequential([
    LSTM(256, return_sequences=True, input_shape=input_shape, dropout=dropout),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(dropout),
    Dense(n_classes, activation='softmax'),
])

In [17]:
# Adam with the configured learning rate; categorical cross-entropy pairs with
# the one-hot targets and the softmax output layer.
opt = Adam(learning_rate=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
# Print the layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 20, 256)           345088    
                                                                 
 flatten (Flatten)           (None, 5120)              0         
                                                                 
 dense (Dense)               (None, 128)               655488    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 15)                1935      
                                                                 
Total params: 1,002,511
Trainable params: 1,002,511
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Training callbacks:
#  - ModelCheckpoint writes an .hdf5 file (epoch number in the name) only when
#    its monitored metric improves. No `monitor` is passed, so the Keras default
#    applies (val_loss, per the Keras documentation).
#  - EarlyStopping ends training once val_accuracy has failed to improve for
#    2 consecutive epochs.
# NOTE(review): the two callbacks therefore track different metrics — confirm
# this is intended.
callbacks = [ModelCheckpoint('checkpoints/voice_recognition_best_model_{epoch:02d}.hdf5', save_best_only=True),
            EarlyStopping(monitor='val_accuracy', patience=2)]

In [19]:
# Train from the endless batch generators. `Model.fit` accepts Python generators
# directly; `fit_generator` is deprecated in TF 2.x and absent from newer Keras
# releases. Because the generators never terminate, the number of batches per
# epoch and per validation pass must be given explicitly.
history = model.fit(
    batch_generator(X_train, batch_size),
    steps_per_epoch=steps_per_epoch,
    epochs=n_epochs,
    verbose=1,
    validation_data=batch_generator(X_val, 32),
    validation_steps=5,
    callbacks=callbacks
)

  


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


## Load the Model from Checkpoints

In [32]:
# Rebuild the same architecture as used for training so that the saved
# checkpoint weights can be loaded into it.
model = Sequential([
    LSTM(256, return_sequences=True, input_shape=input_shape, dropout=dropout),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(dropout),
    Dense(n_classes, activation='softmax'),
])

# Compile with the same optimiser settings, loss and metric as before.
opt = Adam(learning_rate=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [33]:
# Load the newest checkpoint instead of a hard-coded epoch number: which epochs
# get saved varies from run to run (early stopping, save_best_only), so a fixed
# `..._12.hdf5` may not exist after a fresh run. Epoch numbers are zero-padded,
# so lexicographic order equals epoch order, and with save_best_only=True the
# highest-numbered file is the best model of the run.
# Stale files from earlier runs in `checkpoints/` are also considered.
checkpoint_paths = sorted(glob.glob('checkpoints/voice_recognition_best_model_*.hdf5'))
if not checkpoint_paths:
    raise FileNotFoundError('No checkpoint found in checkpoints/ - train the model first.')
model.load_weights(checkpoint_paths[-1])

## Perform Speaker Recognition

In [34]:
# Load one known recording and turn it into the fixed-size MFCC matrix the model expects.
wave, sr = librosa.load('data/0_Agnes_100.wav', mono=True)
# Keyword arguments: librosa >= 0.10 no longer accepts positional audio/sr.
mfcc = librosa.feature.mfcc(y=wave, sr=sr, n_mfcc=n_features)
# Truncate first so the pad width below can never be negative for long clips.
mfcc = mfcc[:, :max_length]
mfcc = np.pad(mfcc, ((0, 0), (0, max_length - mfcc.shape[1])), mode='constant', constant_values=0)

In [35]:
# Add a leading batch axis of size 1: the model is fed arrays shaped (batch, 20, 80).
model_input = np.reshape(np.array(mfcc), (1, 20, 80))

In [36]:
# Run the network on the single-sample batch; the result holds one score per speaker class.
model_output = model.predict(model_input)

In [37]:
# Display the raw per-class scores.
print(model_output)

[[8.4227544e-01 7.5190297e-07 2.1749983e-02 6.5446432e-07 4.7448056e-04
  7.7964836e-07 2.6274284e-02 2.7151461e-04 9.5497072e-02 8.7537402e-03
  1.9678359e-03 6.7181670e-04 1.3139777e-05 8.5734029e-04 1.1910251e-03]]


In [40]:
# Map the score vector back to a speaker name with the fitted binarizer.
pred = label_binarizer.inverse_transform(model_output)

In [41]:
# Show the predicted speaker.
print(pred)

['Agnes']


## Perform Recognition on Own Voice

In [89]:
# Load the user's own recording and compute its MFCCs.
wave, sr = librosa.load('own_data/0_own.wav', mono=True)
# Keyword arguments: librosa >= 0.10 no longer accepts positional audio/sr.
mfcc = librosa.feature.mfcc(y=wave, sr=sr, n_mfcc=n_features)
# The frame axis is fitted to the model's 80-frame input in the next cell.

In [90]:
# Fit the MFCC matrix to the model's fixed input size. A NumPy slice replaces
# the former list round-trip, and zero-padding is added so that recordings
# shorter than `max_length` frames no longer make the reshape fail.
model_input = np.array(mfcc)[:, :max_length]
model_input = np.pad(model_input, ((0, 0), (0, max_length - model_input.shape[1])),
                     mode='constant', constant_values=0)

# Add the leading batch axis expected by `model.predict`.
model_input = model_input.reshape(1, n_features, max_length)

In [91]:
# Score the user's recording against every speaker class.
model_output = model.predict(model_input)

In [92]:
# Display the raw per-class scores for the user's recording.
print(model_output)

[[8.4536329e-02 5.9525924e-08 2.0142584e-01 1.3455414e-05 6.8738249e-05
  7.6857963e-05 1.6454317e-01 7.8974560e-02 5.0639375e-03 3.0583099e-03
  6.0094148e-06 5.0554087e-04 1.4652185e-05 2.8091407e-01 1.8079846e-01]]


In [93]:
# Decode the scores to the speaker name chosen by the fitted binarizer.
pred = label_binarizer.inverse_transform(model_output)

In [94]:
# Show which known speaker the user's voice was matched to.
print(pred)

['Vicki']
