# speaker verification for text dependent utterences using with and without Gaussian posteriograms

In [1]:
import scipy.io.wavfile as wav
import numpy as np
import librosa
from sklearn.mixture import GaussianMixture
from sklearn.metrics import confusion_matrix
from dtw import dtw
import math
import matplotlib.pyplot as plt

### function to calculate mfcc from wav file and computes CMVN 

In [2]:
# Define the number of MFCC coefficients to compute
n_mfcc = 39
def compute_mfcc(audio_file):
    # Load the audio file and its sample rate
    signal, sr = librosa.load(audio_file, sr=None)
    # Compute the MFCC features
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
    mfcc = mfcc.T
    mfcc_mean = np.mean(mfcc, axis=1)
    mfcc_var = np.var(mfcc, axis=1)
    # Apply CMVN
    mfcc_cmvn = (mfcc - mfcc_mean.reshape((-1, 1))) / np.sqrt(mfcc_var).reshape((-1, 1))
    return mfcc_cmvn

# 1.Speech Verification without Gaussian Posteriorgrams  

In [3]:
DTWSPEAKER = []
for sp in range(1,6):
    path = 'data4/speaker'+str(sp)+'/train/speaker'+str(sp)+'_1.wav'
    mfcc = compute_mfcc(path)
    DTWSPEAKER.append(mfcc)
DTWSPEAKER = np.array(DTWSPEAKER)

In [4]:
def speaker_verification_without_gmms(DTW_SPEAKER,path):
    mfcc = compute_mfcc(path)
    dtw_vals = []
    for i in range(5):
        dist_score, cost, acc, path = dtw(mfcc, DTW_SPEAKER[i], dist=lambda x, y: np.linalg.norm(x - y, ord=2))
        dtw_vals.append(dist_score)
    return dtw_vals.index(min(dtw_vals))

In [5]:
test_labels = []
predicted_labels = []
for sp in range(1,6):
    for j in range(1,9):
        path = 'data4/speaker'+str(sp)+'/test/speaker'+str(sp)+'_'+str(j)+'.wav'
        test_labels.append(sp)
        pred_val = speaker_verification_without_gmms(DTWSPEAKER,path)
        predicted_labels.append(pred_val+1)
conf_mat = confusion_matrix(test_labels, predicted_labels)
print("Confusion matrix:\n", conf_mat)

Confusion matrix:
 [[8 0 0 0 0]
 [0 8 0 0 0]
 [6 0 2 0 0]
 [0 0 0 8 0]
 [0 0 0 0 8]]


# speaker verification using gaussian posteriorgrams

## train GMM's for multiple speakers

In [7]:
# Define the number of Gaussian components for the GMM
n_components = 8
# Define the maximum number of iterations for the GMM training
max_iter = 100
# Define a function to compute the MFCC features for a given audio file


# Train a GMM for each digit using the training MFCC features
speaker_gmms = {}
for sp in range(1,6):
    # Compute the MFCC features for each training file
    train_files = []
    for i in range(1,4):
        path = 'data4/speaker'+str(sp)+'/train/speaker'+str(sp)+'_'+str(i)+'.wav'
        #print(path)
        train_files.append(path)
    train_mfcc = []
    
    for train_file in train_files:
        mfcc = compute_mfcc(train_file)
        train_mfcc.append(mfcc)
    
    # Extract the MFCC features for the current digit
    speaker_mfcc = train_mfcc
    
    # Train a GMM with the specified number of components
    gmm = GaussianMixture(n_components=n_components, max_iter=max_iter)
    gmm.fit(np.vstack(speaker_mfcc))

    # Store the trained GMM
    speaker_gmms[sp] = gmm


In [8]:
def multivariate_gaussian_log(x, mu, cov):
    n = len(x)
    det_cov = np.linalg.det(cov)
    inv_cov = np.linalg.inv(cov)

    exp_term = -0.5 * np.dot(np.dot((x - mu), inv_cov), (x - mu).T)
    const_term = -(n/2)*np.log(2 * math.pi) - (1/2)*np.log(det_cov)
    log_value = exp_term + const_term 
    return log_value


In [9]:
# np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)})

DTW_SPEAKER = []
for sp in range(1,6):
    path = 'data4/speaker'+str(sp)+'/train/speaker'+str(sp)+'_1.wav'
    mfcc = compute_mfcc(path)
    prob = []
    for k in mfcc:
        prob1 = []
        for i in range(8):
            pb = multivariate_gaussian_log(k,speaker_gmms[sp].means_[i],speaker_gmms[sp].covariances_[i])
            pb = pb + np.log(speaker_gmms[sp].weights_[i]) - speaker_gmms[sp].score_samples(k.reshape(1, -1))
            prob1.append(pb)
        prob1 = np.array(prob1)
        prob1 = prob1.flatten()
        norm1 = np.linalg.norm(prob1)
        prob1 = prob1/norm1
        prob.append(prob1)
    DTW_SPEAKER.append(prob)
DTW_SPEAKER = np.array(DTW_SPEAKER)

### speaker verification with Gaussian Posteriorgrams

In [12]:
def speaker_verification(DTW_SPEAKER,path):
    mfcc = compute_mfcc(path)
    
    dtw_vals  =[]
    # i range should be number of speakers
    for sp in range(1,6):
        prob = []
        for k in mfcc:
            prob1 = []
            for i in range(8):
                pb = multivariate_gaussian_log(k,speaker_gmms[sp].means_[i],speaker_gmms[sp].covariances_[i])
                pb = pb + np.log(speaker_gmms[sp].weights_[i]) - speaker_gmms[sp].score_samples(k.reshape(1, -1))
                prob1.append(pb)
            prob1 = np.array(prob1)
            prob1 = prob1.flatten()
            norm1 = np.linalg.norm(prob1)
            prob1 = prob1/norm1
            prob.append(prob1)
        
        prob = np.array(prob)
        dist_score, cost, acc, path = dtw(prob, DTW_SPEAKER[sp-1], dist=lambda x, y: np.linalg.norm(x - y, ord=2))
        dtw_vals.append(dist_score)
    return dtw_vals.index(min(dtw_vals))
    

### Testing data

In [13]:
test_labels = []
predicted_labels = []
for sp in range(1,6):
    for j in range(1,9):
        path = 'data4/speaker'+str(sp)+'/test/speaker'+str(sp)+'_'+str(j)+'.wav'
        test_labels.append(sp)
        pred_val = speaker_verification(DTW_SPEAKER,path)
        predicted_labels.append(pred_val+1)
conf_mat = confusion_matrix(test_labels, predicted_labels)
print("Confusion matrix:\n", conf_mat)

[12.215988174753157, 18.642821669883105, 18.95778363579521, 21.386789151052884, 23.60048635185177]
[11.122745490247153, 19.247870686302964, 18.526032799053066, 19.555520009884223, 22.24025739734282]
[12.698546928103244, 19.924396120286513, 20.171238835543846, 19.162944488441752, 24.241617305177552]
[9.031318507030859, 18.992290963882102, 20.55050390451259, 20.039319790310238, 23.57003088659406]
[9.483261752283196, 19.344959578117184, 20.812078340934637, 19.396315773089103, 23.654941669616694]
[12.408610232820797, 19.307124174108615, 20.389620954526123, 18.475476819191957, 24.46935064257191]
[16.161896836561315, 19.76906581924652, 21.45138818467181, 17.655209813792304, 23.9469291362952]
[17.053203108133957, 19.44336775657353, 20.55405750923431, 18.404926351222723, 23.350383998647565]
[30.449868934839284, 11.55922857863314, 23.656379653711408, 16.628799324788517, 25.53758327004849]
[29.319868309972012, 7.135457119171621, 21.306344999534474, 18.15539471174911, 22.592047944948867]
[31.3806