In [4]:
import scipy.io.wavfile as wav
from dtw import dtw
import matplotlib.pyplot as plt
import numpy as np
import librosa
from sklearn.mixture import GaussianMixture
from sklearn.metrics import confusion_matrix

# GMM model training for digit recognisation

In [5]:
# Define the number of MFCC coefficients to compute
n_mfcc = 39

# Define the number of Gaussian components for the GMM
n_components = 32

# Define the maximum number of iterations for the GMM training
max_iter = 100

# Define a function to compute the MFCC features for a given audio file
def compute_mfcc(audio_file):
    # Load the audio file and its sample rate
    signal, sr = librosa.load(audio_file, sr=None)

    # Compute the MFCC features
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)

    return mfcc.T


# Train a GMM for each digit using the training MFCC features
digit_gmms = {}
for digit in range(10):
    # Compute the MFCC features for each training file
    train_files = []
    arr = [1,2,3,16,18]
    for i in arr:
        path = 'data/'+str(digit)+'/190108050_'+str(digit)+'_'+str(i)+'.wav'
        #print(path)
        train_files.append(path)
    train_mfcc = []
    for train_file in train_files:
        mfcc1 = compute_mfcc(train_file)
        train_mfcc.append(mfcc1)
    # Extract the MFCC features for the current digit
    digit_mfcc = train_mfcc

    # Train a GMM with the specified number of components
    gmm = GaussianMixture(n_components=n_components, max_iter=max_iter)
    gmm.fit(np.vstack(digit_mfcc))

    # Store the trained GMM
    digit_gmms[digit] = gmm


# GMM model for digit recognisation testing

In [6]:
# Compute the MFCC features for each test file
test_mfcc = []
test_labels = []

for digit in range(10):
    for i in range(1,26):
        if i in arr:
            pass
        else:
            path = 'data/'+str(digit)+'/190108050_'+str(digit)+'_'+str(i)+'.wav'
            mfcc = compute_mfcc(path)
            test_mfcc.append(mfcc)
            test_labels.append(digit)

# Perform digit recognition using the trained GMMs
predicted_labels = []
for mfcc in test_mfcc:
    # Compute the log-likelihood of the MFCC features under each GMM
    log_likelihoods = []
    for digit, gmm in digit_gmms.items():
        log_likelihood = gmm.score(mfcc)
        log_likelihoods.append(log_likelihood)

    # Identify the digit with the highest log-likelihood
    predicted_label = np.argmax(log_likelihoods)
    predicted_labels.append(predicted_label)

# Compute the confusion matrix for the recognition task
conf_mat = confusion_matrix(test_labels, predicted_labels)
print("Confusion matrix:\n", conf_mat)

    

Confusion matrix:
 [[20  0  0  0  0  0  0  0  0  0]
 [ 0 16  0  0  0  0  0  4  0  0]
 [ 0  0 20  0  0  0  0  0  0  0]
 [ 0  0  0 20  0  0  0  0  0  0]
 [ 0  0  0  0 20  0  0  0  0  0]
 [ 0  0  0  0  0 18  0  2  0  0]
 [ 0  0  0  0  0  0 20  0  0  0]
 [ 0  0  0  0  0  0  0 20  0  0]
 [ 0  0  0  0  0  0  0  0 20  0]
 [ 0  0  0  0  0  0  0  0  0 20]]
