Spoken Digit Recognition (0-9) using Hidden Markov Models (HMMs)
The world is moving fast towards more interactive, hands-free interfaces for everyday tasks,
and voice plays a major role in that shift. Voice-activated and voice-controlled devices, software and AI
assistants are on the rise. The goal of this project is to develop an effective system to detect and classify
spoken digits (0-9) using Hidden Markov Models (HMMs).

Import the necessary libraries and modules

In [None]:
# For Audio Preprocessing
import librosa
import librosa.display as dsp
from IPython.display import Audio

# For Data Preprocessing
import pandas as pd
import numpy as np
import os

# For data viz
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn.model_selection import train_test_split

import random

In [2]:
# Apply seaborn's "dark" plot style
sns.set_style("dark")

librosa.load(file): This function is part of the Librosa library, which is commonly used for audio and music signal processing in Python. It loads the audio file specified by the file variable.



In [3]:
def get_audio(digit=0, dataset_dir="EnterDatasetlocationhere"):
    """Plot a random recording of ``digit`` and return a playable widget.

    A speaker and a sample index are drawn at random, the matching
    ``{digit}_{speaker}_{index}.wav`` file is loaded with librosa and its
    waveform is plotted.

    Args:
        digit: Spoken digit whose recording should be picked.
        dataset_dir: Folder holding the ``.wav`` files. Defaults to the
            placeholder path used by this notebook.

    Returns:
        An ``IPython.display.Audio`` widget built from the loaded samples.
    """
    # np.random.randint excludes its upper bound, so 50 is required for the
    # last sample index (49) to be reachable; indices run from 0 to 49.
    sample = np.random.randint(0, 50)

    possible_names = ['yweweler', 'theo', 'nicolas', 'lucas', 'jackson', 'george']
    random_name = np.random.choice(possible_names)

    # File names follow the pattern <digit>_<speaker>_<sample index>.wav
    file = f"{dataset_dir}/{digit}_{random_name}_{sample}.wav"

    data, sample_rate = librosa.load(file)

    # Plot the audio wave
    dsp.waveshow(data, sr=sample_rate)
    plt.show()

    # Returning the widget lets the notebook render the audio player
    return Audio(data=data, rate=sample_rate)

Checking the plots of the various digits

In [None]:
# Plot the waveform of a random recording of digit 0 and render its audio player
get_audio(digit=0)

In [None]:
# Plot the waveform of a random recording of digit 1 and render its audio player
get_audio(digit=1)

In [None]:
# Plot the waveform of a random recording of digit 2 and render its audio player
get_audio(digit=2)

In [None]:
# Plot the waveform of a random recording of digit 3 and render its audio player
get_audio(digit=3)

In [None]:
# Plot the waveform of a random recording of digit 4 and render its audio player
get_audio(digit=4)

In [None]:
# Plot the waveform of a random recording of digit 5 and render its audio player
get_audio(digit=5)

In [None]:
# Plot the waveform of a random recording of digit 6 and render its audio player
get_audio(digit=6)

In [None]:
# Plot the waveform of a random recording of digit 7 and render its audio player
get_audio(digit=7)

In [None]:
# Plot the waveform of a random recording of digit 8 and render its audio player
get_audio(digit=8)

In [None]:
# Plot the waveform of a random recording of digit 9 and render its audio player
get_audio(digit=9)

A function that returns the raw audio samples and the sample rate of a randomly chosen recording of the given digit

In [15]:

def get_audio_raw(digit=0, dataset_dir="datasetLocationHere"):
    """Load a random recording of ``digit`` and return its raw samples.

    A speaker and a sample index are drawn at random and the matching
    ``{digit}_{speaker}_{index}.wav`` file is loaded with librosa.

    Args:
        digit: Spoken digit whose recording should be picked.
        dataset_dir: Folder holding the ``.wav`` files. Defaults to the
            placeholder path used by this notebook.

    Returns:
        The ``(data, sample_rate)`` pair produced by ``librosa.load``.
    """
    # np.random.randint excludes its upper bound, so 50 is required for the
    # last sample index (49) to be reachable; indices run from 0 to 49.
    sample = np.random.randint(0, 50)

    possible_names = ['yweweler', 'theo', 'nicolas', 'lucas', 'jackson', 'george']
    random_name = np.random.choice(possible_names)

    # File names follow the pattern <digit>_<speaker>_<sample index>.wav
    file = f"{dataset_dir}/{digit}_{random_name}_{sample}.wav"

    # Get audio from the location
    data, sample_rate = librosa.load(file)

    return data, sample_rate

Creating the spectrograms of the various digits

In [16]:
def spectogram_of(digit):
    """Plot the log-frequency spectrogram of a random recording of ``digit``.

    Args:
        digit: Spoken digit passed on to ``get_audio_raw``.
    """
    # Read the audio file
    data, sr = get_audio_raw(digit)
    # Apply the short-time Fourier transform to move to the time-frequency domain
    D = librosa.stft(data)
    # Convert magnitudes to decibels relative to the loudest bin
    S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
    # Pass the real sample rate so the time axis does not depend on
    # specshow's built-in default matching the loaded audio
    librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='log')
    plt.show()

Creating subplots to display the spectrogram of each digit

In [None]:
# Draw one spectrogram per digit on a 5x2 grid of subplots
fig, ax = plt.subplots(5, 2, figsize=(15, 30))

# ax.flat walks the grid row by row (left to right, top to bottom), which
# removes the need for manual row/column bookkeeping
for digit, axis in zip(range(10), ax.flat):
    # Read a random recording of this digit
    data, sr = get_audio_raw(digit)
    # Apply the short-time Fourier transform
    D = librosa.stft(data)
    # Convert magnitudes to decibels relative to the loudest bin
    S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
    # Plot on this digit's subplot, using the real sample rate for the time axis
    axis.set_title(f"Spectogram of digit {digit}")
    librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='log', ax=axis)

plt.tight_layout(pad=3)
plt.show()

MFCCs are a compact representation of the spectrum of an audio signal (the spectrum describes a waveform as a sum of a possibly infinite number of sinusoids).
MFCC coefficients contain information about the rate changes in the different spectrum bands.
If a cepstral coefficient has a positive value, the majority of the spectral energy is concentrated in the low-frequency regions. On the other hand, if a cepstral coefficient has a negative value, it represents that most of the spectral energy is concentrated at high frequencies.

The function below takes an audio file as input and returns features extracted using Mel-frequency cepstral coefficients (MFCCs).

In [18]:
def extract_features(file):
    """Return a single time-averaged MFCC vector for the audio at ``file``.

    The recording is loaded with ``librosa.load``, 40 MFCCs are computed and
    the mean of every coefficient is taken, so the result is one value per
    coefficient rather than a full coefficient-by-frame matrix.
    """
    signal, rate = librosa.load(file)

    mfcc_matrix = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40)

    # Averaging the transposed matrix along axis 0 collapses it to one mean
    # value per MFCC coefficient (this is an average, not a rescaling)
    return np.mean(mfcc_matrix.T, axis=0)


Creating a dataset using the extracted MFCC samples

In [19]:
def preprocess_and_create_dataset(root_folder_path="EnterPathOfFileWithAudioNotes/"):
    """Build a DataFrame of MFCC feature vectors and digit labels.

    Every combination of digit (0-9), speaker and sample index (0-49) is
    mapped to ``<digit>_<speaker>_<index>.wav`` inside ``root_folder_path``
    and passed to ``extract_features``.

    Args:
        root_folder_path: Folder that directly contains all ``.wav`` files.
            Defaults to the placeholder path used by this notebook.

    Returns:
        A ``pandas.DataFrame`` with a ``features`` column (one feature vector
        per recording) and a ``class`` column holding the digit as a string.
    """
    speakers = ['yweweler', 'theo', 'nicolas', 'lucas', 'jackson', 'george']

    # One [features, label] row per recording
    dataset = []

    # All recordings live in one flat folder; the digit, speaker and sample
    # index are encoded in the file name
    for digit in tqdm(range(10), colour='green'):
        for random_name in speakers:
            for sample in tqdm(range(0, 50), colour='blue'):
                file_path = os.path.join(root_folder_path, f"{digit}_{random_name}_{sample}.wav")

                extracted_features = extract_features(file_path)

                # The label is stored as a string; later cells convert it to int
                dataset.append([extracted_features, str(digit)])

    print("Extracted Features and Created Dataset Successfully !!")
    return pd.DataFrame(dataset, columns=['features', 'class'])

Creating a dataset by calling the function

In [None]:
# Extract features for every recording and collect them in a DataFrame
dataset = preprocess_and_create_dataset()

Checking the first 10 elements of the dataset

In [None]:
# Preview the first 10 rows (feature vector and class label)
dataset.head(10)

Check the frequency of each digit class in the dataset

In [None]:
# Turn the class labels into integers
dataset['class'] = dataset['class'].map(int)
# Count how many recordings belong to each digit class
dataset['class'].value_counts()

A function that returns the full MFCC matrix of a signal, without averaging it over time

In [23]:
def extract_features_without_scaling(audio_data, sample_rate):
    """Return the MFCC matrix of an already-loaded audio signal.

    Unlike ``extract_features``, the result of ``librosa.feature.mfcc`` is
    handed back as-is, with no averaging or reshaping applied.

    Args:
        audio_data: Audio samples, passed to librosa as ``y``.
        sample_rate: Sample rate of ``audio_data``, passed to librosa as ``sr``.
    """
    return librosa.feature.mfcc(
        y=audio_data,
        sr=sample_rate,
        n_mfcc=40,
        n_fft=2048,
        hop_length=512,
    )



Creating subplots for the MFCC data

In [None]:
# Draw the MFCCs of one random recording per digit on a 5x2 grid of subplots
fig, ax = plt.subplots(5, 2, figsize=(15, 30))

# ax.flat walks the grid row by row (left to right, top to bottom), which
# removes the need for manual row/column bookkeeping
for digit, axis in zip(range(10), ax.flat):
    # Get a random recording of this digit
    audio_data, sample_rate = get_audio_raw(digit)

    # Extract its MFCC matrix
    mfcc = extract_features_without_scaling(audio_data, sample_rate)
    print(f"Shape of MFCC of audio digit {digit} ---> ",mfcc.shape)

    # Use the recording's own sample rate (not a hard-coded 22050) and ask
    # for a time axis so the "Time" label matches real time coordinates
    axis.set_title(f"MFCC of audio class {digit} across time")
    librosa.display.specshow(mfcc, sr=sample_rate, x_axis='time', ax=axis)

    # Set x-label and y-label
    axis.set_xlabel("Time")
    axis.set_ylabel("MFCC Coefficients")

plt.tight_layout(pad=3)
plt.show()

Perform train test split on the data

In [25]:
# Separate the feature vectors (X) from their class labels (Y)
X = np.array(dataset['features'].tolist())
Y = np.array(dataset['class'].tolist())

# Keep 75% of the samples for training and hold out the rest for testing;
# the fixed random_state makes the shuffled split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.75,
    shuffle=True,
    random_state=8,
)

In [None]:
# Check the shape of the training features
X_train.shape

In [27]:
# Import create an ANN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# To create a checkpoint and save the best model
from tensorflow.keras.callbacks import ModelCheckpoint

# To load the model
from tensorflow.keras.models import load_model

# To check the metrics of the model
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.preprocessing import LabelBinarizer

In [28]:
# Fully connected classifier: a 40-value feature vector goes through three
# hidden layers of 100 ReLU units each, then a 10-way softmax (one unit per digit)
model = Sequential([
    Dense(100, input_shape=(40,), activation='relu'),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(10, activation='softmax'),
])

In [None]:
# Print a summary of the model's layers
model.summary()

In [30]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss
# and accuracy as the reported metric
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

Model Training (no checkpoint callback is used in this cell)

In [None]:
# Training hyper-parameters
num_epochs = 100
batch_size = 32

# Train the network; the test split is passed as validation data, so its
# loss and accuracy are reported after every epoch
model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(X_test, Y_test),
    verbose=1,
)

In [None]:
# Predict class probabilities for the test set and keep, for every sample,
# the index of the most probable class (vectorised instead of a Python loop)
Y_pred = np.argmax(model.predict(X_test), axis=1)

In [None]:
# Print the classification report comparing test labels with predictions
print(classification_report(Y_test,Y_pred))

In [None]:
# Set style as dark
sns.set_style("dark")
# Set figure size
plt.figure(figsize=(15, 8))

# Plot the title
plt.title("CONFUSION MATRIX FOR MNIST AUDIO PREDICTION")
# Confusion matrix: rows are the true labels, columns the predicted labels
cm = confusion_matrix([int(x) for x in Y_test], Y_pred)
# Plot confusion matrix as heatmap
sns.heatmap(cm, annot=True, cmap="PuRd", fmt='g', cbar=False)
# The heatmap draws matrix columns along x and rows along y, so x shows the
# predictions and y the actual values (the labels were previously swapped)
plt.xlabel("PREDICTED VALUES")
plt.ylabel("ACTUAL VALUES")

# Show the figure
plt.show()

In [None]:
# Classify one randomly chosen recording with the trained model and show it.
# Relies on extract_features and model being defined by earlier cells.

# Get a random digit (0-9)
random_digit = np.random.randint(0, 10)

# Get a random speaker name from the list
name_list = ['yweweler', 'theo', 'nicolas', 'lucas', 'jackson', 'george']
random_name = random.choice(name_list)

# Get a random sample index (0-49; randint's upper bound is exclusive)
random_sample = np.random.randint(0, 50)

# Construct the file path
file_path = os.path.join("EnterFilePath/", f"{random_digit}_{random_name}_{random_sample}.wav")

# Load the very file that is classified below, so the plotted waveform
# belongs to the prediction (get_audio_raw would pick another random file)
audio_data, sample_rate = librosa.load(file_path)

# Extract features from the same audio file
extracted_features_rand = extract_features(file_path)

# Reshape the features into a batch of one sample, as the model expects
reshaped_features = extracted_features_rand.reshape(1, -1)

# Make predictions using the trained model
prediction = model.predict(reshaped_features)
predicted_digit = np.argmax(prediction)

# Display the original digit and the predicted digit
print(f"Original Digit: {random_digit}")
print(f"Predicted Digit: {predicted_digit}")

# Plot the audio wave
dsp.waveshow(audio_data, sr=sample_rate)
plt.show()
