# Spoken Digits Classification

## Install the necessary library

In [1]:
pip install librosa

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Import the Library

In [2]:
import librosa

In [3]:
# Load librosa's 'libri1' example recording (fetched on first use).
example_path = librosa.ex('libri1')
y, sr = librosa.load(example_path)

# Left as the last expression so the MFCC matrix is displayed as the cell output.
librosa.feature.mfcc(y=y, sr=sr)

Downloading file '5703-47212-0000.ogg' from 'https://librosa.org/data/audio/5703-47212-0000.ogg' to '/root/.cache/librosa'.


array([[-565.9195   , -564.28815  , -562.8431   , ..., -437.97177  ,
        -426.48358  , -434.66782  ],
       [  10.304619 ,   12.508708 ,   14.130634 , ...,   89.98185  ,
          88.43013  ,   90.120255 ],
       [   9.748489 ,   11.672071 ,   12.139027 , ...,  -10.730866 ,
         -10.773367 ,   -5.2245684],
       ...,
       [   3.0373316,    2.2702916,   -1.3416169, ...,   -9.08409  ,
          -6.01392  ,   -9.021893 ],
       [   2.8072867,    2.0679104,   -1.9095677, ...,   -6.449067 ,
          -6.7254505,   -5.1586237],
       [   2.8223813,    2.2442045,   -1.6846415, ...,   -8.387408 ,
          -6.1984453,   -6.176938 ]], dtype=float32)

## Import other libraries

In [23]:
import numpy as np

import os

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings

# Suppress every warning from this point on.
# NOTE(review): a blanket 'ignore' also hides warnings raised by the libraries
# used below; consider filtering specific categories instead.
warnings.filterwarnings('ignore')


## Unzip the file

In [4]:
!unzip spoken_digits.zip

Archive:  spoken_digits.zip
   creating: spoken_digits/
  inflating: spoken_digits/0_jackson_0.wav  
  inflating: spoken_digits/0_jackson_1.wav  
  inflating: spoken_digits/0_jackson_10.wav  
  inflating: spoken_digits/0_jackson_11.wav  
  inflating: spoken_digits/0_jackson_12.wav  
  inflating: spoken_digits/0_jackson_13.wav  
  inflating: spoken_digits/0_jackson_14.wav  
  inflating: spoken_digits/0_jackson_15.wav  
  inflating: spoken_digits/0_jackson_16.wav  
  inflating: spoken_digits/0_jackson_17.wav  
  inflating: spoken_digits/0_jackson_18.wav  
  inflating: spoken_digits/0_jackson_19.wav  
  inflating: spoken_digits/0_jackson_2.wav  
  inflating: spoken_digits/0_jackson_20.wav  
  inflating: spoken_digits/0_jackson_21.wav  
  inflating: spoken_digits/0_jackson_22.wav  
  inflating: spoken_digits/0_jackson_23.wav  
  inflating: spoken_digits/0_jackson_24.wav  
  inflating: spoken_digits/0_jackson_25.wav  
  inflating: spoken_digits/0_jackson_26.wav  
  inflating: spoken_digits/

## Extract Audio Features with MFCC

In [14]:
# Custom function to extract MFCC features from a given audio file

def features_extractor(file, n_mfcc=40):
  """Return a fixed-length MFCC summary vector for one audio file.

  Parameters
  ----------
  file : path of an audio file that ``librosa.load`` can read.
  n_mfcc : number of MFCC coefficients to compute. Defaults to 40, the
      value that used to be hard-coded here.

  Returns
  -------
  1-D numpy array of length ``n_mfcc`` holding each coefficient averaged
  over all frames of the clip.
  """
  # librosa.load is used with its default settings.
  audio, sample_rate = librosa.load(file)
  # One row per coefficient, one column per frame.
  mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
  # Averaging over frames gives same-sized vectors for clips of any duration.
  mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

  return mfccs_scaled_features


## Build Training/Test Dataset

In [None]:
# files = sorted(os.listdir('./spoken_digits/'))
# files

In [10]:

# Extract the features from all the audio files in the dataset
# Create lists to store all the features and also the target labels

def build_dataset(sound_path='./spoken_digits/', feature_fn=None):
  """Build train/test feature and label lists from a folder of digit clips.

  Parameters
  ----------
  sound_path : directory containing the ``.wav`` files. A trailing slash is
      optional because paths are built with ``os.path.join``.
  feature_fn : callable taking a file path and returning its feature
      vector. Defaults to ``features_extractor``.

  Returns
  -------
  (X_train, y_train, X_test, y_test) as plain Python lists. Files are
  visited in sorted name order; every 5th file (indices 0, 5, 10, ...)
  goes to the test split and the rest go to the training split. The label
  is the first character of the file name converted to ``int``.
  """
  if feature_fn is None:
    feature_fn = features_extractor

  # Keep only audio clips: stray entries (hidden files, notes, ...) would
  # otherwise break the label parsing or the audio loader.
  files = sorted(f for f in os.listdir(sound_path) if f.lower().endswith('.wav'))

  X_train, y_train = [], []
  X_test, y_test = [], []

  for i, f in enumerate(files):
    feature = feature_fn(os.path.join(sound_path, f))
    label = int(f[0])  # file names begin with the spoken digit

    # Deterministic 80/20 split based on position in the sorted listing.
    if i % 5 == 0:
      X_test.append(feature)
      y_test.append(label)
    else:
      X_train.append(feature)
      y_train.append(label)

  return X_train, y_train, X_test, y_test


In [15]:
# Extract features for every file in the default folder and split them into train/test lists.
X_train, y_train, X_test, y_test = build_dataset()

In [16]:
# Number of training samples
len(X_train)

1200

In [20]:
# Number of test samples
len(X_test)

300

In [18]:
# Shape of a single feature vector (one averaged value per MFCC coefficient)
X_train[0].shape

(40,)

In [19]:
# Inspect the training labels.
# NOTE(review): this displays the entire list; a slice such as y_train[:10]
# would keep the output readable.
y_train

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,


## Label Encode the Target Class Labels

In [27]:
# Fit the encoder on the training labels only, then reuse that same mapping
# for the test labels. Refitting on the test split could yield a different
# label -> index mapping whenever the two splits do not contain exactly the
# same set of classes.
labelencoder = LabelEncoder()
en_y_train = labelencoder.fit_transform(y_train)
en_y_test = labelencoder.transform(y_test)

In [25]:
# Encoded training labels
en_y_train

array([0, 0, 0, ..., 9, 9, 9])

In [28]:
# Encoded test labels
en_y_test

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9])

In [29]:
# Tabulate how many training samples fall into each digit class.
unique_vals, val_count = np.unique(y_train, return_counts=True)
train_class_table = np.asarray((unique_vals, val_count)).T  # rows of [label, count]
print(train_class_table)

[[  0 120]
 [  1 120]
 [  2 120]
 [  3 120]
 [  4 120]
 [  5 120]
 [  6 120]
 [  7 120]
 [  8 120]
 [  9 120]]


In [30]:
# Tabulate how many test samples fall into each digit class.
unique_vals, val_count = np.unique(y_test, return_counts=True)
test_class_table = np.asarray((unique_vals, val_count)).T  # rows of [label, count]
print(test_class_table)

[[ 0 30]
 [ 1 30]
 [ 2 30]
 [ 3 30]
 [ 4 30]
 [ 5 30]
 [ 6 30]
 [ 7 30]
 [ 8 30]
 [ 9 30]]


## Build the Logistic Regression Classifier

In [33]:
# Logistic regression classifier on the MFCC feature vectors.
# max_iter is raised to 2000 (presumably to let the solver converge on these
# unscaled features — TODO confirm).
lr = LogisticRegression(max_iter=2000)
lr.fit(X_train, en_y_train)  # supervised learning

## Make Prediction

In [34]:
# Predict on both splits so training and test performance can be compared.
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)


## Check Accuracy

In [35]:
# Accuracy on the same data the model was fitted on
accuracy_score(en_y_train, y_pred_train)

1.0

In [36]:
# Accuracy on the held-out test split
accuracy_score(en_y_test, y_pred_test)

0.9433333333333334

## Confusion Matrix

In [37]:
# Confusion matrix on the test split: rows are true digits, columns are predicted digits.
confusion_matrix(en_y_test, y_pred_test)

array([[30,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 30,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  0, 28,  1,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  2, 27,  0,  0,  0,  1,  0,  0],
       [ 0,  0,  0,  0, 28,  1,  0,  1,  0,  0],
       [ 0,  0,  0,  0,  0, 29,  0,  0,  0,  1],
       [ 0,  0,  1,  2,  0,  0, 24,  1,  2,  0],
       [ 0,  0,  0,  0,  0,  0,  3, 27,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 30,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 30]])

## Classification Report

In [38]:
# Per-class precision, recall and F1 score on the test split
print(classification_report(en_y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98        30
           1       1.00      1.00      1.00        30
           2       0.90      0.93      0.92        30
           3       0.90      0.90      0.90        30
           4       1.00      0.93      0.97        30
           5       0.97      0.97      0.97        30
           6       0.89      0.80      0.84        30
           7       0.90      0.90      0.90        30
           8       0.94      1.00      0.97        30
           9       0.97      1.00      0.98        30

    accuracy                           0.94       300
   macro avg       0.94      0.94      0.94       300
weighted avg       0.94      0.94      0.94       300

