## Install the dependencies


In [None]:
!pip install pyunpack
!pip install patool
!pip install py7zr
!pip install sounddevice
!pip install noisereduce
!pip install librosa
! pip install python_speech_features
! pip install tensorflow==2.4
! pip install malaya_speech
! pip install webrtcvad

In [None]:
pip install py7zr

In [None]:
#pip update huggingface_hub
# Upgrade the Hugging Face transformers package to the newest released version.
# NOTE(review): transformers is not imported anywhere in the visible notebook — confirm it is needed.
! pip install --upgrade transformers

## **importing libraries**

In [None]:
pip install git+https://github.com/huggingface/transformers

In [None]:
import numpy as np                        # linear algebra library
import pandas as pd                       # data frames processing
import matplotlib.pyplot as plt          # visualization library
import seaborn as sn                      # visualization library


# audio processing library
import librosa                          
import IPython.display as ipd            
from scipy.io import wavfile
import noisereduce as nr
from malaya_speech import Pipeline
import malaya_speech
from python_speech_features import mfcc
from sklearn.preprocessing import LabelEncoder


# unpacking the dataset
from py7zr import unpack_7zarchive    

#operating system libraries
import shutil
import warnings
warnings.filterwarnings("ignore")
import tensorflow 
import os

In [None]:
# Unpack the training archive.
# shutil has no built-in .7z support, so py7zr's unpack function is registered
# as the handler. shutil raises RegistryError if the same format is registered
# twice, so the registration is skipped when this cell is re-run.
registered_formats = {fmt[0] for fmt in shutil.get_unpack_formats()}
if '7zip' not in registered_formats:
    shutil.register_unpack_format('7zip', ['.7z'], unpack_7zarchive)
shutil.unpack_archive('/kaggle/input/tensorflow-speech-recognition-challenge/train.7z', '/kaggle/working/tensorflow-speech-recognition-challenge/train/')

# <center> Implementing the Speech Recognition Model in Python </center>
**Dataset used for our Speech Recognition Project**
    
The data used here is a set of 10 spoken numbers, each repeated about 2000 times with different accents and different background conditions. TensorFlow recently released the Speech Commands Datasets. It includes 65,000 one-second-long utterances of 30 short words, spoken by thousands of different people. We’ll build a speech recognition system that understands simple spoken commands. <br>    
    
__You can download the dataset from__ [here](https://www.kaggle.com/c/tensorflow-speech-recognition-challenge).


**Data Exploration and Visualization**

Data exploration and visualization help us understand the data, as well as the pre-processing steps, in a better way. 



In [None]:
# Directory that holds one sub-folder of .wav files per spoken word
# (it sits inside the location the archive was unpacked to).
train_audio_path = '/kaggle/working/tensorflow-speech-recognition-challenge/train/train/audio/' #path of the training data

**Accessing a sample file in data**

In [None]:
# Load one example recording (resampled to 16 kHz) and plot its raw waveform.
sample_file = 'on/5a3712c9_nohash_1.wav'
samples, sample_rate = librosa.load(train_audio_path + sample_file, sr = 16000)  # loading a sample to be explored carefully
fig = plt.figure(figsize=(14, 8))
ax1 = fig.add_subplot(211)
ax1.set_title('Raw wave of ' + sample_file)   # title names the file that was actually loaded
ax1.set_xlabel('time')
ax1.set_ylabel('Amplitude')
# Time axis in seconds: one point per sample, spanning the clip duration.
# This works for clips of any length, not only exactly one second.
ax1.plot(np.linspace(0, len(samples) / sample_rate, len(samples)), samples)

In [None]:
# Sample rate (Hz) shared by the exploration cells; sr is an alias used further down.
fs=16000
sr=fs
print("Sample Rate:",fs)
# The player must be the last expression of the cell, otherwise Jupyter does not render it.
ipd.Audio(samples, rate=fs)  #listen to audio file before noise reduction

In [None]:
# Noise Reduction
# Apply stationary noise reduction to the example clip and plot the result.
reduced_noise1 = nr.reduce_noise(y=samples, sr=fs,stationary=True)
# Time stamp of every sample in seconds (sample index / sample rate).
time = np.arange(len(reduced_noise1)) / fs
plt.plot(time, reduced_noise1)  # plot in seconds
plt.xlabel("Time [seconds]")
plt.ylabel("Voice amplitude")
plt.show()

In [None]:
# Audio player for the noise-reduced clip produced by nr.reduce_noise above.
ipd.Audio(reduced_noise1, rate=sample_rate)  #listen to audio file after noise reduction

In [None]:
#Silence Removal
# Use a WebRTC voice-activity detector to keep only the voiced frames of the clip.
vad = malaya_speech.vad.webrtc()
y=reduced_noise1
# Integer copy of the signal at 16 kHz; this copy is what the VAD is called on.
y_= malaya_speech.resample(y, sr, 16000)
y_ = malaya_speech.astype.float_to_int(y_)
# Split both versions into frames (30 is presumably the frame length in ms — confirm in malaya_speech docs).
frames = malaya_speech.generator.frames(y, 30, sr)
frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
# Pair every float frame with the VAD decision made on the matching integer frame.
frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
# Join the frames again, leaving out the ones flagged as silent.
y_ = malaya_speech.combine.without_silent(frames_webrtc)

In [None]:
ipd.Audio(y_, rate = sr )     #listen to the audio file after silence removal

In [None]:
# padding signal with zeros
# Bring the silence-free clip to exactly one second (sr samples): anything
# longer is trimmed first, so the pad length can never become negative.
target_len = 1*sr
y_trimmed = y_[:target_len]
zero = np.zeros(target_len - y_trimmed.shape[0])
signal = np.concatenate((y_trimmed,zero))
# Time stamp of every sample in seconds.
time = np.arange(len(signal)) / fs
signal.shape

**Visualization of Audio signal in time series domain**

Now, we’ll visualize the audio signal in the time series domain:

In [None]:
# Waveform of the cleaned, zero-padded clip against time.
plt.plot(time,signal)
plt.xlabel("Time [seconds]")
plt.ylabel("Voice amplitude")
plt.title("Signal after noise reduction, silence removal and padding")
plt.show()

In [None]:
# Extracting labels to determine classes: every sub-directory is one label.
# os.listdir gives no ordering guarantee and also returns plain files, so the
# entries are filtered to directories and sorted for a reproducible order.
labels = sorted(
    entry for entry in os.listdir(train_audio_path)
    if os.path.isdir(os.path.join(train_audio_path, entry))
)

## **Distribution of the Data set**

In [None]:
# Count the .wav recordings stored under each label folder.
no_of_recordings = [
    sum(1 for fname in os.listdir(train_audio_path + '/' + label) if fname.endswith('.wav'))
    for label in labels
]

# Bar chart with one bar per label.
index = np.arange(len(labels))
fig, ax = plt.subplots(figsize=(30, 5))
ax.bar(index, no_of_recordings)
ax.set_xlabel('Commands', fontsize=12)
ax.set_ylabel('No of recordings', fontsize=12)
ax.set_xticks(index)
ax.set_xticklabels(labels, fontsize=15, rotation=60)
ax.set_title('No. of recordings for each command')
plt.show()

## Words used as Classes for data set

In [None]:
# Restrict the classes to the ten spoken digits; this replaces the folder-derived list.
labels=["zero","one","two","three","four","five","six","seven","eight","nine"]

## **Preprocessing the audio waves**

Let us read the audio waves and apply the following preprocessing steps:

* Noise Reduction
* Silence Removal
* Extracting MFCCs


In [None]:
# Preprocess every recording of the selected labels:
# noise reduction -> silence removal -> trim/zero-pad to a fixed length.
sr = 16000                    # sample rate (Hz)
target_len = 1*sr + 4000      # fixed number of samples stored per recording
vad = malaya_speech.vad.webrtc()
all_wave = []     # processed waveforms of the whole data set
all_label = []    # label of each processed waveform, in the same order
for label in labels:
    print(label)
    label_dir = os.path.join(train_audio_path, label)
    waves = [f for f in os.listdir(label_dir) if f.endswith('.wav')]   # every .wav file of this label
    for wav in waves:
        samples, sample_rate = librosa.load(os.path.join(label_dir, wav), sr = sr)
        samples = nr.reduce_noise(y=samples, sr=sr,stationary=True)  # noise reduction
        # Silence removal: the VAD is called on an integer copy of the signal,
        # and its decisions select which float frames are kept.
        y_ = malaya_speech.resample(samples, sr, 16000)
        y_ = malaya_speech.astype.float_to_int(y_)
        frames = malaya_speech.generator.frames(samples, 30, sr)
        frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
        frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
        y_ = malaya_speech.combine.without_silent(frames_webrtc)
        # Trim before padding so a clip longer than target_len cannot produce
        # a negative pad size (np.zeros would raise ValueError).
        y_ = y_[:target_len]
        zero = np.zeros(target_len - y_.shape[0])
        signal = np.concatenate((y_,zero))     # every stored vector has target_len samples
        all_wave.append(signal)     # append waves one by one
        all_label.append(label)     # append corresponding label one by one

In [None]:
print("shape of waves array",np.array(all_wave).shape)
print("shape of labels array",np.array(all_label).shape)

# Inspecting one fixed sample. The lists are indexed directly so the full
# waveform array is not rebuilt for every access.
inspect_idx = 2000
inspected_wave = all_wave[inspect_idx]
# Time stamp of every sample in seconds, derived from the inspected clip itself.
time = np.arange(len(inspected_wave)) / sr
plt.plot(time,inspected_wave)
print(all_label[inspect_idx])
ipd.Audio(inspected_wave, rate = sr )

## Extracting MFCCs

In [None]:
# Compute MFCC features for every preprocessed waveform.
# Window length is 256 samples (256/fs seconds) with a hop of 128 samples;
# 13 cepstral coefficients are computed from 26 filters with a 256-point FFT.
all_mfcc=[]    # MFCC matrices of the whole data set
for wave in all_wave:
    mfcc_feat = mfcc(wave , fs, winlen=256/fs, winstep=256/(2*fs), numcep=13, nfilt=26, nfft=256,
                 lowfreq=0, highfreq=fs/2, preemph=0.97, ceplifter=22, appendEnergy=True, winfunc=np.hamming)
    # Swap the two axes of the feature matrix before storing it.
    all_mfcc.append(np.transpose(mfcc_feat))

In [None]:
# Stack the MFCC list once and reuse its shape instead of rebuilding the array per access.
mfcc_shape = np.array(all_mfcc).shape
print("shape of MFCCs",mfcc_shape)
print("shape of Corresponding lables",np.array(all_label).shape)

# dimensions of the data: d is the length of one flattened MFCC matrix
d1=mfcc_shape[1]
d2=mfcc_shape[2]
d=d1*d2

In [None]:
# Stack the per-recording MFCC matrices and flatten each one into a single
# feature row, giving a 2-D array with one row per recording.
op_mfcc = np.array(all_mfcc)
op_mfcc = op_mfcc.reshape(op_mfcc.shape[0], -1)
op_mfcc.shape

## Label encoding (integer class ids)

In [None]:
# Map each label string to an integer class id.
le = LabelEncoder()
y=le.fit_transform(all_label)
# Class names as stored by the encoder; position i is the name of class id i.
classes= list(le.classes_)

## **Model based on ANN** 

In [None]:
# NOTE(review): the first install cell pins tensorflow==2.4 and tensorflow is
# already imported above, so an upgrade here presumably needs a kernel restart
# to take effect — confirm which TensorFlow version is actually intended.
! pip install --upgrade tensorflow
! pip install --upgrade tensorflow-gpu
# NOTE(review): verify that keras==2.3.1 is compatible with the upgraded TensorFlow.
! pip install keras==2.3.1

In [None]:
# Importing model libraries from keras
from keras.optimizers import SGD
from keras.constraints import maxnorm
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense,Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split


In [None]:
# One-hot encode the integer class ids.
# The ids are taken from the encoder again instead of from y, so re-running
# this cell does not one-hot encode an already one-hot array. num_classes
# uses len(classes), the same width as the model's output layer.
y=tensorflow.keras.utils.to_categorical(le.transform(all_label), num_classes=len(classes), dtype='float32')
y.shape

In [None]:
# Hold out 20% of the data for validation, shuffled with a fixed seed and
# stratified on the label rows.
x_tr, x_val, y_tr, y_val = train_test_split(
    op_mfcc,
    np.array(y),
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=777,
)

In [None]:
# Report the shape of every array produced by the split.
for caption, split_array in (
    ("shape of training samples", x_tr),
    ("shape of training labels", y_tr),
    ("shape of test samples", x_val),
    ("shape of test labels", y_val),
):
    print(caption, split_array.shape)

### **Model Architecture**

In [None]:
# Model architecture: two hidden dense layers (100 and 80 units, ReLU), each
# followed by 50% dropout, then a softmax layer with one unit per class.
# Every dense kernel carries a max-norm constraint of 3.
model = Sequential([
    Dense(100, activation='relu', input_shape=(d,), kernel_constraint=maxnorm(3)),
    Dropout(0.5),
    Dense(80, activation='relu', kernel_constraint=maxnorm(3)),
    Dropout(0.5),
    Dense(len(classes), activation='softmax' , kernel_constraint=maxnorm(3)),
])

In [None]:
# Draw the layer graph of the model, including tensor shapes, into model.png.
# NOTE(review): plot_model presumably needs pydot and graphviz installed — confirm in this environment.
tensorflow.keras.utils.plot_model(model, 'model.png',show_shapes=True)

In [None]:
# Categorical cross-entropy matches the one-hot targets and softmax output;
# accuracy is tracked as an additional metric.
model.compile(loss='categorical_crossentropy',optimizer='adamax',metrics=['accuracy'])


In [None]:
# Training callbacks, built from tensorflow.keras so they match the tf.keras model.
# es: stop when val_loss has not improved by at least 1e-4 for 10 epochs.
es = tensorflow.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, min_delta=0.0001)
# mc: keep the weights with the best validation accuracy. The model is compiled
# with metrics=['accuracy'], so the logged key is 'val_accuracy', not 'val_acc'.
mc = tensorflow.keras.callbacks.ModelCheckpoint('best_model.hdf5', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

In [None]:
# Train for up to 300 epochs. The early-stopping and checkpoint callbacks
# defined in the previous cell are passed in so they actually take effect.
history=model.fit(x_tr, y_tr,validation_data=(x_val,y_val), epochs=300, batch_size=65, callbacks=[es, mc])

### Model Evaluation

In [None]:
# Evaluate on the training split; with the compile settings used above the
# result is [loss, accuracy].
train_score = model.evaluate(x_tr, y_tr, batch_size=12)
print(train_score)

print('----------------Training Complete-----------------')

# Evaluate on the held-out validation split (named "test" in this notebook).
test_score = model.evaluate(x_val, y_val, batch_size = 12)
print(test_score)

In [None]:
# List the metric names recorded during training; these are the keys used by the plots below.
history.history.keys()

In [None]:
# Loss learning curves: training loss and validation loss per epoch.
loss_curves = (
    ('train', history.history['loss']),
    ('test', history.history['val_loss']),
)
for curve_label, curve_values in loss_curves:
    plt.plot(curve_values, label=curve_label)
plt.legend()
plt.title("losses learning curves")
plt.show()

In [None]:
# Accuracy learning curves of the training and validation sets.
# The title is set once (the earlier 'model accuracy' title was always
# overwritten) and each line carries its own legend label.
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='val')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.title("Accuracy learning curves")
plt.show()

### **confusion matrix**

In [None]:
# Predicted class probabilities for the validation split.
y_predict=model.predict(x_val)
# argmax turns one-hot targets and probability rows into class ids;
# the first argument holds the true labels, the second the predictions.
conf_mat=tensorflow.math.confusion_matrix(np.argmax(y_val,axis=1) , np.argmax(y_predict,axis=1))

In [None]:
# Confusion matrix as a labelled heat map (rows: true class, columns: predicted class).
df_cm = pd.DataFrame(np.array(conf_mat), index=list(classes), columns=list(classes))
plt.figure(figsize = (13,7))
# fmt='d' prints the integer counts in full; the default format would show
# counts of 100 or more in scientific notation.
ax = sn.heatmap(df_cm, annot=True, fmt='d')
plt.title("Confusion Matrix", fontsize=20)
plt.ylabel("True Class"     , fontsize=20)
plt.xlabel("Predicted Class", fontsize=20)
plt.show()

### Test random samples to check model performance

In [None]:
# Per-class precision / recall / F1 on the validation split.
# The cell previously used names that are never defined in this notebook
# (X_test, y_test, predictions, to_categorical); it now uses x_val / y_val.
from sklearn.metrics import classification_report
val_true_ids = np.argmax(y_val, axis=1)                  # one-hot rows -> class ids
val_pred_ids = np.argmax(model.predict(x_val), axis=1)   # probabilities -> class ids
# labels= pins the id order so target_names lines up with `classes`.
print(classification_report(val_true_ids, val_pred_ids,
                            labels=np.arange(len(classes)), target_names=classes))

In [None]:
# NOTE(review): this shape is not displayed because it is not the last expression of the cell.
x_val[1].shape
# Predict class probabilities for one validation row, reshaped into a batch of one.
model.predict(x_val[1].reshape((1,d)))

Define the function that predicts text for the given audio:

In [None]:
def predict(audio):
    """Return the class name the model predicts for one feature vector.

    Parameters
    ----------
    audio : array
        Features passed unchanged to ``model.predict``; the notebook calls
        this with a row reshaped to ``(1, d)``.

    Returns
    -------
    The entry of ``classes`` at the index of the highest probability in the
    first prediction row.
    """
    # Report the shape of the argument itself rather than of the unrelated
    # notebook-level variable `samples`.
    print(audio.shape)
    prob=model.predict(audio)
    index=np.argmax(prob[0])
    return classes[index]

Prediction time! Make predictions on the validation data:

In [None]:
import random
# Pick a random row of the validation set (unseeded, so it differs on every run).
index=random.randint(0,len(x_val)-1)
print(index)
# NOTE: this overwrites the notebook-level `samples` with a flattened MFCC row.
samples=x_val[index]
# Ground-truth class name of the chosen row.
print("Audio:",classes[np.argmax(y_val[index])])
# The line below stays disabled: `index` refers to a row of the shuffled
# validation split, not to a position in all_wave.
#ipd.Audio(np.array(all_wave)[index,:], rate=16000)

In [None]:
# Predicted class name for the row chosen in the previous cell, reshaped into a batch of one.
print("Text:",predict(samples.reshape(1,d)))

In [None]:
# NOTE(review): load_model is imported but not used in this cell.
from keras.models import load_model
model.save("SR_MODEL.h5")    #save the trained model to SR_MODEL.h5 so it can be loaded and tested locally.