In [1]:
import matplotlib.pyplot as plt
import numpy as np
import librosa
import librosa.display
import IPython.display as ipd
import os 

  
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from tqdm import tqdm

Using TensorFlow backend.


In [2]:
root_dir=os.listdir("E:/speech_recognition/data/")

In [3]:
mfcc=[]
for i in root_dir:
    audio_paths=f'E:/speech_recognition/data/{i}'
    print(audio_paths)
    
    # load audio file and slice it to ensure length consistency among different files
    signal,sample_rate = librosa.load(audio_paths)
    
    # extract MFCCs
    MFCCs = librosa.feature.mfcc(signal, sample_rate)
    mfcc.append( MFCCs)                                           

E:/speech_recognition/data/audio_1.wav
E:/speech_recognition/data/audio_2.wav
E:/speech_recognition/data/audio_3.wav


In [4]:
print(mfcc[2].shape)

(20, 171)


In [5]:
def plot_mfcc_feature(vis_mfcc_feature):
    # plot the MFCC feature
    fig = plt.figure(figsize=(12,5))
    ax = fig.add_subplot(111)
    im = ax.imshow(vis_mfcc_feature, cmap=plt.cm.jet, aspect='auto')
    plt.title('Normalized MFCC')
    plt.ylabel('Time')
    plt.xlabel('MFCC Coefficient')
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    ax.set_xticks(np.arange(0, 13, 2), minor=False);
    plt.show()

In [6]:
#plot_mfcc_feature(mfcc)

In [7]:
#labels=["আমি এসএসএল ওয়ারলেসে জব",
        #"আমি ডাটা টিমের সদস্য",
        #"আমাদের ডেটা টিমে দুইজন জমজ ভাই আছে"]


In [8]:
def get_labels():
    labels =["আমি এসএসএল ওয়ারলেসে জব",
            "আমি ডাটা টিমের সদস্য",
            "আমাদের ডেটা টিমে দুইজন জমজ ভাই আছে"] 
    #labels=os.listdir(path)
    print(len(labels))
    label_indices = np.arange(0, len(labels))
    print(label_indices)
    return labels, label_indices, to_categorical(label_indices)

In [9]:
#data_path="E:/speech_recognition/data/"
labels,label_indices,_=get_labels()
print(labels)

3
[0 1 2]
['আমি এসএসএল ওয়ারলেসে জব', 'আমি ডাটা টিমের সদস্য', 'আমাদের ডেটা টিমে দুইজন জমজ ভাই আছে']


In [10]:
# Mel-frequency cepstral coefficients
def wav2mfcc(file_path,max_len, n_mfcc):
    wave, sr = librosa.load(file_path, mono=True, sr=None)
    wave = np.asfortranarray(wave[::3])
    mfcc = librosa.feature.mfcc(wave, sr=16000, n_mfcc=n_mfcc)

    # If maximum length exceeds mfcc lengths then pad the remaining ones
    if (max_len > mfcc.shape[1]):
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')

    # Else cutoff the remaining parts
    else:
        mfcc = mfcc[:, :max_len]
    
    return mfcc

In [11]:

def save_data_to_array(path, max_len, n_mfcc):
    #labels,_, _ = get_labels()

    #for label in labels:
        # Init mfcc vectors
    mfcc_vectors = []
   
    for wav in path:
        wavfile=f'E:/speech_recognition/data/{wav}'
        print(wavfile)
        mfcc = wav2mfcc(wavfile, max_len, n_mfcc)
        mfcc_vectors.append(mfcc)
    np.save('E:/speech_recognition/wav', mfcc_vectors)
    return mfcc_vectors

In [12]:
mfcc_vec=save_data_to_array(root_dir,200,20)
print(mfcc_vec[1].shape)

E:/speech_recognition/data/audio_1.wav
E:/speech_recognition/data/audio_2.wav
E:/speech_recognition/data/audio_3.wav
(20, 200)


In [13]:
x= np.load('E:/speech_recognition/wav.npy')
print(len(x))

3


In [14]:
# Getting first arrays
#X = np.load(labels[0] + '.npy')
#y = np.zeros(X.shape[0])


In [15]:
def get_train_test(split_ratio=0.6, random_state=42):
    # Get available labels
    labels, indices, _ = get_labels()

    # Getting first arrays
    X = np.load('E:/speech_recognition/wav.npy')
    y = np.zeros(X.shape[0])
    #print(y)
    # Append all of the dataset into one single array, same goes for y
    for i, label in enumerate(labels[1:]):
        x = np.load('E:/speech_recognition/wav.npy')
        X = np.vstack((X, x))
        y = np.append(y, np.full(x.shape[0], fill_value= (i + 1)))
        print(y)
    assert X.shape[0] == len(y)

    return train_test_split(X, y, test_size=0.2,shuffle=True)

In [16]:
#get_train_test(split_ratio=0.6, random_state=42)

In [17]:
X_train, X_test, y_train, y_test =train_test_split(x,label_indices,test_size=0.1,
                                                   random_state=42)

In [18]:
X_train.shape

(2, 20, 200)

In [19]:
# # Feature dimension
channels = 1
max_len = 200
buckets = 20
epochs = 48
batch_size = 100

num_classes = 3

#X_train = X_train.reshape(X_train.shape[0],buckets, max_len, channels)
#X_test = X_test.reshape(X_test.shape[0],buckets,max_len, channels)

In [20]:
print(X_train.shape,X_test.shape)
#plt.imshow(X_train[:, :, :, 0])

(2, 20, 200) (1, 20, 200)


In [21]:
print(y_train,y_test)

[1 2] [0]


In [22]:
print(y_train.shape,y_test.shape)

(2,) (1,)


In [23]:
y_train_hot = to_categorical(y_train)
y_test_hot = to_categorical(y_test)
print(y_train_hot,y_test_hot)

[[0. 1. 0.]
 [0. 0. 1.]] [[1.]]


In [24]:
y_train_hot.shape

(2, 3)

In [25]:
y_test_hot.shape

(1, 1)

In [26]:
#from preprocess import *
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, LSTM
from keras.utils import to_categorical
#import wandb
#from wandb.keras import WandbCallback
import matplotlib.pyplot as plt

In [27]:
num_classes=3

#build a simple cnn model

model = Sequential()
model.add(Flatten(input_shape=(buckets,max_len)))
#model.add(Flatten())
model.add(Dense(3, activation='softmax'))
model.compile(loss="sparse_categorical_crossentropy",
                  optimizer="adam",
                  metrics=['accuracy'])

In [28]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 12003     
Total params: 12,003
Trainable params: 12,003
Non-trainable params: 0
_________________________________________________________________


In [36]:
print(X_train.shape[0])
X_train = X_train.reshape(X_train.shape[0],buckets, max_len)
X_test = X_test.reshape(X_test.shape[0],buckets,max_len)

2


In [30]:
X_train.shape

(2, 20, 200)

In [31]:
X_test.shape

(1, 20, 200)

In [32]:
model.fit(X_train, y_train, epochs=epochs, 
          validation_data=(X_test,y_test),
         )

Train on 2 samples, validate on 1 samples
Epoch 1/48
Epoch 2/48
Epoch 3/48
Epoch 4/48
Epoch 5/48
Epoch 6/48
Epoch 7/48
Epoch 8/48
Epoch 9/48
Epoch 10/48
Epoch 11/48
Epoch 12/48
Epoch 13/48
Epoch 14/48
Epoch 15/48
Epoch 16/48
Epoch 17/48
Epoch 18/48
Epoch 19/48
Epoch 20/48
Epoch 21/48
Epoch 22/48
Epoch 23/48
Epoch 24/48
Epoch 25/48
Epoch 26/48
Epoch 27/48
Epoch 28/48
Epoch 29/48
Epoch 30/48
Epoch 31/48
Epoch 32/48
Epoch 33/48
Epoch 34/48
Epoch 35/48
Epoch 36/48
Epoch 37/48
Epoch 38/48
Epoch 39/48
Epoch 40/48
Epoch 41/48
Epoch 42/48
Epoch 43/48
Epoch 44/48
Epoch 45/48
Epoch 46/48
Epoch 47/48
Epoch 48/48


<keras.callbacks.callbacks.History at 0x276eb4f1b88>

In [33]:
# make a prediction
import cv2
from numpy import zeros, newaxis
#print(mfcc_vec[1].shape)


#.........take a random voice..............
mfcc_1=mfcc_vec[1][newaxis,:, :,]
print(mfcc_1.shape)
predict = model.predict_classes(mfcc_1)
print(predict)

(1, 20, 200)
[1]


In [34]:
if predict==[0]:
    print("আমি এসএসএল ওয়ারলেসে জব")
elif predict==[1]:
    print("আমি ডাটা টিমের সদস্য")
else:
    print("আমাদের ডেটা টিমে দুইজন জমজ ভাই আছে")

আমি ডাটা টিমের সদস্য


In [35]:
x=[ 5,40, 51, 81, 12, 46 ,12]
print(len(x[:2]))

2
