In [1]:
# tutorial: https://www.analyticsvidhya.com/blog/2019/07/learn-build-first-speech-to-text-model-python/
# data: https://www.kaggle.com/c/tensorflow-speech-recognition-challenge

In [2]:
import numpy as np
import librosa
import matplotlib.pyplot as plt
import IPython.display as ipd
import os

from scipy.io import wavfile
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Dropout, Flatten, Conv1D, Conv2D, Input, MaxPooling1D, MaxPooling2D
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.optimizers import Adam

ModuleNotFoundError: No module named 'librosa'

In [None]:
# Root folder of the training clips; each sub-folder is named after one spoken command.
train_audio_path = './data/train/audio/'
sample_file = train_audio_path + 'yes/0a7c2a8d_nohash_0.wav'

# Load one example clip, resampled to 16 kHz by librosa if needed.
samples, sample_rate = librosa.load(sample_file, sr = 16000)
print(type(samples))
print(samples.shape)
print(sample_rate)

fig = plt.figure(figsize=(14, 8))
ax1 = fig.add_subplot(211)
ax1.set_title('Raw wave of ' + sample_file)
ax1.set_xlabel('time')
ax1.set_ylabel('Amplitude')
# Time stamp (in seconds) of every sample: index / sample rate.
# This yields exactly len(samples) points, so x and y always match in length,
# whatever the clip duration is (the previous linspace only matched for 1 s clips).
x = np.arange(len(samples)) / sample_rate
y = samples
ax1.plot(x, y)

In [None]:
# Show the sampling rate, then render an in-notebook audio player for the clip.
print(sample_rate)
ipd.Audio(data=samples, rate=sample_rate)

In [None]:
# Downsample the example clip to 8 kHz and listen to the result.
# The output goes to a new name so that `samples` keeps holding audio at
# `sample_rate`; overwriting it made the cell non-idempotent (a second run
# would resample already-resampled audio while still declaring 8000 Hz).
samples_8k = librosa.resample(samples, orig_sr=sample_rate, target_sr=8000)
ipd.Audio(samples_8k, rate=8000)

In [None]:
# Every sub-folder of the training directory is treated as one label.
# Keep directories only (a stray file such as .DS_Store would make the
# per-label os.listdir below raise) and sort them, because os.listdir
# returns entries in arbitrary order.
# NOTE(review): any folder that is not a spoken command would still be
# counted as a label here - confirm against the dataset layout.
labels = sorted(
    entry for entry in os.listdir(train_audio_path)
    if os.path.isdir(os.path.join(train_audio_path, entry))
)

#find count of each label and plot bar graph
no_of_recordings = []
for label in labels:
    label_dir = os.path.join(train_audio_path, label)
    waves = [f for f in os.listdir(label_dir) if f.endswith('.wav')]
    no_of_recordings.append(len(waves))

#plot
plt.figure(figsize=(30,5))
index = np.arange(len(labels))
plt.bar(index, no_of_recordings)
plt.xlabel('Commands', fontsize=12)
plt.ylabel('No of recordings', fontsize=12)
plt.xticks(index, labels, fontsize=15, rotation=60)
plt.title('No. of recordings for each command')
plt.show()

# labels=["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"]

In [None]:
# Measure the length, in seconds, of every .wav clip under each label folder.
duration_of_recordings = []
for label in labels:
    label_dir = train_audio_path + '/' + label
    wavs = list(filter(lambda name: name.endswith('.wav'), os.listdir(label_dir)))
    for wav in wavs:
        sample_rate, samples = wavfile.read(label_dir + '/' + wav)
        seconds = float(len(samples)/sample_rate)
        duration_of_recordings.append(seconds)

# Histogram over 0.1 s wide bins between 0.4 s and 1.2 s.
duration_bins = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2]
plt.hist(np.array(duration_of_recordings), bins=duration_bins)

In [None]:
# Build the dataset: one 8 kHz waveform per clip plus the folder name as its label.
all_wave = []
all_label = []
for label in labels:
    print(label)
    label_dir = train_audio_path + '/' + label
    waves = [name for name in os.listdir(label_dir) if name.endswith('.wav')]
    for wav in waves:
        samples, sample_rate = librosa.load(label_dir + '/' + wav, sr=16000)
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=8000)
        # Only clips of exactly 8000 samples (1 s at 8 kHz) are kept,
        # so every stored waveform has the same length.
        if len(samples) != 8000:
            continue
        all_wave.append(samples)
        all_label.append(label)

In [None]:
# Map each label string to an integer id; `classes` keeps the id -> name lookup.
le = LabelEncoder()
le.fit(all_label)
y = le.transform(all_label)
classes = [name for name in le.classes_]

In [None]:
# One-hot encode the integer class ids.
# The ids are recomputed from `le` / `all_label` instead of reading `y`, so
# re-running this cell gives the same 2D matrix rather than one-hot encoding
# an already one-hot matrix.
# num_classes follows len(labels) because the network's output layer is
# sized the same way.
y = np_utils.to_categorical(le.transform(all_label), num_classes=len(labels))

In [None]:
# Conv1D expects (samples, steps, channels): add a trailing channel axis of size 1
# to the (n_clips, 8000) waveform matrix.
all_wave = np.reshape(np.array(all_wave), (-1, 8000, 1))

In [None]:
# Hold out 20% of the clips for validation, stratified on the labels,
# with a fixed seed so the split is reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    np.array(all_wave),
    np.array(y),
    test_size=0.2,
    shuffle=True,
    random_state=777,
    stratify=y,
)

In [None]:
# Sanity check: show the shape of the training inputs returned by the train/validation split.
print(x_train.shape)

In [None]:
# Start from a clean Keras graph/session before building the network.
K.clear_session()

# Raw waveform input: 8000 time steps, 1 channel.
input_layer = Input(shape=(8000,1))

# Four convolutional stages, each Conv1D -> MaxPooling1D(3) -> Dropout(0.3).
# Filters double at every stage while the kernel shrinks by two.
conv_stages = [(8, 13), (16, 11), (32, 9), (64, 7)]

x = input_layer
for n_filters, kernel_width in conv_stages:
    x = Conv1D(n_filters, kernel_width, padding='valid', activation='relu', strides=1)(x)
    x = MaxPooling1D(3)(x)
    x = Dropout(0.3)(x)

x = Flatten()(x)

# Two fully connected layers, each followed by dropout.
for n_units in (256, 128):
    x = Dense(n_units, activation='relu')(x)
    x = Dropout(0.3)(x)

# One softmax output per entry in `labels`.
x = Dense(len(labels), activation='softmax')(x)

model = Model(inputs=input_layer, outputs=x)

model.summary()

In [None]:
# Multi-class setup: categorical cross-entropy on one-hot targets, Adam optimizer,
# accuracy reported during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Stop training once val_loss has not improved by at least 1e-4 for 10 epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=1e-4,
    patience=10,
    verbose=1,
)
# Write the model to disk only when val_accuracy reaches a new maximum.
mc = ModelCheckpoint(
    'best_model.hdf5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=0,
)

In [None]:
# Train for at most 100 epochs; the callbacks handle early stopping and checkpointing.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=32,
    epochs=100,
    callbacks=[es, mc],
)

In [None]:
# Training vs. validation loss per epoch, as recorded by model.fit.
plt.plot(history.history['loss'], label='train')
# val_loss is computed on the validation split passed to fit, so the curve is
# labelled 'validation' rather than 'test'.
plt.plot(history.history['val_loss'], label='validation')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()