<a href="https://colab.research.google.com/github/stefancosquer/deep-speech-command/blob/master/Biogen_Speech_Command.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intro

## Objectif

Reconnaitre les chiffres de 1 à 9 depuis un flux audio continu.

## Étapes

*  **Création du dataset** : ensemble de fichiers wave contenant la prononciation d'un seul chiffre.
*  **Entrainement du réseau** de neurones avec Keras
*  **Conversion du réseau** vers TensorFlow Lite
*  **Création d'un module React Native** réalisant la capture audio et l'inférence

# Création du dataset

In [0]:
!apt-get install python3-pyaudio
!pip install pyaudio

import os
os.makedirs('speech/recordings')

In [0]:
import pyaudio
import math
import time
import wave
import os
from scipy.io.wavfile import read, write
from collections import defaultdict

# Audio capture settings used by the recording and splitting helpers below.
FORMAT = pyaudio.paInt16  # 16-bit integer samples
CHANNELS = 1  # mono
SAMPLERATE = 44100  # frames per second
BUFFER = 1024  # frames per buffer handed to the input stream
DELAY_BETWEEN_NUMBERS = 3  # seconds slept after each prompt
REPEATS_PER_NUMBER = 1  # how many times each digit is prompted

p = pyaudio.PyAudio()

# The speaker's name becomes part of every output file name.
name = input("Votre nom: ")

# Temporary WAV file that receives the whole session; split() deletes it once
# the per-digit files have been written.
recording = 'speech/recordings/speech_' + name + '.wav'
wavefile = wave.open(recording, 'wb')
wavefile.setnchannels(CHANNELS)
wavefile.setsampwidth(pyaudio.get_sample_size(FORMAT))
wavefile.setframerate(SAMPLERATE)


def record(in_data, frame_count, time_info, status_flags):
    """Stream callback: append the captured buffer to the session wave file.

    Only ``in_data`` is used; the remaining parameters are accepted because
    this function is passed as ``stream_callback`` and are ignored here.

    Returns:
        A ``(None, pyaudio.paContinue)`` pair: no output data, keep streaming.
    """
    wavefile.writeframes(in_data)
    # Input-only stream, so there is nothing to hand back for playback.
    no_output = None
    return no_output, pyaudio.paContinue


def record_numbers():
    """Prompt the speaker for each digit while recording, and return the prompt order.

    The digits 1-9 (each repeated ``REPEATS_PER_NUMBER`` times) are reordered
    with a fixed, non-random permutation, then printed one at a time with
    ``DELAY_BETWEEN_NUMBERS`` seconds between prompts while the input stream
    writes audio through the ``record`` callback.

    The stream, the module-level PyAudio instance ``p`` and ``wavefile`` are
    always released, even if the prompt loop is interrupted, so the function
    can only be run once per setup.

    Returns:
        list[str]: the digit labels in the order they were prompted.
    """
    nums = [str(digit) for digit in range(1, 10) for _ in range(REPEATS_PER_NUMBER)]
    # Deterministic scramble: swap each slot with one derived from pi * index.
    for i in range(len(nums)):
        target = int(round(math.pi * i)) % len(nums)
        nums[i], nums[target] = nums[target], nums[i]

    try:
        stream = p.open(format=FORMAT, channels=CHANNELS, rate=SAMPLERATE,
                        input=True, output=False,
                        frames_per_buffer=BUFFER,
                        stream_callback=record)
        try:
            # Lead-in segment before the first digit; split() skips it.
            print("Pret ?")
            time.sleep(DELAY_BETWEEN_NUMBERS)

            for num in nums:
                print(num)
                time.sleep(DELAY_BETWEEN_NUMBERS)
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        # Release the audio backend and flush the WAV header even on failure.
        p.terminate()
        wavefile.close()

    return nums


def trim(data, margin=4410, ratio=0.2):
    """Cut leading and trailing quiet parts off a mono sample array.

    A sample counts as "loud" when its magnitude exceeds ``ratio`` times the
    peak magnitude. The result spans from ``margin`` samples before the first
    loud sample to ``margin`` samples after the last one, clamped to the array
    bounds.

    Args:
        data: one-dimensional NumPy array of samples.
        margin: number of samples kept on each side of the loud region
            (4410 is 0.1 s at 44100 Hz).
        ratio: fraction of the peak magnitude used as the loudness threshold.

    Returns:
        A slice of ``data`` with the same dtype. Empty or entirely silent
        input is returned unchanged.
    """
    if len(data) == 0:
        return data

    # Widen before taking the magnitude: abs(-32768) overflows in int16.
    mag = abs(data.astype('float64'))
    threshold = mag.max() * ratio

    loud = (mag > threshold).nonzero()[0]
    if len(loud) == 0:
        # All-zero signal: nothing exceeds the threshold, keep everything.
        return data

    start = max(0, int(loud[0]) - margin)
    # +1 because the slice end is exclusive; clamp to len(data), not len - 1,
    # so the final sample is never dropped.
    end = min(len(data), int(loud[-1]) + margin + 1)

    return data[start:end]


def split(numbers):
    """Cut the session recording into one trimmed WAV file per prompted digit.

    The recording is treated as consecutive segments of
    ``DELAY_BETWEEN_NUMBERS`` seconds. Segment 0 is the "Pret ?" lead-in;
    segment ``i + 1`` holds the i-th prompted digit. Each digit segment is
    passed through ``trim`` and written to
    ``speech/recordings/<label>_<name>_<k>.wav``, where ``k`` counts earlier
    occurrences of the same label. The session recording is deleted afterwards.

    Args:
        numbers: digit labels in the order they were prompted.
    """
    rate, data = read(recording)

    # Use the rate stored in the file for both the segment boundaries and the
    # output headers, rather than relying on a global constant staying in sync.
    segment_len = int(rate * DELAY_BETWEEN_NUMBERS)

    counts = defaultdict(int)

    for i, label in enumerate(numbers):
        label = str(label)
        start_idx = (i + 1) * segment_len
        stop_idx = start_idx + segment_len

        digit = trim(data[start_idx:stop_idx])

        filename = label + "_" + name + "_" + str(counts[label]) + '.wav'
        write(os.path.join('speech/recordings', filename), rate, digit)

        counts[label] += 1

    os.remove(recording)


# Record one prompted session, then cut it into one WAV file per digit.
split(record_numbers())

# Apprentissage

In [44]:
# Start from a fresh clone of the project repository; the training cells read
# the recordings and the background noise from this directory.
!rm -rf deep-speech-command
!git clone https://github.com/stefancosquer/deep-speech-command.git

Cloning into 'deep-speech-command'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 35 (delta 7), reused 32 (delta 4), pack-reused 0[K
Unpacking objects: 100% (35/35), done.


In [0]:
import os
import random

import numpy as np
import matplotlib.pyplot as plt
from keras.callbacks import EarlyStopping
from keras.layers import MaxPool1D, Conv1D, Dropout, BatchNormalization, GlobalAvgPool1D
from keras.models import Sequential
from keras.layers.core import Dense, Flatten
from keras.optimizers import Adam
from keras.utils import to_categorical
from scipy.io.wavfile import read, write

SAMPLERATE = 44100
# Class index i corresponds to CLASSES[i]; index 0 ('-') is presumably the
# "no digit" class -- confirm against the dataset.
CLASSES = ['-', '1', '2', '3', '4', '5', '6', '7', '8', '9']

# 1-D convolutional classifier over a raw waveform of SAMPLERATE samples
# (one second of mono audio at 44100 Hz).
model = Sequential()

model.add(Conv1D(64, 88, activation='relu', input_shape=(SAMPLERATE, 1)))
model.add(MaxPool1D(3))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Conv1D(64, 4, activation='relu'))
model.add(MaxPool1D(3))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Conv1D(128, 4, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.4))
model.add(GlobalAvgPool1D())
# One output per class: tied to CLASSES so the layer width cannot drift from
# the one-hot labels produced for training.
model.add(Dense(len(CLASSES), activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

## Train

def generator(directory, batch_size, shuffle=False):
    """Yield endless ``(x, y)`` batches of one-second clips mixed with background noise.

    Each batch takes the first ``batch_size`` entries of the file list
    (wrapping around if there are fewer files). The list is shuffled once at
    start-up, and again before every batch when ``shuffle`` is true. Every clip
    is fitted to ``SAMPLERATE`` samples, mixed with a randomly scaled excerpt
    of the hospital background recording, and normalized to zero mean and unit
    variance. The label is the first character of the file name, read as the
    class index.

    Args:
        directory: folder containing the ``.wav`` clips (assumed mono).
        batch_size: number of clips per yielded batch.
        shuffle: reshuffle the file list before every batch.

    Yields:
        ``x`` of shape ``(batch_size, SAMPLERATE, 1)`` and one-hot ``y`` of
        shape ``(batch_size, len(CLASSES))``.

    Raises:
        ValueError: if ``directory`` contains no ``.wav`` file.
    """
    rate, background = read('deep-speech-command/backgrounds/hospital.wav')
    files = [file for file in os.listdir(directory) if file.endswith(".wav")]
    if not files:
        raise ValueError('no .wav files found in ' + directory)
    np.random.shuffle(files)
    while True:
        if shuffle:
            np.random.shuffle(files)
        # Allocate fresh arrays for every batch: the consumer may hold several
        # batches at once, and reusing one buffer would overwrite them in place.
        x = np.zeros((batch_size, SAMPLERATE, 1))
        y = np.zeros((batch_size, len(CLASSES)))
        for i in range(batch_size):
            filename = files[i % len(files)]
            rate, samples = read(directory + '/' + filename)
            # Fixed one-sample lead-in, then zero-pad to exactly one second.
            # Clips that are too long are cropped so the pad length can
            # never go negative.
            before = 1
            samples = samples[:SAMPLERATE - before]
            after = SAMPLERATE - samples.size - before
            samples = np.concatenate((np.zeros(before), samples, np.zeros(after)))
            # mix with a random one-second excerpt of the background sound
            idx = np.random.randint(0, len(background) - SAMPLERATE)
            samples = samples + np.random.random() * background[idx:idx + SAMPLERATE]
            # normalize sound; skip the division for a perfectly flat signal
            deviation = np.std(samples)
            samples = samples - np.mean(samples)
            if deviation > 0:
                samples = samples / deviation
            x[i] = samples.reshape(SAMPLERATE, 1)
            # The file name starts with the class index, e.g. "3_stefan_0.wav".
            y[i, int(filename[0])] = 1
        yield x, y

# Train and validate on batches drawn from the same recordings directory:
# 40 reshuffled clips per training step, 20 clips per validation step, and a
# single step of each per epoch. EarlyStopping is configured with
# patience=100 and restore_best_weights=True.
history = model.fit_generator(
    generator('deep-speech-command/recordings', 40, shuffle=True),
    validation_data=generator('deep-speech-command/recordings', 20),
    steps_per_epoch=1,
    validation_steps=1,
    epochs=1000, 
    callbacks=[EarlyStopping(patience=100, restore_best_weights=True)],
    verbose=1)

## Evaluate

# Draw one figure per tracked metric, each showing the training curve and the
# validation curve against the epoch number.
for metric, title, ylabel in (('acc', 'Model accuracy', 'Accuracy'),
                              ('loss', 'Model loss', 'Loss')):
    training_curve = history.history[metric]
    validation_curve = history.history['val_' + metric]
    plt.plot(training_curve)
    plt.plot(validation_curve)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

## Test

def test(file):
    """Classify one recording and print the predicted class index with its confidence.

    The clip is fitted to exactly ``SAMPLERATE`` samples (cropped if longer,
    zero-padded at the end if shorter) and then normalized over that whole
    window, which is the order the training generator uses.

    Args:
        file: file name inside ``deep-speech-command/recordings/``
            (assumed to be a mono clip).
    """
    rate, samples = read('deep-speech-command/recordings/' + file)
    # Crop first so the pad length below can never be negative.
    samples = samples[:SAMPLERATE]
    samples = np.concatenate((samples, np.zeros(SAMPLERATE - samples.size)))
    # Normalize after padding; skip the division for a perfectly flat signal.
    deviation = np.std(samples)
    samples = samples - np.mean(samples)
    if deviation > 0:
        samples = samples / deviation
    samples = samples.reshape(1, SAMPLERATE, 1)
    predictions = model.predict(samples)
    print(file + ' : ' + str(np.argmax(predictions)) + ' (' + str(np.amax(predictions) * 100) + '%)')


# Run the model on the first 'stefan' recording of each class index, 0 to 9.
for class_index in range(10):
    test(str(class_index) + '_stefan_0.wav')

# Save the trained model to disk.
model.save('model.h5')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_115 (Conv1D)          (None, 44013, 64)         5696      
_________________________________________________________________
max_pooling1d_78 (MaxPooling (None, 14671, 64)         0         
_________________________________________________________________
batch_normalization_115 (Bat (None, 14671, 64)         256       
_________________________________________________________________
dropout_115 (Dropout)        (None, 14671, 64)         0         
_________________________________________________________________
conv1d_116 (Conv1D)          (None, 14668, 64)         16448     
_________________________________________________________________
max_pooling1d_79 (MaxPooling (None, 4889, 64)          0         
_________________________________________________________________
batch_normalization_116 (Bat (None, 4889, 64)          256       
__________

# Test de la commande vocale

# Références

*   https://www.dlology.com/blog/how-to-do-real-time-trigger-word-detection-with-keras/
*   https://github.com/Jakobovski/free-spoken-digit-dataset
*  https://github.com/datascienceinc/speech-commands-oow2018

