# Table of Contents

### 1. Model Inference
### 2. Real-time Speech Command Identification

## 1. Model Inference

In [1]:
# import modules and libraries
%matplotlib inline

import os
import time
import wave
import pyaudio
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Audio

# import pytorch related modules
import torch
import torchaudio

torchaudio.set_audio_backend("soundfile")

from torchaudio.transforms import Resample
from torch.nn import ConstantPad1d

from torch.nn import Sequential
from torch.nn import Conv1d, ReLU, MaxPool1d, Dropout, Linear, Flatten

In [2]:
# convert predicted labels to respective classes
labels_to_classes = {
    0: 'up',
    1: 'down',
    2: 'left',
    3: 'right'
}

In [3]:
# define model architecture

# Input

# Conv1D(num_filters=64, filter_size=13, stride=1, activation='relu')
# MaxPool(pool_size=3)
# Conv1D(num_filters=64, filter_size=13, stride=1, activation='relu')
# MaxPool(pool_size=3)
# Conv1D(num_filters=64, filter_size=13, stride=1, activation='relu')
# MaxPool(pool_size=3)
# Conv1D(num_filters=64, filter_size=13, stride=1, activation='relu')
# MaxPool(pool_size=3)

# Flatten
# Dense(num_neurons=256, activation='relu')
# Dropout(0.3)
# Dense(num_neurons=32, activation='relu')
# Dropout(0.3)
# Dense(num_neurons=4, activation='softmax')


model = Sequential(
    Conv1d(in_channels=1, out_channels=64, kernel_size=13),
    ReLU(),
    MaxPool1d(4),

    Conv1d(in_channels=64, out_channels=64, kernel_size=13),
    ReLU(),
    MaxPool1d(4),

    Conv1d(in_channels=64, out_channels=64, kernel_size=13),
    ReLU(),
    MaxPool1d(4),

    Conv1d(in_channels=64, out_channels=64, kernel_size=13),
    ReLU(),
    MaxPool1d(4),

    Flatten(),

    Linear(1728, 256),
    ReLU(),
    Dropout(0.3),
    
    Linear(256, 32),
    ReLU(),
    Dropout(0.3),

    Linear(32, 4)
)

In [4]:
# load pytorch model on cpu
model.load_state_dict(torch.load("saved_model.pth", map_location="cpu"))

<All keys matched successfully>

In [5]:
# load wave file
waveform, sample_rate = torchaudio.load("up.wav")

In [6]:
# play audio 
Audio(waveform, rate=sample_rate)

In [7]:
# preprocess audio
if waveform.shape[1] > sample_rate:
    waveform = waveform[:, :sample_rate]
else:
    pad_len = sample_rate - waveform.shape[1]
    waveform = ConstantPad1d((0, pad_len), 0)(waveform)

waveform = Resample(orig_freq=sample_rate, new_freq=8000)(waveform)

waveform = waveform - waveform.min()
waveform = waveform / waveform.max()

waveform = torch.unsqueeze(waveform, dim=1)

In [8]:
waveform.shape, waveform.min(), waveform.max()

(torch.Size([1, 1, 8000]), tensor(0.), tensor(1.))

In [9]:
# get inference from model
model.eval()
output = model(waveform).detach()
output.shape

torch.Size([1, 4])

In [10]:
# convert logits to labels
output = torch.log_softmax(output, dim=1)
_, pred = torch.max(output, dim = 1)   
pred

tensor([0])

In [11]:
# get respective class
labels_to_classes[pred.numpy()[0]]

'up'

## 2. Real-time Speech Command Identification

In [13]:
# define variables
chunk = 1024
channels = 1
sample_rate = 16000
record_duration = 1
audio_format = pyaudio.paInt16

threshold = 500

In [14]:
# create pyaudio instance
p = pyaudio.PyAudio()

# create stream object
stream = p.open(
    format=audio_format,
    channels=channels,
    rate=sample_rate,
    input=True,
    frames_per_buffer=chunk,
)

In [15]:
def detect_voice(frame):
    avg_value = np.average(np.abs(frame))
    if avg_value > threshold:
        return True
    else:
        return False

In [16]:
def write_to_file(recording):
    filename = 'temp.wav'

    wf = wave.open(filename, 'wb')
    wf.setnchannels(channels)
    wf.setsampwidth(p.get_sample_size(audio_format))
    wf.setframerate(sample_rate)
    wf.writeframes(recording)
    wf.close()
    
    return filename

In [17]:
def predict(filename):
    # read audio file
    waveform, sample_rate = torchaudio.load(filename)
    
    # preprocess
    if waveform.shape[1] > sample_rate:
        waveform = waveform[:, :sample_rate]
    else:
        pad_len = sample_rate - waveform.shape[1]
        waveform = ConstantPad1d((0, pad_len), 0)(waveform)

    waveform = Resample(orig_freq=sample_rate, new_freq=8000)(waveform)

    waveform = waveform - waveform.min()
    waveform = waveform / waveform.max()

    waveform = torch.unsqueeze(waveform, dim=1)

    # get prediction
    model.eval()
    output = model(waveform).detach()
    output = torch.log_softmax(output, dim=1)
    _, pred = torch.max(output, dim = 1)   
    print('You said', labels_to_classes[pred.numpy()[0]])

In [18]:
def record():
    print('Voice detected - begin to record')
    waveform = []
    current = time.time()
    end = time.time() + record_duration
    while current <= end:
        data = stream.read(chunk)
        waveform.append(data)
        current = time.time()
    filename = write_to_file(b''.join(waveform))
    predict(filename)
    print('Return to listening\n\n')

In [19]:
print('Start listening ...')
while True:
    frame = np.frombuffer(stream.read(chunk), dtype=np.int16)
    if detect_voice(frame):
        record()

Start listening ...
Voice detected - begin to record
You said left
Return to listening


Voice detected - begin to record
You said up
Return to listening


Voice detected - begin to record
You said right
Return to listening


Voice detected - begin to record
You said up
Return to listening


Voice detected - begin to record
You said right
Return to listening


Voice detected - begin to record
You said up
Return to listening


Voice detected - begin to record
You said right
Return to listening


Voice detected - begin to record
You said left
Return to listening


Voice detected - begin to record
You said up
Return to listening




KeyboardInterrupt: 

In [20]:
# close stream
stream.stop_stream()
stream.close()
p.terminate()