# Speech to Text


## Requirements

In [None]:
!pip install --upgrade torchaudio==0.5.1 torch==1.5.1+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html

## Download Data

The dataset we are using is Google's Speech Dataset (https://ai.googleblog.com/2017/08/launching-speech-commands-dataset.html).

It is composed of **"65,000 one-second long utterances of 30 short words, by thousands of different people"**.

In [None]:
# WARNING: the next line deletes every non-hidden entry of the current working
# directory. Only run this notebook from a scratch directory.
!rm -rf ./*
# Download the Speech Commands v0.01 archive (about 1.4 GB) and unpack it in place.
!wget -O speech_commands_v0.01.tar.gz http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
!tar xzf speech_commands_v0.01.tar.gz 
# List what was extracted.
!ls

--2020-11-22 12:00:23--  http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
Resolving download.tensorflow.org (download.tensorflow.org)... 108.177.125.128, 2404:6800:4008:c00::80
Connecting to download.tensorflow.org (download.tensorflow.org)|108.177.125.128|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1489096277 (1.4G) [application/gzip]
Saving to: ‘speech_commands_v0.01.tar.gz’


2020-11-22 12:00:50 (54.0 MB/s) - ‘speech_commands_v0.01.tar.gz’ saved [1489096277/1489096277]

_background_noise_  go	     on				   testing_list.txt
bed		    happy    one			   three
bird		    house    README.md			   tree
cat		    left     right			   two
dog		    LICENSE  seven			   up
down		    marvin   sheila			   validation_list.txt
eight		    nine     six			   wow
five		    no	     speech_commands_v0.01.tar.gz  yes
four		    off      stop			   zero


## Imports

In [None]:
from IPython.display import Audio

import torch
import torchaudio
import torch.nn.functional as F

import glob
import os
import random
from tqdm import tqdm_notebook
import numpy as np

# Dataset

Let's print the different classes (words) that are part of this dataset.

We can see there are 30 different words.

In [None]:
# Entries of the working directory that are known not to be word folders.
non_class_entries = {
    'LICENSE', 'README.md', '_background_noise_', 'speech_commands_v0.01.tar.gz',
    'testing_list.txt', 'validation_list.txt', '.config', '.ipynb_checkpoints', 'model.pt',
    'model_cpu.pt', 'drive'
}

# The position of a word in `classes` is its integer label. os.listdir() returns
# entries in arbitrary order, so the names are sorted to keep the labels identical
# across kernel restarts (otherwise a saved checkpoint would be decoded with the
# wrong label mapping). Keeping only directories also stops stray files from being
# counted as classes.
classes = sorted(
    x for x in os.listdir()
    if x not in non_class_entries and os.path.isdir(x)
)
print(classes)
print('Number of classes', len(classes))

['six', 'nine', 'on', 'left', 'three', 'five', 'go', 'bird', 'seven', 'off', 'wow', 'two', 'stop', 'zero', 'up', 'house', 'happy', 'cat', 'sheila', 'down', 'right', 'four', 'one', 'tree', 'eight', 'bed', 'marvin', 'dog', 'yes', 'no']
Number of classes 30


### Samples

In [None]:
Audio("bed/1528225c_nohash_2.wav")

In [None]:
Audio("cat/004ae714_nohash_0.wav")

### Creating dataset for training

In [None]:
# Read the test list (one file path per line)
with open('testing_list.txt') as testing_f:
    testing_list = [x.strip() for x in testing_f]

# Read the val list (one file path per line)
with open('validation_list.txt') as val_f:
    validation_list = [x.strip() for x in val_f]

print('Number of testing samples', len(testing_list))
print('Number of validation samples', len(validation_list))

# Construct a train list: every file found in a class folder that is reserved
# neither for testing nor for validation.
# A set makes each membership test O(1); checking against the two lists directly
# would scan them once per candidate file.
held_out = set(testing_list) | set(validation_list)

training_list = []
for c in classes:
    # sorted() makes the sample order independent of the file system
    training_list += [
        path for path in sorted(glob.glob(c + '/*'))
        if path not in held_out
    ]
print('Number of training samples', len(training_list))

Number of testing samples 6835
Number of validation samples 6798
Number of training samples 51088


Now we can create a custom `SpeechDataset` class that takes a list of audio files as input.

In [None]:
class SpeechDataset(torch.utils.data.Dataset):
    """Dataset of spoken-word audio files served as MFCC sequences.

    Each item is a tuple ``(mfcc, label)``. For a single-channel file ``mfcc`` has
    the time frames on the first dimension and the MFCC coefficients on the second;
    ``label`` is the index of the word in ``classes``.

    Args:
        classes: list of class names; the position in the list is the integer label.
        file_list: list of audio file paths. The name of the folder that directly
            contains a file is taken as its class name.
        n_samples: every waveform is zero-padded or cropped to this many samples so
            that all items have the same shape and can be batched (default: 16000,
            i.e. one second at 16 kHz).
        n_mfcc: number of MFCC coefficients computed per frame (default: 13).
    """

    def __init__(self, classes, file_list, n_samples=16000, n_mfcc=13):

        self.classes = classes

        # create a map from class name to integer
        self.class_to_int = {name: index for index, name in enumerate(classes)}

        # store the file names
        self.samples = file_list

        # fixed length (in samples) of every waveform
        self.n_samples = n_samples

        # store MFCC transform
        self.mfcc_transform = torchaudio.transforms.MFCC(n_mfcc=n_mfcc, log_mels=True)

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.samples)

    @staticmethod
    def _label_from_path(path):
        """Return the name of the folder that directly contains ``path``."""
        # Using the parent folder (instead of the first path component) also works
        # when the paths carry a prefix such as "./" or a data directory.
        return os.path.basename(os.path.dirname(path))

    @staticmethod
    def _fix_length(waveform, n_samples):
        """Zero-pad or crop ``waveform`` along dimension 1 to exactly ``n_samples``."""
        length = waveform.shape[1]
        if length < n_samples:
            return F.pad(input=waveform, pad=(0, n_samples - length), mode='constant', value=0)
        # Cropping keeps the item shape constant; without it a longer clip would make
        # the default batch collation fail.
        return waveform[:, :n_samples]

    def __getitem__(self, i):
        """Load file ``i`` and return ``(mfcc, label)``.

        Raises:
            KeyError: if the parent folder of the file is not one of ``classes``.
        """
        path = self.samples[i]

        with torch.no_grad():
            # load a normalized waveform
            waveform, _ = torchaudio.load(path, normalization=True)

            # bring the waveform to the fixed length
            waveform = self._fix_length(waveform, self.n_samples)

            # then, we apply the transform and put the time dimension first
            mfcc = self.mfcc_transform(waveform).squeeze(0).transpose(0, 1)

        # return the mfcc coefficient with the sample label
        return mfcc, self.class_to_int[self._label_from_path(path)]

## Create instances of the SpeechDataset for the train and val sets

Create your Dataset objects for training.

In [None]:
# One dataset per split, both sharing the same class -> label mapping
train_set, val_set = (
    SpeechDataset(classes, split_files)
    for split_files in (training_list, validation_list)
)

# Sanity check: shape of the MFCC matrix of one training item
sixth_item = train_set[5]
print(sixth_item[0].shape)

torch.Size([81, 13])


## Create Dataloaders for training and validation

Create DataLoaders with the Datasets you just created.

Do not forget to add shuffling to the training DataLoader.

Print a batch of data to make sure everything works.

In [None]:
# Only the training data is shuffled; the validation order is irrelevant
train_dl = torch.utils.data.DataLoader(dataset=train_set, batch_size=16, shuffle=True)
val_dl = torch.utils.data.DataLoader(dataset=val_set, batch_size=16, shuffle=False)

# Model

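We are going to use **GRUs** (gated recurrent units): the network reads the sequence of MFCC frames, and the output at the last time step is mapped to the 30 word classes.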

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU so that the
# notebook still runs end to end on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
class SpeechRNN(torch.nn.Module):
    """GRU classifier for sequences of feature frames.

    The input sequence is fed to a stacked GRU; the GRU output at the last time step
    is mapped by a linear layer to one score per class and normalised with a
    log-softmax, so the model returns log-probabilities.

    Args:
        input_size: number of features per frame (default: 13).
        hidden_size: size of the GRU hidden state (default: 256).
        num_layers: number of stacked GRU layers (default: 2).
        n_classes: number of output classes (default: 30).
    """

    def __init__(self, input_size=13, hidden_size=256, num_layers=2, n_classes=30):
        super(SpeechRNN, self).__init__()

        # batch_first=True: inputs are (batch, time, features)
        self.gru = torch.nn.GRU(
            input_size=input_size, hidden_size=hidden_size,
            num_layers=num_layers, batch_first=True
        )
        self.out_layer = torch.nn.Linear(hidden_size, n_classes)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return log-probabilities of shape (batch, n_classes) for ``x`` of shape
        (batch, time, input_size)."""
        out, _ = self.gru(x)
        # keep only the GRU output of the last time step
        x = self.out_layer(out[:, -1, :])
        return self.softmax(x)

In [None]:
# Smoke test: push one training batch through a freshly initialised model and
# compare the input and output shapes.
model = SpeechRNN().to(device)

first_batch = next(iter(train_dl))
batch = first_batch[0]
print(batch.shape)

y = model(batch.to(device))
print(y.shape)

torch.Size([16, 81, 13])
torch.Size([16, 30])


# Model Training

In [None]:
# Fresh model for the actual training run
model = SpeechRNN().to(device)

# The model ends with a LogSoftmax, so the matching loss is the negative
# log-likelihood.
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
def train(loader):
    """Train the global ``model`` for one epoch on the batches of ``loader``.

    Uses the global ``criterion``, ``optimizer`` and ``device``.

    Args:
        loader: iterable of ``(inputs, labels)`` batches.

    Returns:
        Tuple ``(loss, accuracy)``, both averaged over the samples that were
        actually seen. ``(0.0, 0.0)`` if the loader yields no sample.
    """
    model.train()
    total_loss = 0.0
    n_correct = 0
    n_seen = 0

    for batch in tqdm_notebook(loader):
        x = batch[0].to(device)
        labels = batch[1].to(device)

        # Compute the network output, loss and gradients
        optimizer.zero_grad()
        y = model(x)
        loss = criterion(y, labels)
        loss.backward()
        optimizer.step()

        # Compute some statistics.
        # `criterion` returns the mean loss of the batch, so it is weighted by the
        # batch size before being summed; dividing a plain sum of batch means by the
        # number of samples would shrink the loss by a factor of the batch size.
        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        n_correct += (y.argmax(dim=1) == labels).sum().item()
        n_seen += batch_size

    if n_seen == 0:
        return 0.0, 0.0
    return total_loss / n_seen, n_correct / n_seen

In [None]:
def eval(loader):
    """Evaluate the global ``model`` on the batches of ``loader`` without training.

    Uses the global ``criterion`` and ``device``. Note that this function shadows
    the Python builtin ``eval``; the name is kept because other cells call it.

    Args:
        loader: iterable of ``(inputs, labels)`` batches.

    Returns:
        Tuple ``(loss, accuracy)``, both averaged over the samples that were
        actually seen. ``(0.0, 0.0)`` if the loader yields no sample.
    """
    model.eval()
    total_loss = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in loader:
            x = batch[0].to(device)
            labels = batch[1].to(device)

            # Compute the network output and loss
            y = model(x)
            loss = criterion(y, labels)

            # `criterion` returns the mean loss of the batch: weight it by the batch
            # size so that the final division yields a true per-sample average.
            batch_size = labels.size(0)
            total_loss += loss.item() * batch_size
            n_correct += (y.argmax(dim=1) == labels).sum().item()
            n_seen += batch_size

    if n_seen == 0:
        return 0.0, 0.0
    return total_loss / n_seen, n_correct / n_seen

In [None]:
n_epochs = 10

# Per-epoch history of the training and validation metrics
epoch_loss, epoch_acc = [], []
epoch_val_loss, epoch_val_acc = [], []

for epoch in range(1, n_epochs + 1):
    print('Epoch:', epoch)

    # Training
    train_loss, train_acc = train(train_dl)
    print(f'Training accuracy: {train_acc:.2f}', f'Training loss: {train_loss:.4f}')
    epoch_loss.append(train_loss)
    epoch_acc.append(train_acc)

    # Validation
    val_loss, val_acc = eval(val_dl)
    print(f'Validation accuracy: {val_acc:.2f}', f'Validation loss: {val_loss:.4f}')

    # Save best model: checkpoint on the first epoch and whenever the validation
    # accuracy beats every previous epoch
    is_best = not epoch_val_acc or val_acc > max(epoch_val_acc)
    if is_best:
        torch.save(model.state_dict(), 'model.pt')

    epoch_val_loss.append(val_loss)
    epoch_val_acc.append(val_acc)

Epoch: 1


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=3193.0), HTML(value='')))


Training accuracy: 0.82 Training loss: 0.0386
Validation accuracy: 0.89 Validation loss: 0.0220
Epoch: 2


HBox(children=(FloatProgress(value=0.0, max=3193.0), HTML(value='')))


Training accuracy: 0.92 Training loss: 0.0168
Validation accuracy: 0.90 Validation loss: 0.0210
Epoch: 3


HBox(children=(FloatProgress(value=0.0, max=3193.0), HTML(value='')))


Training accuracy: 0.93 Training loss: 0.0141
Validation accuracy: 0.90 Validation loss: 0.0201
Epoch: 4


HBox(children=(FloatProgress(value=0.0, max=3193.0), HTML(value='')))


Training accuracy: 0.94 Training loss: 0.0128
Validation accuracy: 0.91 Validation loss: 0.0193
Epoch: 5


HBox(children=(FloatProgress(value=0.0, max=3193.0), HTML(value='')))


Training accuracy: 0.94 Training loss: 0.0119
Validation accuracy: 0.91 Validation loss: 0.0189
Epoch: 6


HBox(children=(FloatProgress(value=0.0, max=3193.0), HTML(value='')))


Training accuracy: 0.94 Training loss: 0.0116
Validation accuracy: 0.91 Validation loss: 0.0189
Epoch: 7


HBox(children=(FloatProgress(value=0.0, max=3193.0), HTML(value='')))


Training accuracy: 0.95 Training loss: 0.0109
Validation accuracy: 0.91 Validation loss: 0.0193
Epoch: 8


HBox(children=(FloatProgress(value=0.0, max=3193.0), HTML(value='')))


Training accuracy: 0.95 Training loss: 0.0110
Validation accuracy: 0.92 Validation loss: 0.0180
Epoch: 9


HBox(children=(FloatProgress(value=0.0, max=3193.0), HTML(value='')))


Training accuracy: 0.95 Training loss: 0.0107
Validation accuracy: 0.91 Validation loss: 0.0211
Epoch: 10


HBox(children=(FloatProgress(value=0.0, max=3193.0), HTML(value='')))


Training accuracy: 0.95 Training loss: 0.0108
Validation accuracy: 0.91 Validation loss: 0.0194


## Evaluation

In [None]:
# Load best model.
# map_location remaps the stored tensors to the current device, so a checkpoint
# written on a GPU can also be loaded on a CPU-only machine.
model = SpeechRNN().to(device)
model.load_state_dict(torch.load('model.pt', map_location=device))

<All keys matched successfully>

Save model for exporting

In [None]:
# `model.to('cpu')` would move `model` itself (Module.to works in place and returns
# the same object), which breaks the evaluation below that still sends its inputs
# to `device`. Build a separate CPU copy instead and leave `model` where it is.
model_cpu = SpeechRNN()
model_cpu.load_state_dict(model.state_dict())
torch.save(model_cpu.state_dict(), 'model_cpu.pt')

Start evaluation

In [None]:
test_dataset = SpeechDataset(classes, testing_list)
test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=64)

# Make sure the model lives on the device the inputs are sent to
model.to(device)
model.eval()

test_loss = 0
test_accuracy = 0
# Per-batch predictions and labels, concatenated once after the loop
pred_chunks, label_chunks = [], []

with torch.no_grad():
    for batch in test_dl:
        x = batch[0].to(device)
        labels = batch[1].to(device)

        y = model(x)
        loss = criterion(y, labels)
        batch_preds = y.max(1)[1]
        pred_chunks.append(batch_preds.cpu().numpy())
        label_chunks.append(labels.cpu().numpy())

        # `criterion` returns the mean loss of the batch: weight it by the batch size
        # so that dividing by the dataset size gives a true per-sample average
        test_loss += loss.item() * labels.size(0)
        test_accuracy += (batch_preds == labels).sum().item()

# Integer class indices of the predictions and of the ground truth
preds = np.concatenate(pred_chunks) if pred_chunks else np.array([])
y_test = np.concatenate(label_chunks) if label_chunks else np.array([])

test_loss /= len(test_dataset)
test_accuracy /= len(test_dataset)
print(
    f'Test accuracy: {test_accuracy:.2f}',
    f'Test loss: {test_loss:.4f}',
)

Test accuracy: 0.92 Test loss: 0.0046


### One-shot Inference

In [None]:
def create_mfcc(audio_path):
    """Load an audio file and return its MFCC features as a batch of one sequence.

    The waveform is mixed down to one channel, resampled to 16 kHz if needed and
    zero-padded to at least 16000 samples (one second) before the MFCC transform.

    Args:
        audio_path: path of the audio file to load.

    Returns:
        Tensor with dimensions (1, time frames, 13).
    """
    waveform, fs = torchaudio.load(audio_path, normalization=True)

    # Mix multi-channel audio down to mono; otherwise each channel would be treated
    # as a separate item of the batch.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample first: the padding below counts samples at 16 kHz, so it must be
    # applied to the resampled signal.
    if fs != 16000:
        waveform = torchaudio.transforms.Resample(fs, 16000)(waveform)

    # if the waveform is too short (less than 1 second) we pad it with zeroes
    if waveform.shape[1] < 16000:
        waveform = F.pad(input=waveform, pad=(0, 16000 - waveform.shape[1]), mode='constant', value=0)

    # (channel, n_mfcc, time) -> (channel, time, n_mfcc)
    return torchaudio.transforms.MFCC(n_mfcc=13, log_mels=True)(waveform).transpose(1, 2)

In [None]:
def get_prediction(path):
    """Return the word predicted by the global ``model`` for one audio file.

    Args:
        path: path of the audio file to classify.

    Returns:
        The entry of the global ``classes`` list with the highest predicted score.
    """
    mfcc = create_mfcc(path).to(device)

    # The model must be on the same device as its input
    model.to(device)
    model.eval()

    # Inference only: no need to track gradients
    with torch.no_grad():
        output = model(mfcc).max(1)[1].item()

    return classes[output]

In [None]:
get_prediction('bird/6f1ffef6_nohash_0.wav')