# Keyword Classification on Google Speech Commands Dataset

This project explores and processes the [Google Speech Commands Dataset](https://arxiv.org/abs/1804.03209) to build and train a model for recognizing speech commands. The notebook provides step-by-step data preprocessing, feature extraction, model definition, and training.

## Project Structure

- **`GSpeechComd_Dataset_with_Comments.ipynb`**: The main Jupyter Notebook file that includes code, detailed comments, and explanations for processing the dataset and training a model. Different CNN architectures are trained and evaluated.
- [Google Speech Commands Dataset](https://ai.googleblog.com/2017/08/launching-speech-commands-dataset.html): The Google Speech Commands dataset, which contains short audio clips of spoken words, used as the primary data source.

## Features

- **Preprocessing**: Loading and preparing the dataset for training.
- **Feature Extraction**: Applying signal processing techniques to extract features from the audio data.
- **Model Training**: Using PyTorch (with torchaudio) to train a speech command recognition model.
- **Model Evaluation**: Computing accuracy to measure each model's performance.


## Acknowledgments

This project was completed with assistance from various sources:
1. The [Google Speech Commands Dataset](https://ai.googleblog.com/2017/08/launching-speech-commands-dataset.html) for the dataset and initial problem inspiration.
2. Online resources and tutorials related to audio processing and deep learning. Particularly: [Valerio Velardo's Audio Signal Processing for Machine Learning Course](https://youtube.com/playlist?list=PL-wATfeyAMNqIee7cH3q1bh4QJFAaeNv0&si=AwcQMXpYCuYQE9wV)
3. The `ChatGPT` AI assistant by OpenAI for guidance and feedback in enhancing the notebook and documentation.

In [None]:
import torch
import torchaudio
from torchaudio.datasets import SPEECHCOMMANDS
from torchaudio.transforms import Resample, MelSpectrogram, AmplitudeToDB, PitchShift
from torch.utils.data import DataLoader, TensorDataset, Subset, ConcatDataset
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import librosa
import IPython.display as ipd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import random

In [None]:
# Simple helpers for playing audio and showing spectrograms
def play(x, rate=16000):
    """Render an inline audio player for `x` in the notebook.

    Args:
        x: Audio data accepted by `IPython.display.Audio`.
        rate: Sampling rate in Hz handed to the player. Defaults to 16000,
            the value that was previously hard-coded.

    Returns:
        Whatever `IPython.display.display` returns.
    """
    return ipd.display(ipd.Audio(x, rate=rate))
def show(spec, title='Spectrogram'):
    """Draw `spec` as an image with a dB-formatted colour bar.

    Args:
        spec: Array-like image data accepted by `plt.imshow`.
        title: Text placed above the plot.
    """
    plt.figure(figsize=(20, 8))
    # origin='lower' puts row 0 at the bottom of the image.
    plt.imshow(spec, origin='lower', aspect='auto')
    plt.xlabel('Time')
    plt.ylabel('Frequency')
    plt.title(title)
    plt.colorbar(format='%+2.0f dB')
    plt.show()

In [None]:
# Dataset location and the keywords to classify. To use more or fewer classes,
# edit `classes` (and pass the matching `num_classes` to the model).
data_path = "Google Speech Comands/"
# Ordered list: a word's position here is its integer class index.
classes = [
    "yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go",
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
]
# Same words as a set, for fast membership tests when filtering the dataset.
classes_set = set(classes)

In [None]:
def label_to_index(word):
    """Return the position of `word` in the global `classes` list as a 0-dim tensor.

    Raises:
        ValueError: If `word` is not in `classes` (raised by `list.index`).
    """
    position = classes.index(word)
    return torch.tensor(position)
def index_to_label(index):
    """Return the keyword stored at position `index` of the global `classes` list."""
    keyword = classes[index]
    return keyword

In [None]:
# Dataset wrapper that keeps only the clips belonging to a chosen subset of labels
class SubsetSpeechCommands(SPEECHCOMMANDS):
    """SPEECHCOMMANDS restricted to a chosen collection of keyword classes.

    Args:
        *args, **kwargs: Forwarded unchanged to `SPEECHCOMMANDS.__init__`.
        transform: Optional callable applied to each waveform before it is
            returned. When None the (possibly augmented) waveform is returned
            as-is.
        augment: When True every item is pitch-shifted by a random number of
            steps in [-4, 4] and mixed with Gaussian noise scaled by 0.005.
        classes: Collection of label names to keep (keyword-only, required).
    """

    def __init__(self, *args, transform=None, augment=False, classes, **kwargs):
        super().__init__(*args, **kwargs)
        self.transform = transform
        self.augment = augment
        self.classes = classes
        # `_walker` is not defined in this class; presumably the parent fills it
        # with the clip paths of the selected subset.
        self.filtered_paths = [
            path for path in self._walker if self._load_class(path) in self.classes
        ]

    def __getitem__(self, n):
        """Return `(features, label)` for the n-th retained clip.

        `label` is whatever the module-level `label_to_index` returns for the
        clip's class name.
        """
        path = self.filtered_paths[n]
        label = label_to_index(self._load_class(path))
        waveform, sample_rate = torchaudio.load(path)
        if self.augment:
            # Frequency shifting by a random integer number of steps
            n_steps = random.randint(-4, 4)
            waveform = torch.Tensor(
                librosa.effects.pitch_shift(waveform.numpy(), sr=sample_rate, n_steps=n_steps)
            )
            # Additive Gaussian noise
            noise = torch.randn_like(waveform)
            waveform = waveform + 0.005 * noise
        # `transform` defaults to None, so only call it when one was supplied.
        spec = waveform if self.transform is None else self.transform(waveform)
        return spec, label

    def __len__(self):
        """Number of clips whose class is in `self.classes`."""
        return len(self.filtered_paths)

    def _load_class(self, path):
        """Return the name of the directory that directly contains `path`.

        Both '\\' and '/' are treated as separators, so the lookup works for
        Windows-style and POSIX-style paths alike (splitting on '\\' alone
        raised IndexError for '/'-separated paths).
        """
        return path.replace('\\', '/').split('/')[-2]

In [None]:
def spec_pad(x, length, value=-80):
    """Right-pad the last axis of `x` to `length` with a constant.

    Args:
        x: Tensor whose last axis is padded. Any rank >= 1 is accepted; the
            last axis is the same axis that was previously addressed as
            index 2 of a 3-D input.
        length: Target size of the last axis.
        value: Fill value for the added positions. Defaults to -80, the value
            that was previously hard-coded.

    Returns:
        The padded tensor. If the last axis is already longer than `length`
        the pad amount is negative, which `F.pad` is expected to treat as
        cropping (behavior unchanged from before).
    """
    pad_len = length - x.size(-1)
    return F.pad(x, (0, pad_len), value=value)

In [None]:
def mmnorm(spectrogram):
    """Min-max scale `spectrogram` so its values span [0, 1].

    The minimum and maximum are taken over the whole tensor, not per axis.

    Args:
        spectrogram: Non-empty tensor to scale.

    Returns:
        `(spectrogram - min) / (max - min)`. A constant input (max == min)
        returns all zeros instead of the NaNs a 0/0 division would produce.
    """
    min_val = torch.min(spectrogram)
    max_val = torch.max(spectrogram)
    value_range = max_val - min_val
    if value_range == 0:
        # Constant input (e.g. a silent clip): avoid dividing by zero.
        return torch.zeros_like(spectrogram)
    normalized_spec = (spectrogram - min_val) / value_range
    return normalized_spec

***Model and Learning***

***Model - CNN***

In [None]:
class CNNWithDropout(nn.Module):
    """Three conv/pool stages followed by a dropout-regularised two-layer classifier.

    The first linear layer expects 128 * 16 * 10 features, i.e. single-channel
    inputs whose spatial size is reduced to 16 x 10 by the three 2x2 poolings
    (for example 128 x 81 spectrograms).
    """

    def __init__(self, num_classes):
        super().__init__()

        # Feature extractor: 3x3 convolutions that keep the spatial size
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)

        # One pooling module shared by all stages; halves height and width
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)

        # Classifier head; fc1's input size must match the last pooled feature map
        self.fc1 = nn.Linear(128 * 16 * 10, 512)
        self.drop1 = nn.Dropout(p=0.5)  # zeroes half of the hidden units while training
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return unnormalised class scores of shape (batch, num_classes)."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(F.relu(conv(x)))

        flat = x.view(x.size(0), -1)  # one feature vector per sample
        hidden = self.drop1(F.relu(self.fc1(flat)))
        return self.fc2(hidden)

# Instantiate the model
model_with_dropout = CNNWithDropout(num_classes=20)

In [None]:
# Feature pipeline applied to every waveform before it reaches the model:
# resample 16 kHz -> 8 kHz, 128-bin mel spectrogram, amplitude -> dB,
# min-max scaling, then padding of the last axis to length 81.
res = Resample(orig_freq=16000, new_freq=8000)
mel = MelSpectrogram(sample_rate=8000, n_mels=128, n_fft=400, hop_length=100)
db = AmplitudeToDB(top_db=70)


def transform(x):
    """Turn a waveform tensor into the padded, scaled spectrogram fed to the CNNs.

    NOTE(review): spec_pad fills with -80 by default, but it runs after mmnorm
    has already rescaled the values, so the padding lies far outside the
    scaled range. Confirm whether padding was meant to happen before scaling;
    changing it would alter the features existing saved weights were trained on.
    """
    decibels = db(mel(res(x)))
    scaled = mmnorm(decibels)
    return spec_pad(scaled, 81)


def create_dataloader(train_size=3000, batch_size=32, augmented=False, aug_size=3000):
    """Build a shuffled training DataLoader over a random subset of the training split.

    Args:
        train_size: Number of un-augmented clips to sample.
        batch_size: Batch size of the returned loader. (This argument used to
            be ignored in favour of a hard-coded 32.)
        augmented: When True, `aug_size` further clips are sampled from an
            augmenting copy of the dataset and concatenated to the subset.
        aug_size: Number of augmented clips to add; defaults to 3000, the
            value that was previously hard-coded.

    Returns:
        A `DataLoader` with `shuffle=True` over the selected clips.

    Raises:
        ValueError: From `random.sample` if a requested size exceeds the
            number of available clips.

    Note:
        Re-seeds the global `random` module with 42 so the sampled indices are
        reproducible; the augmented indices are drawn from the same stream.
    """
    train_dataset = SubsetSpeechCommands(
        data_path, download=False, subset='training', transform=transform, classes=classes_set
    )
    lengtht = len(train_dataset)
    random.seed(42)
    random_samples = random.sample(range(0, lengtht), train_size)
    subset = Subset(train_dataset, random_samples)

    # Applying data augmentation if the augmented parameter is True:
    if augmented:
        train_dataset_aug = SubsetSpeechCommands(
            data_path, download=False, subset='training', transform=transform,
            classes=classes_set, augment=True,
        )
        random_samples = random.sample(range(0, lengtht), aug_size)
        subset_aug = Subset(train_dataset_aug, random_samples)
        subset = ConcatDataset([subset, subset_aug])

    train_dataloader = DataLoader(subset, batch_size=batch_size, shuffle=True)
    return train_dataloader

In [None]:
# Build the validation loader from a fixed random subset of the validation split.
valid_dataset = SubsetSpeechCommands(data_path,download=False, subset='validation', transform=transform, classes=classes_set)
lengthv = len(valid_dataset)
# Seed the global RNG so the same indices are picked on every run.
random.seed(42)
# random.sample raises ValueError if fewer than 3000 clips are available.
random_samples = random.sample(range(0, lengthv), 3000)
subset = Subset(valid_dataset, random_samples)
# No shuffling: the validation order stays the same from epoch to epoch.
valid_dataloader = DataLoader(subset, batch_size=32, shuffle=False)

In [None]:
def train(model, train_loader=None, valid_loader=None, num_epochs=50, patience=5, min_delta=0.0001):
    """Train `model` with Adam and cross-entropy, early-stopping on validation loss.

    Args:
        model: Module mapping a batch of inputs to class scores.
        train_loader: Iterable of `(inputs, labels)` training batches. Defaults
            to the notebook-level `train_dataloader`.
        valid_loader: Sized iterable of `(inputs, labels)` validation batches.
            Defaults to the notebook-level `valid_dataloader`.
        num_epochs: Maximum number of epochs.
        patience: Number of consecutive epochs without improvement after which
            training stops.
        min_delta: Minimum decrease of the validation loss that counts as an
            improvement.

    Returns:
        The same `model` object, with the parameters of the epoch that had the
        lowest validation loss loaded back in.
    """
    import copy  # local import: only needed for the best-weights snapshot

    # Fall back to the notebook globals so existing `train(model)` calls keep working.
    if train_loader is None:
        train_loader = train_dataloader
    if valid_loader is None:
        valid_loader = valid_dataloader

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    best_loss = float('inf')
    counter = 0  # epochs since the last improvement

    best_model_wts = None  # To store the best model weights

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0
        for inputs, labels in train_loader:
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1
        # Report the mean loss over the epoch rather than only the last mini-batch.
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / max(num_batches, 1)}')

        # Validate the model
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in valid_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

        val_loss /= len(valid_loader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}')

        # Early stopping check and model saving
        if val_loss < best_loss - min_delta:
            best_loss = val_loss
            counter = 0  # Reset counter if there's an improvement
            # state_dict() hands back references to the live parameter tensors,
            # so a deep copy is required; otherwise later optimizer steps would
            # silently overwrite the "best" weights.
            best_model_wts = copy.deepcopy(model.state_dict())
            print("Best model saved")
        else:
            counter += 1
            if counter >= patience:
                print("Early stopping triggered")
                break

    # After training is complete, load the best model weights
    if best_model_wts is not None:
        model.load_state_dict(best_model_wts)
        print("Loaded the best model weights from epoch with lowest validation loss.")
    return model

Model evaluations — the results are listed below; you can also run the code locally.  
First, I trained on a smaller training subset to experiment with parameters. The `train_size` parameter sets the size of that subset of the training set.

In [None]:
# Experiment: fresh dropout CNN trained on 3000 sampled clips, no augmentation.
# NOTE: `eval` and the `test_dataloader` it reads are defined in cells further
# down; those cells have to be executed before this one.
model_with_dropout = CNNWithDropout(num_classes=20)
train_dataloader = create_dataloader(train_size=3000, batch_size=64,augmented = False)
model_with_dropout = train(model_with_dropout)
eval(model_with_dropout)

In [None]:
# Experiment: fresh dropout CNN trained on a larger sample (train_size=6000), no augmentation.
model_with_dropout = CNNWithDropout(num_classes=20)
train_dataloader = create_dataloader(train_size=6000, batch_size=64,augmented = False)
model_with_dropout = train(model_with_dropout)
eval(model_with_dropout)

In [None]:
# Experiment: 3000 sampled clips plus the augmented clips that
# create_dataloader adds when augmented=True.
model_with_dropout = CNNWithDropout(num_classes=20)
train_dataloader = create_dataloader(train_size=3000, batch_size=64,augmented = True)
model_with_dropout = train(model_with_dropout)
eval(model_with_dropout)

Final training on the whole training set (plus an augmented copy of it) for the chosen classes

In [None]:
# Full training data: every clip of the chosen classes once as-is and once
# through the augmenting dataset (augment=True), concatenated into one dataset.
train_dataset = SubsetSpeechCommands(data_path,download=False, subset='training', transform=transform, classes=classes_set)
train_dataset_aug = SubsetSpeechCommands(data_path,download=False, subset='training', transform=transform, classes=classes_set, augment = True)
dataset = ConcatDataset([train_dataset,train_dataset_aug])
# Overwrites the global train_dataloader that train() iterates over.
train_dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
# Final run: train a fresh dropout CNN on whatever train_dataloader currently
# holds, then print its accuracy via eval.
model_with_dropout = CNNWithDropout(num_classes=20)
model_with_dropout = train(model_with_dropout)
eval(model_with_dropout)

Loading the saved model parameters: you can download them from the GitHub repository, so there is no need to train the model again.

In [None]:
# Rebuild the architecture and restore previously saved parameters from
# 'mode_0.1.pth' (a relative path, resolved against the working directory).
# NOTE(review): the file name reads like a typo of 'model_0.1.pth' - confirm
# against the published file. Later cells evaluate `model_with_dropout`, not
# this `model`.
model = CNNWithDropout(num_classes=20)
model.load_state_dict(torch.load('mode_0.1.pth'))

In [None]:
# Test split restricted to the chosen classes; this is the loader eval() reads.
test_dataset = SubsetSpeechCommands(data_path,download=False, subset='test', transform=transform, classes=classes_set)
# No shuffling: the order of test items does not affect the accuracy.
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
def eval(model, dataloader=None):
    """Print and return the classification accuracy of `model`.

    NOTE: this name shadows Python's builtin `eval` for the rest of the
    notebook; it is kept because other cells call it by this name.

    Args:
        model: Module mapping a batch of inputs to class scores.
        dataloader: Iterable of `(inputs, labels)` batches. Defaults to the
            notebook-level `test_dataloader`.

    Returns:
        The accuracy computed by `accuracy_score` over all batches.
    """
    if dataloader is None:
        dataloader = test_dataloader

    all_preds = []
    all_labels = []

    model.eval()
    with torch.no_grad():
        for data, labels in dataloader:
            outputs = model(data)
            # The index of the highest score along dim 1 is the predicted class.
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(all_labels, all_preds)
    print(accuracy)
    return accuracy

In [None]:
# Print the accuracy of model_with_dropout on the batches of test_dataloader.
eval(model_with_dropout)

In [None]:
# You can examine the confusion matrix
# The prediction lists built inside eval() are local to that function, so the
# test-set predictions are collected again here.
all_preds = []
all_labels = []
model_with_dropout.eval()
with torch.no_grad():
    for batch, targets in test_dataloader:
        batch_preds = model_with_dropout(batch).argmax(dim=1)
        all_preds.extend(batch_preds.cpu().numpy())
        all_labels.extend(targets.cpu().numpy())
# Passing every class index keeps the matrix len(classes) x len(classes) even
# when a class never occurs, so it lines up with the row/column names below.
cm = confusion_matrix(all_labels, all_preds, labels=list(range(len(classes))))
# Rows are true classes, columns are predicted classes.
cm = pd.DataFrame(cm, index=classes, columns=classes)

My results:
# Model Accuracy Table

| Model and dataset                                  | Train Size             | Accuracy  |
|--------------------------------------------------  |------------------------|-----------|
| Basic CNN, No augmentation                         | 3000                   | 0.609583  |
| Basic CNN, Dropout layers added                    | 3000                   | 0.625595  |
| Dropout layers added, train size increased         | 12000                  | 0.797213  |
| Same model as above, Data augmented (3000 -> 12000)| 12000                  | 0.749541  |
| Same model as above, whole  train dataset                                      | Full (~120000)         | 0.914313  |


Old Models

In [None]:
# Baseline CNN without dropout. Dropout helps against overfitting, and the
# dropout variant of this network performed better.
class CNN(nn.Module):
    """Three conv/pool stages followed by a two-layer classifier (no dropout).

    The first linear layer expects 128 * 16 * 10 features, i.e. single-channel
    inputs whose spatial size is reduced to 16 x 10 by the three 2x2 poolings
    (for example 128 x 81 spectrograms).
    """

    def __init__(self, num_classes):
        super().__init__()

        # 3x3 convolutions with padding 1 keep the spatial size
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)

        # One pooling module shared by all stages; halves height and width
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)

        # fc1's input size must match the last pooled feature map
        self.fc1 = nn.Linear(128 * 16 * 10, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return unnormalised class scores of shape (batch, num_classes)."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(F.relu(conv(x)))

        flat = x.view(x.size(0), -1)  # one feature vector per sample
        return self.fc2(F.relu(self.fc1(flat)))
# Instantiate the model
model = CNN(num_classes=20)


In [None]:
# CNN variant with batch normalization. It did not improve the accuracy, so
# batch normalization is not used in the final model.
class CNNWithBatchNorm(nn.Module):
    """Three conv -> batch-norm -> ReLU -> pool stages and a two-layer classifier.

    The first linear layer expects 128 * 16 * 10 features, i.e. single-channel
    inputs whose spatial size is reduced to 16 x 10 by the three 2x2 poolings
    (for example 128 x 81 spectrograms).
    """

    def __init__(self, num_classes):
        super().__init__()

        # Each convolution is paired with a BatchNorm2d over its output channels
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(128)

        # One pooling module shared by all stages; halves height and width
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)

        # fc1's input size must match the last pooled feature map
        self.fc1 = nn.Linear(128 * 16 * 10, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return unnormalised class scores of shape (batch, num_classes)."""
        stages = ((self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3))
        for conv, norm in stages:
            x = self.pool(F.relu(norm(conv(x))))

        flat = x.view(x.size(0), -1)  # one feature vector per sample
        return self.fc2(F.relu(self.fc1(flat)))

# Instantiate the model
model_with_bn = CNNWithBatchNorm(num_classes=20)