# Setup

In [1]:
from math import floor, ceil
from multiprocessing import Pool, cpu_count
from pathlib import Path
from python_speech_features import logfbank
from python_speech_features import mfcc
from scipy.io import wavfile
from time import time
import glob
import hashlib
import numpy as np
import os
import pickle
import random
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

USE_CUDA = torch.cuda.is_available()
MAX_NUM_WAVS_PER_CLASS = 2**27 - 1  # ~134M
SAMPLE_RATE = 16000
MFCC_SIZE = 5000
BATCH_SIZE = 10000
ALL_LABELS = ['yes', 'no', 'up', 'down', 'left', 'right',
              'on', 'off', 'stop', 'go', 'unknown']
LABEL_MAPPING = {name:i for i, name in enumerate(ALL_LABELS)}

# Data Import

In [2]:
def which_set(filename, validation_percentage, testing_percentage):
    """Determines which data partition the file should belong to.

    We want to keep files in the same training, validation, or testing sets even
    if new ones are added over time. This makes it less likely that testing
    samples will accidentally be reused in training when long runs are restarted
    for example. To keep this stability, a hash of the filename is taken and used
    to determine which set it should belong to. This determination only depends on
    the name and the set proportions, so it won't change as other files are added.

    It's also useful to associate particular files as related (for example words
    spoken by the same person), so anything after '_nohash_' in a filename is
    ignored for set determination. This ensures that 'bobby_nohash_0.wav' and
    'bobby_nohash_1.wav' are always in the same set, for example.

    Args:
        filename: File path of the data sample.
        validation_percentage: How much of the data set to use for validation.
        testing_percentage: How much of the data set to use for testing.

    Returns:
        String, one of 'training', 'validation', or 'testing'.
    """
    base_name = os.path.basename(filename)
    
    # ignore anything after '_nohash_' in the file name
    hash_name = re.sub(r'_nohash_.*$', '', base_name)

    # hash(filename) -> value to split into training/testing/validation
    hash_name_hashed = hashlib.sha1(str.encode(hash_name)).hexdigest()
    percentage_hash = ((int(hash_name_hashed, 16) %
                      (MAX_NUM_WAVS_PER_CLASS + 1)) *
                     (100.0 / MAX_NUM_WAVS_PER_CLASS))

    if percentage_hash < validation_percentage:
        result = 'validation'
    elif percentage_hash < (testing_percentage + validation_percentage):
        result = 'testing'
    else:
        result = 'training'
    return result

def convert_label_to_id(label):
    """Convert label to its ID for prediction."""
    if label not in ALL_LABELS:
        label = 'unknown'
    return LABEL_MAPPING[label]

def pad_sound_to_one_second(data):
    if data.shape[0] != SAMPLE_RATE:
        padding_needed = SAMPLE_RATE - data.shape[0]
        front_padding = padding_needed // 2
        end_padding = padding_needed - front_padding
        data = np.concatenate(([0]*front_padding, data, [0]*end_padding))
    return data

In [3]:
path_prefix = "Data/train/audio/"

# Split data into 80% training, 10% validation, 10% testing
validation_percentage = 10
testing_percentage = 10
datasets = {'validation': [], 'testing': [], 'training': []}

for filename in glob.glob(path_prefix + "*/*.wav"):
    _, _, _, label, sound_filename = filename.split("/")
    
    if label == "_background_noise_":
        continue
    
    dataset_name = which_set(sound_filename,
                             validation_percentage,
                             testing_percentage)
    
    # List[(label, label_id, sound_filename), ...]
    datasets[dataset_name].append((label,
                                   convert_label_to_id(label),
                                   sound_filename))


for name, labelled_sounds in datasets.items():
    print("{:>10} count: {}".format(name, len(labelled_sounds)))
    
# Shuffle training data to improve performance
# random.shuffle(datasets['training'])

print("\n{} training labels: {}".format(len(ALL_LABELS), ALL_LABELS))

validation count: 6798
   testing count: 6835
  training count: 51088

11 training labels: ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'unknown']


In [4]:
def import_dataset_to_torch(name, progress=False):
    label_ids = []
    samples = []
    count = 0
    training_count = len(datasets[name])

    for label, label_id, filename in datasets[name]:
        full_path = path_prefix + label + "/" + filename
        sample_rate, data = wavfile.read(full_path)
        data = pad_sound_to_one_second(data).astype(np.int16)

        assert sample_rate == SAMPLE_RATE
        assert data.shape[0] == SAMPLE_RATE

        label_ids.append(torch.LongTensor([label_id]))
        samples.append(torch.from_numpy(data))

        if progress and len(label_ids) == training_count:
            print("Import Progress for {}: {:.1f}%".format(name, 100*len(label_ids)/training_count))

    samples = torch.stack(samples)
    samples = samples.type(torch.float)
    label_ids = torch.cat(label_ids)
    return samples, label_ids

train_samples, train_label_ids = import_dataset_to_torch('training', progress=True)
validation_samples, validation_label_ids = import_dataset_to_torch('validation', progress=True)
test_samples, test_label_ids = import_dataset_to_torch('testing', progress=True)

Import Progress for training: 100.0%
Import Progress for validation: 100.0%
Import Progress for testing: 100.0%


# Feature Engineering

In [5]:
def mel_frequency_cepstral_coefficients(file_samples):
    """Converts wav samples into MFCC coefficients.
    
    :return: numpy array of shape (num_frames, num_cep)"""
    mfcc_feat = mfcc(file_samples, SAMPLE_RATE, winlen=0.01, numcep=50, nfilt=50)
    fbank_feat = logfbank(file_samples, SAMPLE_RATE, winlen=0.01, nfilt=50).flatten()
    return fbank_feat

def parallel_mfcc(samples):
    # torch.Tensor doesn't seem to be thread-safe, so we have to pass to numpy and back
    with Pool(processes=cpu_count()) as pool:
        train_mfcc = pool.map(mel_frequency_cepstral_coefficients, (row.numpy() for row in samples))
    return [torch.from_numpy(row).type(torch.float) for row in train_mfcc]

In [6]:
try:
    # If we've already computed these, retrieve them from disk
    train_mfcc, train_id_labels = pickle.load(open("Data/train_mfcc.p", "rb"))
    validation_mfcc, train_id_labels = pickle.load(open("Data/validation_mfcc.p", "rb"))
    test_mfcc, train_id_labels = pickle.load(open("Data/testing_mfcc.p", "rb"))
    print("Retrieved MFCCs from disk")
    
    if 'train_samples' in vars() or 'train_samples' in globals():
        del train_samples
except FileNotFoundError:
    print("MFCCs not found on disk, computing...")
    
    # Otherwise, compute MFCCs and store them on disk
    start = time()
    train_mfcc = torch.stack(parallel_mfcc(train_samples))
    print("Training data MFCC took {:.1f} s".format(time() - start))

    start = time()
    validation_mfcc = torch.stack(parallel_mfcc(validation_samples))
    test_mfcc = torch.stack(parallel_mfcc(test_samples))
    print("Validation and testing MFCC took {:.1f} s".format(time()-start))
    
    pickle.dump((train_mfcc, train_label_ids),
                open("Data/train_mfcc.p", "wb"))
    pickle.dump((validation_mfcc, validation_label_ids),
                open("Data/validation_mfcc.p", "wb"))
    pickle.dump((test_mfcc, test_label_ids),
                open("Data/testing_mfcc.p", "wb"))

Retrieved MFCCs from disk


# Neural Network Architecture

In [7]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # fully connected layers:
        #   5000->50->50->11
        self.fc1 = nn.Linear(MFCC_SIZE, 50)
        self.fc2 = nn.Linear(50, 50)
        self.fc3 = nn.Linear(50, len(ALL_LABELS))
        self.dropout = nn.Dropout(p=0.1)
#         torch.nn.init.xavier_uniform_(self.fc1.weight)
#         torch.nn.init.xavier_uniform_(self.fc2.weight)
#         torch.nn.init.xavier_uniform_(self.fc3.weight)

    def forward(self, x):
        x = self.dropout(F.leaky_relu(self.fc1(x)))
        x = self.dropout(F.leaky_relu(self.fc2(x)))
        x = self.dropout(F.leaky_relu(self.fc3(x)))
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In [8]:
def accuracy(name, print_results=False):
    # Check classification accuracy on the validation/testing set
    correct = 0
    total = 0
    
    samples, label_ids = None, None
    if name == 'training':
        samples = train_mfcc
        label_ids = train_label_ids
    elif name == 'validation':
        samples = validation_mfcc
        label_ids = validation_label_ids
    elif name == 'testing':
        samples = test_mfcc
        label_ids = test_label_ids
    else:
        assert False, "{} not supported in accuracy".format(name)
    
    with torch.no_grad():
        outputs = net(samples)
        _, predicted = torch.max(outputs.data, 1)
        total += label_ids.size()[0]
        correct += (predicted == label_ids).sum().item()
    
    percentage_correct = 100*correct/total
    
    if print_results:
        # Random guessing here is 1/12 ~ 8.3%
        print("Accuracy of the network on {} {} sound clips: {:.1f}%"
              "".format(len(samples), name, percentage_correct))
    
    return percentage_correct

In [9]:
# net = Net()
net = torch.load("Data/network_state")
net.eval()
print(net)

Net(
  (fc1): Linear(in_features=5000, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=11, bias=True)
  (dropout): Dropout(p=0.1)
)


In [10]:
# Correct for class sizes in loss function
train_class_weights = [1/(train_label_ids == label).sum().item() for label in range(len(ALL_LABELS))]
train_class_weights = torch.Tensor(train_class_weights)

if USE_CUDA:
    device = torch.device("cuda:0" if USE_CUDA else "cpu")
    net.to(device)
    train_class_weights = train_class_weights.cuda()
    train_class_weights.to(device)

# Classification Cross-Entropy and RMSprop
criterion = nn.CrossEntropyLoss(weight=train_class_weights)
optimizer = optim.RMSprop(net.parameters(), lr=1e-8)

In [11]:
last_validation_accuracy = 0
start = time()

for epoch in range(500): # loop over the dataset multiple times  
    current_validation_accuracy = 0
    running_loss = 0.0
    
    for batch_number, i in enumerate(range(0, len(train_mfcc), BATCH_SIZE)):
        # get the inputs
        inputs, labels = train_mfcc[i:i+BATCH_SIZE], \
                         train_label_ids[i:i+BATCH_SIZE]
        
        if USE_CUDA:
            inputs, labels = inputs.to(device), labels.to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()
        
        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
#         if batch_number % 5000 == 4999: # print every 5K mini-batches
#             print('[%d, %5d] loss: %.3f' %
#                   (epoch + 1, batch_number + 1, running_loss / 5000))
#             running_loss = 0.0
    
    
    if epoch % 100 == 0:
        if USE_CUDA:
            net.to("cpu")
        
        with torch.no_grad():
            current_training_accuracy = accuracy('training')
            current_validation_accuracy = accuracy('validation')
        print("Epoch {:>4}: loss: {:>10.5f}, "
              "Training accuracy: {:>5.1f}%, "
              "Validation accuracy: {:>5.1f}%, "
              "".format(epoch + 1, running_loss, current_training_accuracy, current_validation_accuracy))
        
#         if current_validation_accuracy < 60 or current_validation_accuracy > 0.9*last_validation_accuracy:
#             last_validation_accuracy = current_validation_accuracy
#         else:
#             break
    
        if USE_CUDA:
            net.to(device)
        
        # Shuffle training data
#         idx = torch.randperm(train_mfcc.size()[0])
#         train_mfcc = train_mfcc[idx]
#         train_label_ids = train_label_ids[idx]

print('Finished Training in {:.2f} s'.format(time()-start))

if USE_CUDA:
    net.to("cpu")

Epoch    1: loss:    0.40431, Training accuracy:  95.7%, Validation accuracy:  81.1%, 
Epoch  101: loss:    0.40366, Training accuracy:  95.7%, Validation accuracy:  81.0%, 
Epoch  201: loss:    0.40339, Training accuracy:  95.7%, Validation accuracy:  81.1%, 
Epoch  301: loss:    0.40320, Training accuracy:  95.7%, Validation accuracy:  81.1%, 
Epoch  401: loss:    0.40306, Training accuracy:  95.8%, Validation accuracy:  81.1%, 
Finished Training in 54.03 s


In [12]:
# Check final validation and test accuracies
_ = accuracy('training', print_results=True)
_ = accuracy('validation', print_results=True)
_ = accuracy('testing', print_results=True)

Accuracy of the network on 51088 training sound clips: 95.8%
Accuracy of the network on 6798 validation sound clips: 81.1%
Accuracy of the network on 6835 testing sound clips: 80.8%


In [13]:
def print_confusion_matrix(inputs, ground_truth_labels):
    """Prints normalized confusion matrix with ground_truth columns and prediction rows."""
    confusion_matrix = np.zeros((len(ALL_LABELS), len(ALL_LABELS)), dtype=np.int)

    outputs = net(inputs)
    _, predicted_labels = torch.max(outputs, 1)
    for ground_truth, prediction in zip(ground_truth_labels, predicted_labels):
        confusion_matrix[prediction, ground_truth] += 1

    np.set_printoptions(precision=2, suppress=True)
    confusion_matrix = confusion_matrix.astype(np.float)
    
    for i in range(len(ALL_LABELS)):
        confusion_matrix[:, i] = confusion_matrix[:, i] / (ground_truth_labels == i).sum().item()
    
    for i in range(len(ALL_LABELS)):
        print(confusion_matrix[i, :])

print("Validation set confusion matrix")
print_confusion_matrix(validation_mfcc, validation_label_ids)

print("\n\nTest set confusion matrix")
print_confusion_matrix(test_mfcc, test_label_ids)

Validation set confusion matrix
[0.84 0.   0.   0.   0.05 0.   0.   0.   0.   0.   0.  ]
[0.   0.64 0.01 0.05 0.03 0.   0.01 0.   0.   0.14 0.01]
[0.   0.01 0.68 0.   0.01 0.01 0.03 0.06 0.03 0.03 0.01]
[0.01 0.01 0.   0.61 0.   0.   0.01 0.   0.   0.07 0.02]
[0.03 0.01 0.   0.   0.69 0.04 0.   0.02 0.   0.   0.01]
[0.   0.01 0.   0.   0.04 0.69 0.   0.02 0.   0.   0.01]
[0.   0.01 0.01 0.02 0.   0.   0.63 0.01 0.   0.01 0.02]
[0.   0.   0.07 0.   0.04 0.   0.03 0.76 0.   0.01 0.01]
[0.   0.01 0.02 0.   0.   0.   0.   0.   0.8  0.01 0.01]
[0.   0.12 0.03 0.04 0.   0.   0.01 0.   0.01 0.53 0.02]
[0.12 0.17 0.18 0.26 0.14 0.25 0.28 0.13 0.15 0.2  0.89]


Test set confusion matrix
[0.82 0.   0.   0.   0.04 0.01 0.   0.   0.   0.   0.01]
[0.   0.57 0.   0.06 0.01 0.   0.   0.   0.01 0.14 0.01]
[0.01 0.02 0.67 0.   0.01 0.   0.01 0.04 0.04 0.04 0.01]
[0.01 0.02 0.   0.58 0.   0.01 0.   0.   0.   0.02 0.02]
[0.05 0.01 0.01 0.   0.64 0.02 0.   0.01 0.   0.02 0.01]
[0.01 0.   0.   0.   0.02 0.

In [14]:
# torch.save(net, "Data/network_state")