In [67]:
import os
import re
import gc
import torch
import numpy as np
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F

from torch import nn
from glob import glob
from scipy import signal
from scipy.io import wavfile
from scipy.fftpack import fft
from tqdm import tqdm_notebook
from torchvision import transforms
from torch.autograd import Variable
from sklearn.datasets.lfw import Bunch
from sklearn.metrics import precision_score
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [2]:
args = Bunch()

# Data

You will need:
* accept rules at https://www.kaggle.com/c/tensorflow-speech-recognition-challenge 
* install kaggle cli at https://github.com/Kaggle/kaggle-api

In [3]:
!kaggle competitions download -c tensorflow-speech-recognition-challenge

sample_submission.7z: Skipping, found more recently modified local copy (use --force to force download)
train.7z: Skipping, found more recently modified local copy (use --force to force download)
test.7z: Skipping, found more recently modified local copy (use --force to force download)
link_to_gcp_credits_form.txt: Skipping, found more recently modified local copy (use --force to force download)


Next, extract downloaded `train.7z` data (manually):

In [4]:
# !7z x ~/.kaggle/competitions/tensorflow-speech-recognition-challenge/train.7z

and prepare `X` and `y` vars:

In [5]:
L = 16000
legal_labels = 'yes no up down left right on off stop go silence unknown'.split()

# src folders
root_path = os.path.expanduser('~/.kaggle/competitions/tensorflow-speech-recognition-challenge')
out_path = r'.'
model_path = r'.'
train_data_path = os.path.join(root_path, 'train', 'audio')
test_data_path = os.path.join(root_path, 'test', 'audio')

In [6]:
def custom_fft(y, fs):
    T = 1.0 / fs
    N = y.shape[0]
    yf = fft(y)
    xf = np.linspace(0.0, 1.0/(2.0*T), N//2)
    # FFT is simmetrical, so we take just the first half
    # FFT is also complex, to we take just the real part (abs)
    vals = 2.0/N * np.abs(yf[0:N//2])
    return xf, vals


def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

In [7]:
def list_wavs_fname(dirpath, ext='wav'):
    fpaths = glob(os.path.join(dirpath, r'*/*' + ext))
    pat = r'.+/(\w+)/\w+\.' + ext + '$'
    labels = []
    for fpath in fpaths:
        r = re.match(pat, fpath)
        if r:
            labels.append(r.group(1))
    pat = r'.+/(\w+\.' + ext + ')$'
    fnames = []
    for fpath in fpaths:
        r = re.match(pat, fpath)
        if r:
            fnames.append(r.group(1))
    return labels, fnames

In [8]:
def pad_audio(samples):
    if len(samples) >= L: return samples
    else: return np.pad(samples, pad_width=(L - len(samples), 0), mode='constant', constant_values=(0, 0))


def chop_audio(samples, L=16000, num=20):
    for i in range(num):
        beg = np.random.randint(0, len(samples) - L)
        yield samples[beg: beg + L]


def label_transform(labels):
    nlabels = []
    for label in labels:
        if label == '_background_noise_':
            nlabels.append('silence')
        elif label not in legal_labels:
            nlabels.append('unknown')
        else:
            nlabels.append(label)
    return pd.get_dummies(pd.Series(nlabels))

In [9]:
%%time
%%capture
labels, fnames = list_wavs_fname(train_data_path)

new_sample_rate = 8000
y = []
X = []

for label, fname in zip(labels, fnames):
    sample_rate, samples = wavfile.read(os.path.join(train_data_path, label, fname))
    samples = pad_audio(samples)
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else: n_samples = [samples]
    for samples in n_samples:
        resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        y.append(label)
        X.append(specgram)
X = np.array(X)
X = X.reshape(tuple(list(X.shape) + [1]))
y = label_transform(y)
label_index = y.columns.values
y = y.values
y = np.array(y)
del labels, fnames
gc.collect()

CPU times: user 2min 11s, sys: 13.1 s, total: 2min 24s
Wall time: 4min 14s


In [10]:
X.shape, y.shape  # ((N, H, W, C), (N, Y))

((64841, 99, 81, 1), (64841, 12))

# Model

In [160]:
args.data_up = 10000  # subsample to speed-up whole process
args.batch_size = 32
args.n_epoch = 5
args.n_classes = y.shape[1]

In [161]:
class FromNumpyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, idx):
        return {
            'X': torch.from_numpy(self.X[idx].transpose((2, 0, 1))).type(torch.FloatTensor),  # (C, H, W)
            'y': self.y[idx].argmax()  # Y
        }

In [162]:
N = X.shape[0]
assert args.data_up <= N
inds = np.random.choice(N, args.data_up)
X_train, X_val, y_train, y_val = train_test_split(X[inds, ...], y[inds, ...])
train, val = FromNumpyDataset(X_train, y_train), FromNumpyDataset(X_val, y_val)
train = DataLoader(train, batch_size=args.batch_size, shuffle=True, num_workers=2, drop_last=True)
val = DataLoader(val, batch_size=args.batch_size, shuffle=True, num_workers=2, drop_last=True)

In [163]:
class SimpleCNN(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.cnn = nn.Sequential(
            nn.Sequential(
                nn.Conv2d(1, 8, 2),
                nn.ReLU(),
                nn.Conv2d(8, 8, 2),
                nn.ReLU(),
                nn.MaxPool2d(2),
                nn.Dropout2d(0.2)
            ),
            nn.Sequential(
                nn.Conv2d(8, 16, 3),
                nn.ReLU(),
                nn.Conv2d(16, 16, 3),
                nn.ReLU(),
                nn.MaxPool2d(2),
                nn.Dropout2d(0.2)
            ),
            nn.Sequential(
                nn.Conv2d(16, 32, 3),
                nn.ReLU(),
                nn.MaxPool2d(2),
                nn.Dropout2d(0.2)
            )
        )
        
        self.fc = nn.Sequential(
            nn.Linear(2240, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, args.n_classes)
        )
    
    def forward(self, x):
        # x.shape = (N=batch_size, C=1, H, W)
        x = self.cnn(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Train

In [164]:
gc.collect()
model = SimpleCNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(args.n_epoch):
    total_loss = 0
    for batch in tqdm_notebook(train, desc='i_batch'):
        X_batch = Variable(batch['X'])
        y_true = Variable(batch['y'])

        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_true)
        loss.backward()
        optimizer.step()

        total_loss += loss.data[0]
        
    gc.collect()
    total_loss /= (args.batch_size * len(train))
    print(f'epoch={epoch}/{args.n_epoch} loss={total_loss}')


epoch=0/5 loss=0.05521874759409927



epoch=1/5 loss=0.040164618275295466



epoch=2/5 loss=0.03461450045434838



epoch=3/5 loss=0.02834505986613341



epoch=4/5 loss=0.02384848308025135


# Val

In [165]:
def pred_to_y(t):
    return t.max(1)[1].numpy().flatten().astype(int)


gc.collect()
model.eval()
correct, total = 0, 0
for batch in tqdm_notebook(val, desc='i_batch'):
    X_batch = Variable(batch['X'])
    y_true = batch['y'].numpy()
    y_pred = model(X_batch).data.max(1)[1]
    correct += (y_true == y_pred).sum()
    total += len(y_true)

correct / total




0.7832532051282052