In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import warnings; warnings.filterwarnings('ignore')

import os
from tqdm import tqdm_notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from python_speech_features import mfcc, logfbank
import librosa
from pathlib import Path
from collections import Counter
from scipy.signal import resample

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

import pickle

from plotting_functions import *

In [3]:
class Config:
    """Feature-extraction settings shared across the notebook.

    Every constructor argument is stored under the attribute of the same
    name. ``step`` is derived as ``int(rate * audio_size)``, i.e. the number
    of raw samples contained in one clip of ``audio_size`` seconds.
    """

    def __init__(self, mode='conv', nfilt=26, nfft=520, ncoef=13, rate=16_000, audio_size = .10):
        # feature layout and MFCC parameters
        self.mode, self.nfilt, self.nfft, self.ncoef = mode, nfilt, nfft, ncoef
        # sampling parameters
        self.rate, self.audio_size = rate, audio_size
        # length of one clip, expressed in samples
        self.step = int(self.rate * self.audio_size)
        
config = Config(audio_size = .5)

In [4]:
PATH = Path('./')
TRAIN_FILES = Path('clean')

In [5]:
base_df = pd.read_csv(PATH/'instruments.csv')
base_df.head()

Unnamed: 0,fname,label
0,5388d14d.wav,Saxophone
1,c685f05f.wav,Saxophone
2,36d20ab5.wav,Saxophone
3,d6665734.wav,Saxophone
4,7352e28f.wav,Saxophone


In [6]:
base_df.set_index('fname', inplace=True)

In [7]:
base_df.label.value_counts() # uniform

Hi-hat              30
Cello               30
Acoustic_guitar     30
Double_bass         30
Saxophone           30
Bass_drum           30
Snare_drum          30
Violin_or_fiddle    30
Clarinet            30
Flute               30
Name: label, dtype: int64

In [8]:
def compute_lengths(df):
    """Add a ``length`` column holding each clip's duration in seconds.

    ``df`` must be indexed by file name; every file is read from
    ``TRAIN_FILES`` and its duration (sample count divided by sample rate)
    is written into ``df`` in place. Nothing is returned.
    """
    for fname in df.index:
        sample_rate, samples = wavfile.read(TRAIN_FILES/fname)
        duration = samples.shape[0]/sample_rate
        df.at[fname, 'length'] = duration

compute_lengths(base_df)

# filter out audio clips that are too short (shorter than config.audio_size seconds)
base_df = base_df[base_df.length>=config.audio_size]

In [9]:
# train/val split
# Each file is sent to the validation set independently with probability 0.15,
# so the two set sizes vary from run to run.
# NOTE(review): no random seed is set in the visible cells, so this split
# (and everything derived from it) is not reproducible across kernel restarts.

idxs = np.random.rand(len(base_df))<.15

df = base_df[~idxs]
valid_df = base_df[idxs]

In [10]:
classes = df.label.unique().tolist()

In [11]:
len(df),len(valid_df)

(236, 50)

# Creating training samples

Unlike most other kinds of data and tasks, audio data can easily be augmented for training: a single audio file can yield more than one training sample. To create those samples we can sample parts of the audio from the wave files. For instance, one could sample clips of a tenth of a second from a specific file.

To keep the training distribution representative, we want to preserve the class distribution of our data when sampling.

In [12]:
df.length.describe()

count    236.000000
mean       4.700635
std        3.446258
min        0.518188
25%        2.415656
50%        3.797531
75%        6.253625
max       19.252437
Name: length, dtype: float64

In [13]:
valid_df.length.describe()

count    50.000000
mean      4.109670
std       3.389165
min       0.502687
25%       1.545000
50%       3.159063
75%       5.977469
max      14.263312
Name: length, dtype: float64

In [14]:
# Filter first, so that the class distribution and the sample budget below
# describe exactly the files that can be sampled from.
# The comparison is strict because build_rand_feat draws the crop offset with
# np.random.randint(0, n_samples_in_file - config.step), which needs the file
# to be longer than one clip.
df = df[df.length > config.audio_size]

# classes distribution
classes = df.label.unique().tolist()
# mean clip length per class (not the same as counting files per class)
class_dist = df.groupby(['label'])['length'].mean()
prob_dist = class_dist/class_dist.sum()

# sampling parameters: twice as many clips as fit back-to-back in the data
n_samples = 2 * int(df.length.sum()/config.audio_size)
choices = np.random.choice(class_dist.index, size=n_samples, p=prob_dist)

print(f'Remaining audio files: {len(df)}')

Remaining audio files: 236


In [15]:
def build_rand_feat(n_samples, choices, df, mode='train'):
    """Build a min-max normalised MFCC feature array from random audio crops.

    For each of the ``n_samples`` draws, a file whose label equals
    ``choices[i]`` is picked at random from ``df``, a window of
    ``config.step`` samples is cut at a random offset, and the MFCCs of that
    window are computed and transposed.

    Parameters
    ----------
    n_samples : int
        Number of crops to generate.
    choices : sequence
        Label to draw for each crop; must hold at least ``n_samples`` entries
        and every entry must appear in the global ``classes`` list.
    df : pandas.DataFrame
        Indexed by file name (relative to ``TRAIN_FILES``) with a ``label``
        column.
    mode : str
        With ``'train'`` the observed feature minimum/maximum are stored on
        ``config`` (``config._min`` / ``config._max``) and used to normalise.
        Any other value reuses the values already stored there, so a
        ``'train'`` call must have happened first.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        ``X`` holds the normalised features (with a singleton axis inserted
        at position 1 when ``config.mode == 'conv'``); ``y`` holds the index
        of each label in ``classes``.

    Raises
    ------
    ValueError
        If ``df`` has no file for a requested label, or a file holds fewer
        than ``config.step`` samples.
    """
    files_by_label = {}  # label -> candidate file names, filled lazily
    X = []
    y = []
    _min, _max = float('inf'), -float('inf')
    for i in tqdm_notebook(range(n_samples)):
        label = choices[i]
        # the candidate set per label never changes, so filter df only once
        if label not in files_by_label:
            files_by_label[label] = df[df.label == label].index
        candidates = files_by_label[label]
        if len(candidates) == 0:
            raise ValueError(f'no audio file with label {label!r} in df')

        file = np.random.choice(candidates)
        rate, wav = wavfile.read(TRAIN_FILES/file)
        # NOTE(review): the window length comes from config.rate while the
        # file's own rate is passed to mfcc - assumes both agree; confirm.
        # randint's upper bound is exclusive: the +1 makes the last valid
        # offset reachable and lets a clip of exactly config.step samples
        # through instead of raising.
        rand_start = np.random.randint(0, wav.shape[0] - config.step + 1)
        sample = wav[rand_start:rand_start+config.step]
        X_sample = mfcc(sample, rate,
                        numcep=config.ncoef, nfft=config.nfft, nfilt=config.nfilt).T

        _min = min(_min, np.amin(X_sample))
        _max = max(_max, np.amax(X_sample))

        X.append(X_sample)
        y.append(classes.index(label))

    if mode=='train':
        # remember the training range so other splits are scaled identically
        config._min, config._max = _min, _max
    else:
        _min, _max = config._min, config._max

    X, y = np.array(X), np.array(y)
    # normalize
    X = (X - _min)/(_max - _min)

    if config.mode == 'conv': X = np.expand_dims(X, axis=1)

    return X, y

In [16]:
X, y = build_rand_feat(n_samples, choices, df)

HBox(children=(IntProgress(value=0, max=4436), HTML(value='')))




In [17]:
config.step

8000

In [18]:
X_val, y_val = build_rand_feat(n_samples, choices, valid_df, mode='val')

HBox(children=(IntProgress(value=0, max=4436), HTML(value='')))




In [19]:
# Per-class loss weights computed with scikit-learn's 'balanced' heuristic.
# `classes` and `y` are passed by keyword: recent scikit-learn releases no
# longer accept them positionally, while older releases accept keywords too.
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights

array([0.66506747, 0.93389474, 1.4640264 , 1.34018127, 0.76351119,
       1.64907063, 1.01743119, 3.14609929, 0.62478873, 0.84818356])

In [20]:
X.shape, y.shape

((4436, 1, 13, 49), (4436,))

In [21]:
# save configuration & data

def save_data(runs_dir = 'runs', nrun=None):
    """Pickle the global ``X``, ``y`` and ``config`` into ``runs_dir/<nrun>``.

    Parameters
    ----------
    runs_dir : str
        Directory that holds one sub-directory per run; created if missing.
    nrun : str or int, optional
        Name of the run directory. Any value other than ``None`` or ``''`` is
        used as given (``0`` is a valid run name). When omitted, the number of
        entries in ``runs_dir`` is used, advanced past any name already taken
        so an existing run is never overwritten by auto-numbering.

    The files written are ``X.p``, ``y.p`` and ``config.p`` (pickle
    protocol 2). An explicitly named run directory is created if needed and
    existing files in it are overwritten.
    """
    if nrun is None or nrun == '':
        os.makedirs(runs_dir, exist_ok=True)
        next_id = len(os.listdir(runs_dir))
        # the entry count can collide with an existing name (e.g. after a
        # run was deleted), so advance until a free one is found
        while os.path.exists(os.path.join(runs_dir, str(next_id))):
            next_id += 1
        nrun = next_id

    # str() so that integer run ids work with os.path.join
    run_dir = os.path.join(runs_dir, str(nrun))
    os.makedirs(run_dir, exist_ok=True)

    objs = [X, y, config]
    extensions = ['X.p', 'y.p', 'config.p']

    for obj, ext in zip(objs, extensions):
        p = os.path.join(run_dir, ext)
        with open(p, 'wb') as f:
            pickle.dump(obj, f, protocol=2)
    
# save_data(nrun=0)

# Data set & Data Loader

In [22]:
class AudioDataset(Dataset):
    """In-memory dataset pairing feature arrays with integer class labels.

    Both arrays are converted to tensors once, at construction time
    (features as float32, labels as int64), so indexing is a plain tensor
    lookup.
    """

    def __init__(self, X:np.ndarray, y:np.ndarray):
        features = torch.tensor(X, dtype=torch.float32)
        labels = torch.tensor(y, dtype=torch.int64)
        self.X, self.y = features, labels

    def __len__(self):
        # number of samples == size of the leading axis of the features
        return self.X.size(0)

    def __getitem__(self, idx):
        sample, target = self.X[idx], self.y[idx]
        return sample, target

In [23]:
# Training batches are reshuffled every epoch; without shuffle=True each
# epoch would see exactly the same batches in exactly the same order.
dl = DataLoader(AudioDataset(X, y),
                batch_size=64,
                shuffle=True)

# Validation is left unshuffled: the metric is computed over all outputs at
# once, so batch order does not affect it.
valid_dl = DataLoader(AudioDataset(X_val, y_val),
                batch_size=64)

# Model

In [24]:
class cnn(nn.Module):
    """Small convolutional classifier over single-channel feature maps.

    Four stacked 2-D convolutions (1 -> 16 -> 32 -> 64 -> 128 channels) are
    followed by global average pooling and a two-layer fully connected head
    that outputs ``n_classes`` raw scores (logits).

    Parameters
    ----------
    n_classes : int
        Size of the output layer. Defaults to 10, the value that was
        previously hard-coded.
    """

    def __init__(self, n_classes=10):
        super().__init__()

        # NOTE(review): there is no activation between the convolutions, so
        # the stack is a composition of linear maps - confirm this is
        # intended. Left unchanged to keep the trained behaviour.
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 16, (7,7), stride = 1, padding = 3),
            nn.Conv2d(16, 32, (5,5), stride = 1, padding = 2),
            nn.Conv2d(32, 64, (3,3), stride = 1, padding = 0),
            nn.Conv2d(64, 128, (3,3), stride = 1, padding = 0),
        )

        # global average pooling: one value per channel
        self.pool = nn.AdaptiveAvgPool2d(1)

        # same module order as before, so state_dict keys are unchanged
        self.linear_layers = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, n_classes),
        )

    def forward(self, x):
        """Return the class scores for ``x``."""
        out = self.cnn(x)
        out = self.pool(out)
        # Collapse only the trailing (channel, 1, 1) axes. A bare squeeze()
        # also removed the batch axis when the batch held a single sample.
        out = out.flatten(-3)
        return self.linear_layers(out)

In [25]:
model = cnn().cuda()

# Training

In [26]:
def one_hot_like(outs:np.ndarray, ys:np.ndarray):
    """Return an int64 one-hot matrix with the same shape as ``outs``.

    Row ``i`` is all zeros except for a 1 in column ``ys[i]``.
    """
    n_rows = outs.shape[0]
    encoded = np.zeros(outs.shape, dtype=np.int64)
    encoded[np.arange(n_rows), ys] = 1
    return encoded

def avg_auc(outs:np.ndarray, ys:np.ndarray, weights=None):
    """Average one-vs-rest ROC AUC over all output columns.

    Parameters
    ----------
    outs : array or sequence of arrays
        Model scores; the pieces are stacked row-wise into one
        (n_samples, n_classes) matrix.
    ys : sequence of int
        True class index for every row of the stacked scores.
    weights : array-like, optional
        One weight per class. When given, the weighted mean of the per-class
        AUCs is returned; the weights do not need to sum to 1.

    Returns
    -------
    float
        Plain mean of the per-class AUCs, or their weighted mean.

    Note: ``roc_auc_score`` is expected to raise if a class never (or
    always) occurs in ``ys`` - see the scikit-learn documentation.
    """
    outs = np.vstack(outs)
    y_oh = one_hot_like(outs, ys)
    aucs = np.array([roc_auc_score(y_oh[:, i], outs[:, i])
                     for i in range(outs.shape[1])])
    if weights is None:
        return aucs.mean()
    # np.average divides by the sum of the weights, so unnormalised weights
    # (such as the class weights) no longer yield a value outside [0, 1]
    return np.average(aucs, weights=weights)


def validation(model, dl:torch.utils.data.DataLoader, weights=None):
    """Score ``model`` on every batch of ``dl`` and return the average AUC.

    The model is switched to eval mode (and left in it) and gradients are
    disabled while predicting. ``weights`` is forwarded to ``avg_auc``.
    """
    # run on whichever device the model lives on instead of assuming CUDA
    device = next(model.parameters()).device
    model.eval()
    outs, ys = [],[]
    with torch.no_grad():
        for X, y in dl:
            out = model(X.to(device))

            # no detach() needed: nothing is tracked under no_grad
            outs.append(out.cpu().numpy())
            ys.extend(y.cpu().numpy())

    return avg_auc(outs, ys, weights)
    

def train(model, epochs:int, train_dl:torch.utils.data.DataLoader, valid_dl:torch.utils.data.DataLoader=None, lr:float=3*1e-3, weights=None):
    """Train ``model`` with Adam and cross-entropy, printing one line per epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Trained in place, on the device its parameters already live on.
    epochs : int
        Number of passes over ``train_dl``.
    train_dl : DataLoader
        Yields ``(features, labels)`` batches.
    valid_dl : DataLoader, optional
        When given (and not empty) the validation AUC is appended to the
        epoch log line.
    lr : float
        Learning rate of the Adam optimizer, which is created afresh on
        every call.
    weights : array-like, optional
        Per-class weights passed to the cross-entropy loss.
    """
    # follow the model's device instead of assuming CUDA
    device = next(model.parameters()).device
    optim = torch.optim.Adam(model.parameters(), lr=lr)
    t_weights = None if weights is None else torch.tensor(weights).float().to(device)

    # Decide once whether to validate. Truth-testing a DataLoader calls its
    # __len__, which raises for loaders that have no length, so test for None
    # explicitly and still skip loaders known to be empty.
    run_validation = valid_dl is not None
    if run_validation:
        try:
            run_validation = len(valid_dl) > 0
        except TypeError:
            pass  # length unknown: assume the loader has batches

    for epoch in tqdm_notebook(range(epochs)):
        model.train()
        agg_loss, n_obs = 0, 0
        outs, ys = [],[]
        for X, y in train_dl:
            X, y = X.to(device), y.to(device)

            out = model(X)

            optim.zero_grad()
            loss = F.cross_entropy(out, y, weight=t_weights)

            # weight the batch-mean loss by batch size for a per-sample average
            agg_loss += loss.item()*y.shape[0]
            n_obs += y.shape[0]

            loss.backward()
            optim.step()

            outs.append(out.cpu().detach().numpy())
            ys.extend(y.cpu().numpy())

        tmetric = avg_auc(outs, ys)

        log = f'Epoch {epoch+1}: train loss {agg_loss/n_obs:.4f} avg auc {tmetric:.4f}'

        if run_validation:
            vmetric = validation(model, valid_dl)
            print(log + f' val. avg auc {vmetric:.4f}')
        else:
            print(log)

In [27]:
train(model, 5, dl, valid_dl, weights=class_weights)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Epoch 1: train loss 2.2364 avg auc 0.5911 val. avg auc 0.6817
Epoch 2: train loss 1.8540 avg auc 0.7303 val. avg auc 0.7430
Epoch 3: train loss 1.6641 avg auc 0.7699 val. avg auc 0.7525
Epoch 4: train loss 1.4456 avg auc 0.8095 val. avg auc 0.7665
Epoch 5: train loss 1.2026 avg auc 0.8525 val. avg auc 0.7991



In [28]:
# NOTE(review): unlike the previous training call, `weights=class_weights` is
# not passed here, so this stage uses an unweighted loss - confirm intended.
train(model, 3, dl, valid_dl, lr=1*1e-3)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

Epoch 1: train loss 1.0738 avg auc 0.8865 val. avg auc 0.8004
Epoch 2: train loss 0.9421 avg auc 0.8974 val. avg auc 0.8063
Epoch 3: train loss 0.8448 avg auc 0.9044 val. avg auc 0.8069

