In [1]:
import torch
from torchaudio_contrib import Melspectrogram
from torch import nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
import datetime
import math

from sklearn.metrics import accuracy_score

import librosa

import pandas as pd
import numpy as np

from pathlib import Path

import json

from net_config.audio import MelspectrogramStretch

from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset

from tqdm import tqdm_notebook as tqdm

import logging


from visualization import WriterTensorboardX

from transforms import AudioTransforms


In [2]:
import os, errno


def mkdir_p(path):
    """Create directory ``path`` and any missing parents (like ``mkdir -p``).

    Succeeds silently when ``path`` already exists as a directory.

    :param path: directory path to create
    :raises OSError: if the directory cannot be created, including when
        ``path`` already exists but is not a directory
    """
    # exist_ok=True gives the same outcome as catching EEXIST and checking
    # isdir by hand: an existing directory is fine, an existing file raises.
    os.makedirs(path, exist_ok=True)


def setup_logging(logging_path='logs'):
    """Create a fresh ``run_<n>`` directory for this run and return its path.

    ``logging_path`` is resolved against the current working directory. The
    new run index is one more than the largest index found among existing
    entries, where an entry's index is its second ``_``-separated token when
    that token is all digits (``run_3`` -> 3). Entries that do not follow
    this pattern are ignored.

    :param logging_path: directory that holds the per-run sub-directories
    :return: path of the newly created run directory
    """
    log_path = os.path.join(os.getcwd(), logging_path)
    os.makedirs(log_path, exist_ok=True)

    def run_index(name):
        # Names without an underscore used to raise IndexError here; they now
        # count as "not a run directory", like non-numeric suffixes do.
        parts = name.split('_')
        if len(parts) > 1 and parts[1].isdigit():
            return int(parts[1])
        return -1

    run_counter = max(map(run_index, os.listdir(log_path)), default=-1) + 1

    run_path = os.path.join(log_path, 'run_%s'%run_counter)
    os.makedirs(run_path, exist_ok=True)

    print('Logging set up, to monitor training run:\n'
        '\t\'tensorboard --logdir=%s\'\n'%run_path)

    return run_path


def list_dir(path):
    """List the entries of ``path`` that are directories or data-like files.

    Every sub-directory is kept. A regular file is kept unless its name
    starts with ``.`` or the text after its last ``.`` is ``pyc``, ``py`` or
    ``txt``.

    :param path: directory to scan
    :return: list of entry names (not full paths), in ``os.listdir`` order
    """
    skipped_suffixes = ['pyc', 'py', 'txt']
    kept = []
    for name in os.listdir(path):
        full_path = os.path.join(path, name)
        if os.path.isdir(full_path):
            kept.append(name)
            continue
        if not os.path.isfile(full_path):
            continue
        if name.startswith('.'):
            continue
        if name.rsplit('.', 1)[-1] in skipped_suffixes:
            continue
        kept.append(name)
    return kept

In [3]:
class SoundSet(Dataset):
    """Dataset of ``.wav`` clips found recursively under a data directory.

    Files are sorted and then split by position: every fifth file (index 0,
    5, 10, ...) belongs to the ``"test"`` split and the rest to ``"train"``.
    Sorting makes the split deterministic, so separately constructed train
    and test instances never overlap regardless of filesystem listing order.

    Labels are read from ``labels.json`` in the data directory and looked up
    by file stem (file name without extension).

    :param transform: optional object whose ``apply(data)`` is called on each
        loaded clip
    :param mode: ``"train"`` or ``"test"``
    :param data_dir: directory holding the clips and ``labels.json``
    :raises ValueError: if ``mode`` is neither ``"train"`` nor ``"test"``
    """

    def __init__(self, transform=None, mode="train", data_dir="data/processed"):
        # Compare by value: `is` on strings tests object identity and only
        # works by accident of interning.
        if mode not in ("train", "test"):
            raise ValueError("mode must be 'train' or 'test', got %r" % (mode,))
        self.mode = mode

        root = Path(data_dir)
        self.all_files = sorted(root.glob('**/*.wav'))
        if mode == "train":
            self.files = [file for i, file in enumerate(self.all_files) if i % 5 != 0]
        else:
            self.files = [file for i, file in enumerate(self.all_files) if i % 5 == 0]

        # Mapping from file stem to label
        with open(str(root / 'labels.json')) as f:
            self.classes = json.load(f)
        self.transform = transform

    def __len__(self):
        """Return the number of clips in the selected split."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(data, sample_rate, label)``.

        The waveform is loaded with librosa's default settings and reshaped
        to a ``(samples, 1)`` column before the optional transform runs.
        NOTE(review): the collate function calls ``.size()`` on ``data``, so
        the transform is presumably expected to return a torch tensor;
        without a transform ``data`` stays a numpy array - confirm.
        """
        filename = self.files[idx]

        data, sr = librosa.load(str(filename))
        data = data.reshape(-1, 1)

        if self.transform is not None:
            data = self.transform.apply(data)

        label = self.classes[filename.stem]
        return data, sr, label

#         elif self.mode is "test":
#             return torch.from_numpy(data).float(), sr

In [4]:
def pad_seq(batch):
    """Collate ``(sequence, sample_rate, label)`` samples into a padded batch.

    Samples are ordered longest-first by the size of the sequence's first
    dimension, then the sequences are zero-padded to a common length.

    :param batch: iterable of ``(sequence tensor, sample rate, label)``
    :return: ``(seqs_pad, lengths, srs, labels)`` where ``seqs_pad`` is
        batch-first and the other three are ``LongTensor`` values in the same
        (sorted) order
    """
    time_dim = 0

    def seq_len(sample):
        return sample[0].size(time_dim)

    # Longest first; sorted() is stable, so ties keep their incoming order
    ordered = sorted(batch, key=seq_len, reverse=True)
    seqs, srs, labels = zip(*ordered)

    lengths = torch.LongTensor([seq.size(time_dim) for seq in seqs])
    srs = torch.LongTensor(srs)
    labels = torch.LongTensor(labels)

    # seqs_pad -> (batch, time, channel)
    seqs_pad = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=0)
    return seqs_pad, lengths, srs, labels

In [5]:
batch_size = 64

# Training loader: the 4-out-of-5 split, batched and padded by pad_seq.
dataloader = DataLoader(
    dataset=SoundSet(
        transform=AudioTransforms("train", {"noise": [0.3, 0.001], "crop": [0.4, 0.25]}),
        mode="train",
    ),
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=pad_seq,
)

# Held-out loader: the remaining 1-out-of-5 split.
# NOTE(review): this loader is also built with the "train" transform settings
# and shuffle=True - confirm that augmenting/shuffling the evaluation data is
# intended.
test_dataloader = DataLoader(
    dataset=SoundSet(
        transform=AudioTransforms("train", {"noise": [0.3, 0.001], "crop": [0.4, 0.25]}),
        mode="test",
    ),
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=pad_seq,
)

In [6]:
class AudioCRNN(nn.Module):
    """Convolutional-recurrent audio classifier.

    Pipeline: ``self.spec`` (mel spectrogram front end) -> three
    Conv/BatchNorm/ELU/MaxPool/Dropout stages -> 2-layer LSTM -> the LSTM
    output at each sequence's last valid step -> dense head -> log-softmax.

    :param classes: sequence mapping output index to class label; used by
        :meth:`predict`. The dense head always has 4 outputs.
    :param state_dict: accepted for interface compatibility; not used here
    """

    def __init__(self, classes, state_dict=None):
        super(AudioCRNN, self).__init__()

        self.classes = classes
        self.lstm_units = 64
        self.lstm_layers = 2
        self.spec = MelspectrogramStretch(hop_length=None, 
                                num_mels=128, 
                                fft_length=2048, 
                                norm='whiten', 
                                stretch_param=[0.4, 0.4])

        # shape -> (channel, freq, token_time)
        self.net = nn.ModuleDict({
            'convs' : nn.Sequential(
                nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=[0, 0]),
                nn.BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
                nn.ELU(alpha=1.0),
                nn.MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False),
                nn.Dropout(p=0.1),
                nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=[0, 0]),
                nn.BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
                nn.ELU(alpha=1.0),
                nn.MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False),
                nn.Dropout(p=0.1),
                nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=[0, 0]),
                nn.BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
                nn.ELU(alpha=1.0),
                nn.MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False),
                nn.Dropout(p=0.1)
            ),
            # LSTM input size 128 is the flattened channel*freq of the conv output
            'recur' : nn.LSTM(128, self.lstm_units, num_layers=self.lstm_layers),
            'dense' : nn.Sequential(
                nn.Dropout(p=0.3),
                nn.BatchNorm1d(self.lstm_units, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
                nn.Linear(in_features=self.lstm_units, out_features=4, bias=True)
            )
        })

    def _many_to_one(self, t, lengths):
        """Pick, for each batch row of ``t``, the step at index ``lengths - 1``.

        :param t: batch-first tensor ``(batch, time, ...)``
        :param lengths: per-row valid lengths
        :return: tensor ``(batch, ...)`` holding each row's last valid step
        """
        # Build both index tensors on t's device so indexing does not depend
        # on where `lengths` currently lives.
        rows = torch.arange(t.size(0), device=t.device)
        return t[rows, lengths.to(t.device) - 1]

    def modify_lengths(self, lengths):
        """Map time lengths through every Conv2d/MaxPool2d in ``convs``.

        Applies ``(L + 2*padding - kernel)//stride + 1`` per layer (dilation
        is not taken into account) and clamps the result to at least 1.

        :param lengths: integer tensor of per-sample time lengths
        :return: integer tensor of lengths after the convolutional stack
        """
        def time_param(elem):
            # Time is the last axis of the (batch, channel, freq, time) input,
            # so for tuple-valued settings take the last entry. Square
            # kernels/strides give the same value as the first entry.
            return elem if isinstance(elem, int) else elem[-1]

        for layer in self.net['convs'].children():
            if isinstance(layer, (nn.Conv2d, nn.MaxPool2d)):
                p, k, s = map(time_param, [layer.padding, layer.kernel_size, layer.stride])
                lengths = (lengths + 2*p - k)//s + 1

        return lengths.clamp(min=1)

    def forward(self, batch):
        """Return per-class log-probabilities ``(batch, classes)``.

        :param batch: ``(x, lengths, srs)`` with ``x`` as
            ``(batch, time, channel)`` and ``lengths`` sorted in decreasing
            order (required for packing); ``srs`` is ignored
        """
        x, lengths, _ = batch # unpacking seqs, lengths and srs

        # x -> (batch, channel, time)
        xt = x.float().transpose(1, 2)
        # xt -> (batch, channel, freq, time); spec also returns updated lengths
        xt, lengths = self.spec(xt, lengths)

        # (batch, channel, freq, time)
        xt = self.net['convs'](xt)
        lengths = self.modify_lengths(lengths)

        # x -> (batch, time, freq, channel)
        x = xt.transpose(1, -1)

        # x -> (batch, time, channel*freq)
        n_batch, n_time = x.size()[:2]
        x = x.reshape(n_batch, n_time, -1)
        # pack_padded_sequence requires the lengths tensor to be on the CPU
        x_pack = torch.nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True)

        # x -> (batch, time, lstm_out)
        x_pack, _hidden = self.net['recur'](x_pack)
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(x_pack, batch_first=True)

        # (batch, lstm_out)
        x = self._many_to_one(x, lengths)
        # (batch, classes)
        x = self.net['dense'](x)

        return F.log_softmax(x, dim=1)

    def predict(self, x):
        """Classify a single-sample batch.

        :param x: batch in the format accepted by :meth:`forward`, holding
            exactly one sample
        :return: ``(class label, probability)`` of the most likely class
        """
        with torch.no_grad():
            out = torch.exp(self.forward(x))
            # .item() requires exactly one sample in the batch
            max_ind = out.argmax(dim=1).item()
            return self.classes[max_ind], out[0, max_ind].item()


class AudioCNN(AudioCRNN):
    """Variant that skips the recurrent stage and feeds the flattened
    convolutional features straight into the inherited dense head.

    NOTE(review): the inherited head starts with ``BatchNorm1d(64)`` /
    ``Linear(64, 4)``, so the flattened feature size has to be 64 for this to
    run - confirm against the intended input size. ``self.spec`` is called
    without lengths here; presumably it then returns only the spectrogram -
    verify.
    """

    def forward(self, batch):
        """Return per-class log-probabilities for ``batch = (x, lengths, srs)``;
        only ``x`` is used."""
        signal, _lengths, _srs = batch
        # (batch, time, channel) -> (batch, channel, time)
        signal = signal.float().transpose(1, 2)
        # -> (batch, channel, freq, time)
        spectrogram = self.spec(signal)

        features = self.net['convs'](spectrogram)

        # -> (batch, time*freq*channel)
        flat = features.view(features.size(0), -1)
        # -> (batch, classes)
        scores = self.net['dense'](flat)

        return F.log_softmax(scores, dim=1)


class AudioRNN(AudioCRNN):
    """Variant that skips the convolutional stack: spectrogram frames go
    directly into the inherited LSTM and dense head."""

    def forward(self, batch):
        """Return per-class log-probabilities ``(batch, classes)``.

        :param batch: ``(x, lengths, srs)`` with ``x`` as
            ``(batch, time, channel)`` and ``lengths`` sorted in decreasing
            order (required for packing); ``srs`` is ignored
        """
        x, lengths, _ = batch # unpacking seqs, lengths and srs

        # x -> (batch, channel, time)
        x = x.float().transpose(1, 2)
        # x -> (batch, channel, freq, time); spec also returns updated lengths
        x, lengths = self.spec(x, lengths)

        # x -> (batch, time, freq, channel)
        x = x.transpose(1, -1)

        # x -> (batch, time, channel*freq)
        n_batch, n_time = x.size()[:2]
        x = x.reshape(n_batch, n_time, -1)
        # pack_padded_sequence requires the lengths to be on the CPU;
        # torch.as_tensor also covers a plain list of lengths.
        pack_lengths = torch.as_tensor(lengths).cpu()
        x_pack = torch.nn.utils.rnn.pack_padded_sequence(x, pack_lengths, batch_first=True)

        # x -> (batch, time, lstm_out)
        x_pack, _hidden = self.net['recur'](x_pack)
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(x_pack, batch_first=True)

        # (batch, lstm_out)
        x = self._many_to_one(x, lengths)
        # (batch, classes)
        x = self.net['dense'](x)

        return F.log_softmax(x, dim=1)

In [16]:
# Four-class CRNN, moved to the GPU (requires a CUDA device).
model = AudioCRNN(classes=[0, 1, 2, 3])
model = model.cuda()

In [17]:
model

AudioCRNN(
  (spec): MelspectrogramStretch(num_mels=128, fft_length=2048, norm=spec_whiten, stretch_param=[0.4, 0.4])
  (net): ModuleDict(
    (convs): Sequential(
      (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=[0, 0])
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ELU(alpha=1.0)
      (3): MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
      (4): Dropout(p=0.1)
      (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=[0, 0])
      (6): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (7): ELU(alpha=1.0)
      (8): MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
      (9): Dropout(p=0.1)
      (10): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=[0, 0])
      (11): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (12): ELU(alpha=1.0)
      (13): MaxPool2d(kernel

In [18]:
def eval_metrics(output, target, metrics):
    """Evaluate every metric on one batch and log each value to ``writer``.

    :param output: model output for the batch
    :param target: ground-truth labels for the batch
    :param metrics: list of callables ``metric(output, target) -> number``
    :return: numpy array with one value per metric, in ``metrics`` order
    """
    acc_metrics = np.zeros(len(metrics))
    for i, metric in enumerate(metrics):
        acc_metrics[i] = metric(output, target)
        # Log once per metric; the same scalar used to be written twice under
        # the same tag.
        writer.add_scalar(metric.__name__, acc_metrics[i])
    return acc_metrics

In [19]:
def accuracy(output, target, percent=0.1):
    """Fraction of rows whose arg-max class equals the target label.

    :param output: ``(batch, classes)`` score tensor
    :param target: ``(batch,)`` tensor of class indices
    :param percent: unused; kept for interface compatibility
    :return: accuracy as a float in ``[0, 1]``
    :raises AssertionError: if ``output`` and ``target`` differ in batch size
    """
    with torch.no_grad():
        assert output.shape[0] == len(target)
        predicted = output.argmax(dim=1)
        n_correct = (predicted == target).sum().item()

    return n_correct / len(target)

In [20]:
# The model's forward pass ends in log_softmax, so negative log-likelihood is
# the matching loss.
loss_fn = nn.NLLLoss()

# specify optimizer
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.002,
    weight_decay=0.01,
    amsgrad=True,
)

# Halve the learning rate every 10 scheduler steps
lr_scheduler = StepLR(optimizer, step_size=10, gamma=0.5)

In [21]:
def save_checkpoint(epoch, save_best=False):
    """
    Write the current training state to ``checkpoint_dir``.

    The state (architecture name, epoch, model and optimizer state dicts,
    best monitored value and class list) is read from the module-level
    ``model``, ``optimizer`` and ``mnt_best``. It is always saved as
    'checkpoint-current.pth', overwriting the previous one.

    :param epoch: current epoch number
    :param save_best: if True, also save the same state as 'model_best.pth'
    """
    state = {
        'arch': type(model).__name__,
        'epoch': epoch,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'monitor_best': mnt_best,
        'classes':model.classes
    }

    current_path = os.path.join(checkpoint_dir, 'checkpoint-current.pth')
    torch.save(state, current_path)
    logger.info("Saving checkpoint: {} ...".format(current_path))

    if not save_best:
        return

    torch.save(state, os.path.join(checkpoint_dir, 'model_best.pth'))
    logger.info("Saving current best: {} ...".format('model_best.pth'))
    logger.info("[IMPROVED]")

In [22]:
def valid_epoch(epoch):
    """Run one pass over ``test_dataloader`` and log the averaged results.

    Uses the module-level ``model``, ``loss_fn``, ``metrics`` and ``writer``.
    The model is put in eval mode for the pass and is always switched back to
    train mode afterwards, even if the pass raises.

    :param epoch: current epoch number, used as the logging step
    :return: dict with 'val_loss' (mean of per-batch losses) and
        'val_metrics' (list of per-batch metric means, in ``metrics`` order)
    """
    model.eval()
    total_val_loss = 0
    total_val_metrics = np.zeros(len(metrics))

    writer.set_step(epoch, 'valid')

    try:
        with torch.no_grad():
            for batch in test_dataloader:
                batch = [b.to("cuda") for b in batch]
                # The last collated item is the label tensor; the rest is
                # the model input.
                data, target = batch[:-1], batch[-1]
                data = data if len(data) > 1 else data[0]

                output = model(data)
                # Call the module rather than .forward so hooks are honoured
                loss = loss_fn(output, target)

                total_val_loss += loss.item()
                total_val_metrics += eval_metrics(output, target, metrics)

            # Add epoch metrics
            val_loss = total_val_loss / len(test_dataloader)
            val_metrics = (total_val_metrics / len(test_dataloader)).tolist()

            writer.add_scalar('loss', val_loss)
            for i, metric in enumerate(metrics):
                writer.add_scalar("%s"%metric.__name__, val_metrics[i])
    finally:
        # Never leave the model stuck in eval mode for the next training epoch
        model.train()

    return {
        'val_loss': val_loss,
        'val_metrics':val_metrics
        }

In [None]:
# number of epochs to train the model
n_epochs = 150  # upper bound; early stopping below may end training sooner

start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
checkpoint_dir = os.path.join("saved_cv", start_time, 'checkpoints')
log_dir = os.path.join("saved_cv", start_time, 'logs')

logger = logging.getLogger("SuperLogger")

writer = WriterTensorboardX(log_dir, logger, True)

# Create the checkpoint directory up front so save_checkpoint can write to it
mkdir_p(checkpoint_dir)

# Monitored quantity for best-model selection and early stopping
mnt_mode, mnt_metric = "min", "val_loss"

mnt_best = math.inf if mnt_mode == 'min' else -math.inf
early_stop = 40
# Must exist before the loop: if the very first epoch does not count as an
# improvement (e.g. a NaN validation loss), the counter is incremented before
# it was ever assigned.
not_improved_count = 0

model.train() # prep model for training

metrics = [accuracy]

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    total_metrics = np.zeros(len(metrics))
    writer.set_step(epoch)

    ###################
    # train the model #
    ###################
    _trange = tqdm(dataloader, leave=True, desc='')

    for batch_idx, batch in enumerate(_trange):
        batch = [b.to("cuda") for b in batch]
        # The last collated item is the label tensor; the rest is model input
        data, target = batch[:-1], batch[-1]
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss (call the module, not .forward, so hooks run)
        loss = loss_fn(output, target)

        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()
        total_metrics += eval_metrics(output, target, metrics)

        # Refresh the progress-bar text every sqrt(batch_size) batches
        if batch_idx % int(np.sqrt(dataloader.batch_size)) == 0:
            _str = 'Train Epoch: {} Loss: {:.6f}'.format(epoch,loss.item())
            _trange.set_description(_str)

    # calculate average loss and metrics over the epoch (mean over batches)
    loss = train_loss / len(dataloader)
    metric_epoch = (total_metrics / len(dataloader)).tolist()

    writer.add_scalar('loss', loss)
    for i, metric in enumerate(metrics):
        writer.add_scalar("%s"%metric.__name__, metric_epoch[i])

    log = {
        'loss': loss,
        'metrics': metric_epoch
    }

    print('train')
    print(log)
    print('test')

    val_log = valid_epoch(epoch)
    print(val_log)
    log = {**log, **val_log}

    if lr_scheduler is not None:
        lr_scheduler.step()

    c_lr = optimizer.param_groups[0]['lr']

    # Flatten the metric lists into one name -> value mapping for monitoring
    tot_log = {'epoch': epoch}
    for key, value in log.items():
        if key == 'metrics':
            tot_log.update({mtr.__name__ : value[i] for i, mtr in enumerate(metrics)})
        elif key == 'val_metrics':
            tot_log.update({'val_' + mtr.__name__ : value[i] for i, mtr in enumerate(metrics)})
        else:
            tot_log[key] = value

    writer.add_scalar('lr', c_lr)

    best = False
    # Once monitoring has been switched off, skip the improvement bookkeeping
    # entirely so early stopping cannot fire on a metric that is not tracked.
    if mnt_mode != 'off':
        try:
            # check whether model performance improved or not, according to specified metric(mnt_metric)
            improved = (mnt_mode == 'min' and tot_log[mnt_metric] < mnt_best) or \
                       (mnt_mode == 'max' and tot_log[mnt_metric] > mnt_best)
        except KeyError:
            logger.warning("Warning: Metric '{}' is not found. Model performance monitoring is disabled.".format(mnt_metric))
            mnt_mode = 'off'
            improved = False
            not_improved_count = 0

        if improved:
            mnt_best = tot_log[mnt_metric]
            not_improved_count = 0
            best = True
        else:
            not_improved_count += 1

        if not_improved_count > early_stop:
            logger.info("Validation performance didn\'t improve for {} epochs. Training stops.".format(early_stop))
            break

    if len(writer) > 0:
        logger.info(
            '\nRun TensorboardX:\ntensorboard --logdir={}\n'.format(log_dir))

    # Save period of 1: checkpoint after every epoch
    if epoch % 1 == 0:
        save_checkpoint(epoch, save_best=best)


HBox(children=(IntProgress(value=0, max=125), HTML(value='')))

train
{'loss': 0.9813162775039673, 'metrics': [0.542875]}
test
{'val_loss': 1.0303447898477316, 'val_metrics': [0.5146484375]}


HBox(children=(IntProgress(value=0, max=125), HTML(value='')))

train
{'loss': 0.8658858628273011, 'metrics': [0.601875]}
test
{'val_loss': 0.9524890128523111, 'val_metrics': [0.56689453125]}


HBox(children=(IntProgress(value=0, max=125), HTML(value='')))

In [None]:
# Snapshot of the final training state.
# NOTE(review): 'loss' stores the loss module object (loss_fn), not a numeric
# loss value - confirm this is intended.
torch.save(
    dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        loss=loss_fn,
        lr_scheduler=lr_scheduler.state_dict(),
    ),
    'saved_model.pth',
)

In [None]:
# Switch the model to evaluation mode before the manual inference cells below
model.eval()

In [None]:
# Pull one collated batch from the training loader and move every tensor to the GPU
tt = [t.to("cuda") for t in next(iter(dataloader))]

In [None]:
# The last item is the label tensor; everything before it is the model input
data = tt[:-1]
target = tt[-1]
if len(data) <= 1:
    data = data[0]
output = model(data)

In [None]:
# Show predictions next to the labels (`targe` was a typo that raised NameError)
output, target

In [None]:
## Define the NN architecture
class Net(nn.Module):
    """Three-layer fully connected classifier returning raw class scores.

    The network outputs unnormalized scores (logits) for 4 classes. It is
    trained with ``nn.CrossEntropyLoss``, which applies log-softmax itself,
    so a Softmax layer at the end of the network would normalize twice.
    Apply ``torch.softmax(out, dim=1)`` to the output when probabilities are
    needed.

    :param n_features: number of input features per sample after flattening
    :param n_hid: unused; kept for interface compatibility
    :param dropout: unused; kept for interface compatibility
    """

    def __init__(self, n_features, n_hid, dropout):
        super(Net, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_features, 128),
            nn.Tanh(),
            nn.Linear(128, 64),
            nn.Tanh(),
            nn.Linear(64, 4),
        )

    def forward(self, x):
        """Flatten each sample to a vector and return ``(batch, 4)`` logits."""
        batch_size = x.shape[0]
        x = x.contiguous().view(batch_size, -1)
        return self.layers(x)
      
# initialize the NN
# NOTE(review): this rebinds `model`, replacing the AudioCRNN instance that
# the earlier cells trained - confirm that is intended.
model = Net(n_features=5632, n_hid=250, dropout=0.1).cuda()
print(model)

In [None]:
# CrossEntropyLoss applies log-softmax internally, so it expects the model to
# supply unnormalized class scores.
loss_fn = nn.CrossEntropyLoss()

# specify optimizer: plain SGD over the current model's parameters
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.05)

In [None]:
# number of epochs to train the model
n_epochs = 3  # suggest training between 20-50 epochs

model.train() # prep model for training

# Not used by the loop below
lambda1 = 0.07
lambda2 = 0.9
alpha1 = 0.0001
alpha2 = 1


for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0

    ###################
    # train the model #
    ###################
    # NOTE(review): this loop unpacks two items per batch, while pad_seq
    # collates four (seqs, lengths, srs, labels) - confirm which DataLoader
    # is meant to be bound to `dataloader` when this cell runs.
    for data, target in dataloader:
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        data = data.cuda()
        target = target.cuda()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss (call the module, not .forward, so hooks run)
        # NOTE(review): `target-1` assumes 1-based labels - confirm against
        # the label source.
        loss = loss_fn(output, target-1)

        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()
        print(train_loss)

    # calculate average loss over an epoch: each loss.item() is already a
    # per-batch mean, so average over the number of batches, not the number
    # of samples
    train_loss = train_loss/len(dataloader)

    print('Epoch: {} \tTraining Loss: {:.6f}'.format(
        epoch+1, 
        train_loss
        ))