In [1]:
# Parameters
# Position threshold inside `mv2.features`: Task5Model re-initialises every
# child module whose index is greater than this value. The value is also used
# as the suffix of the checkpoint filename written by the training loop.
until_x = 5


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table. After ``reset_index()`` it must expose an
        ``audio_filename`` column. Rows sharing a filename are numbered
        1, 2, ... in order of appearance and treated as separate annotations
        of that file. Every file needs at least ``n_annotators`` rows,
        otherwise the per-annotator slices cannot be stacked. Extra rows
        beyond ``n_annotators`` are ignored. The caller's frame is not
        modified.
    unknown_to_known : dict
        Maps each "unknown" column name to the list of known label columns
        that must be masked out when that unknown column is > 0.5. The
        unknown columns themselves are removed from the targets.
    data_dir : str
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int
        Maximum number of rows kept from each transposed array. All files
        must yield the same number of rows after truncation.
    n_annotators : int
        Number of annotation rows per file stacked on the last axis.

    Returns
    -------
    X : numpy.ndarray, shape (n_files, 1, frames, features)
    y : numpy.ndarray, shape (n_files, n_labels, n_annotators)
    y_mask : numpy.ndarray, same shape as ``y``; 1 keeps a target in the
        loss, 0 excludes it.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the annotations of each file 1..k and index by (file, number).
    df = df.reset_index()
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Separate the "unknown" indicator columns from the real targets.
    df_unknown = df.loc[:, unknown_cols]
    df = df.drop(columns=unknown_cols)

    # Start with everything unmasked, then zero the known labels wherever the
    # corresponding unknown indicator fired for that annotation.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that each annotator's rows form a
    # contiguous block sorted by filename (same file order in every block).
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotators = range(1, n_annotators + 1)
    y = np.stack([df.loc[[k], :].values for k in annotators], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in annotators], axis=2)

    # Load the features in the same file order as the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing augmenter (project-local class, built with its
# default arguments); AudioDataset.__getitem__ applies it to augmented samples.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (sample, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Samples, indexed along the first axis. The augmentation path
        concatenates along ``dim=1`` with three-axis slicing, so each sample
        is expected to be 3-D (assumed (1, time, features) -- confirm with
        the caller).
    y : indexable
        Targets, indexed along the first axis.
    weights : indexable
        Per-target weights / masks, indexed along the first axis.
    transform : callable or None
        Called as ``transform(image=ndarray)`` and expected to return a dict
        whose ``'image'`` entry is a 2-D tensor. When falsy, samples are
        returned untouched.
    erasing : callable or None
        Applied to the augmented sample tensor. When ``None`` the
        module-level ``random_erasing`` object is used.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of X)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting if configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation to [0, 1] so the sample can go through
            # the image-based augmentations
            this_min = sample.min()
            span = sample.max() - this_min
            if span == 0:
                # constant sample: avoid 0/0 = NaN, keep the value as-is
                span = torch.ones_like(span)
            sample = (sample - this_min) / span

            # randomly cycle the file along dim 1 (circular shift)
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms on the image representation
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)['image']
            # add the leading axis back and swap the last two axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing on a detached copy
            eraser = self.erasing if self.erasing is not None else random_erasing
            sample = eraser(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * span) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy rows into the 'X' fine ids and all the others.
taxonomy = metadata['taxonomy_df']
unknown_fine = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# Pair each 'X' fine label with the other fine labels of the same coarse
# class; the merge names the two 'fine' columns fine_x / fine_y.
paired_fine = pd.merge(unknown_fine, known_fine, on='coarse', how='inner')
paired_fine = paired_fine.drop(columns='coarse')
unknown_to_known = (
    paired_fine
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x))
    .to_dict())
known_labels = known_fine.fine.tolist()

# One row per annotation, coarse and fine label columns side by side.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point: every row whose values sum to 37 is
# zeroed out in place, in both splits.
# NOTE(review): 37 presumably equals the number of label columns (i.e. a row
# with every label set) -- confirm.
for annotations in (train_df, valid_df):
    saturated_rows = annotations.sum(axis=1) == 37
    annotations.loc[saturated_rows, :] = 0

In [6]:
# Build the arrays for both splits. Per prepare_data, X is
# (files, 1, frames, features) and y / y_mask are (files, labels, 3).
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean / std are computed per position of the last axis, on the training set
# only, and the same statistics are applied to the validation set.
# NOTE: this cell overwrites train_X / valid_X, so re-running it normalises
# the already-normalised arrays again.
n_channels = train_X.shape[-1]  # derived from the data instead of a hard-coded 128
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Passed to the training AudioDataset, which calls it as
# `transform(image=...)` and reads the 'image' entry of the result.
# NOTE(review): the limit arguments follow albumentations' conventions
# (fractions for shift/scale, degrees for rotation) -- confirm against the
# installed version.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    # converts the augmented array to a tensor (last pipeline step)
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is usable, otherwise fall back to the CPU so the
# notebook still runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise `layer` when it offers a `reset_parameters` method.

    Objects without that attribute are left untouched. Intended for use with
    `module.apply(weight_reset)`.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Single-channel input -> 3-channel adapter -> MobileNetV2 features -> MLP head.

    The adapter (`bw2col`) uses 1x1 convolutions to turn the one-channel input
    into the three channels fed to the MobileNetV2 feature extractor. The
    feature maps are max-pooled over their last two axes and passed to a small
    fully connected head that emits `num_classes` values per sample.

    Children of `mv2.features` whose position is greater than the module-level
    `until_x` are re-initialised via `weight_reset`; earlier ones keep the
    weights loaded with `pretrained=True`.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels with kernel size 1
        adapter_layers = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*adapter_layers)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # 1280 pooled features -> 512 -> num_classes
        head_layers = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head_layers)

        # Reset after ith layer of mv2
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the head's output for a batch `x` fed through the adapter."""
        three_channel = self.bw2col(x)
        feature_maps = self.mv2.features(three_channel)
        # max over the last axis, then over the (new) last axis
        pooled, _ = feature_maps.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 outputs per sample, moved to the selected device.
# NOTE(review): 31 presumably matches the number of label columns kept by
# prepare_data (train_y.shape[1]) -- confirm.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped with the validation loss once per epoch by the training loop; the
# logged run shows the learning rate dropping by 10x at each reduction.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them by the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss before stopping
alpha = 1  # Beta(alpha, alpha) parameter used to draw the mixup coefficients
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE loss averaged over all unmasked (label, annotation) entries.

    `outputs` is (batch, labels); `labels` and `masks` are
    (batch, labels, annotations). The same outputs are scored against every
    annotation slice, each element-wise loss is multiplied by its mask, and
    the total is divided by the sum of the masks.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently, so each step
    # gets two different batches to mix.
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # one mixing coefficient per sample, shared by inputs, labels and masks
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # average of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # report before the early-stopping check so the last epoch is logged too
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # checkpoint whenever the validation loss reaches a new minimum
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6084714088891003 0.39024095450128826
Epoch:  1


0.28216278794649485 0.19942122484956468
Epoch:  2


0.18057541831119642 0.16920818175588334
Epoch:  3


0.1710783521065841 0.16428435274532863
Epoch:  4


0.1662845015525818 0.1938800811767578
Epoch:  5


0.1630036254992356 0.14781642385891505
Epoch:  6


0.1611119718165011 0.14758510249001638
Epoch:  7


0.1617759570882127 0.1480355135032109
Epoch:  8


0.15996268230515556 0.1370909320456641
Epoch:  9


0.15865143932200768 0.13576522575957434
Epoch:  10


0.15726463979965932 0.14563090673514775
Epoch:  11


0.15833420608494733 0.13419342041015625
Epoch:  12


0.1562122609969732 0.13544949676309312
Epoch:  13


0.155224434830047 0.14101115933486394
Epoch:  14


0.1556276215894802 0.1405865784202303
Epoch:  15


0.15466898478366234 0.15289335165705
Epoch:  16


0.1547954219418603 0.1707575704370226
Epoch:  17


0.15210764472549027 0.13687732922179358
Epoch    17: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  18


0.15157185696266792 0.12708560590233123
Epoch:  19


0.1515064863739787 0.1262184509209224
Epoch:  20


0.15114975861600927 0.12611684522458486
Epoch:  21


0.15069090997850573 0.12506112030574254
Epoch:  22


0.15063362347113118 0.1254437959619931
Epoch:  23


0.1510801033393757 0.12528965728623526
Epoch:  24


0.15057394432054982 0.12485648904527936
Epoch:  25


0.1503935484467326 0.12443250098398753
Epoch:  26


0.14987417531980052 0.1253213243825095
Epoch:  27


0.1500891004059766 0.12557385861873627
Epoch:  28


0.14986065875839544 0.1253004584993635
Epoch:  29


0.14935502409934998 0.12490610246147428
Epoch:  30


0.15073279351801486 0.1257586862359728
Epoch:  31


0.14932541267291918 0.1239161981003625
Epoch:  32


0.15014478242075122 0.12476871801274163
Epoch:  33


0.14942464554632032 0.12408810321773801
Epoch:  34


0.1499646185217677 0.12416037704263415
Epoch:  35


0.14981700399437467 0.12351161347968238
Epoch:  36


0.14994343954163628 0.12450980714389256
Epoch:  37


0.1482715429486455 0.12438686830656868
Epoch:  38


0.14846880492326375 0.12386733932154519
Epoch:  39


0.1491578805285531 0.12331698302711759
Epoch:  40


0.14941737418239182 0.1241058920110975
Epoch:  41


0.14930872981612747 0.12426393585545677
Epoch:  42


0.14877283532877225 0.124118230172566
Epoch:  43


0.14840771620338028 0.12366591819695064
Epoch:  44


0.14834225137491483 0.12400708560432706
Epoch:  45


0.14764991885906942 0.1240776926279068
Epoch    45: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  46


0.1489494660416165 0.12374134893928256
Epoch:  47


0.14725040704817385 0.12403256339686257
Epoch:  48


0.1470743039169827 0.1237269618681499
Epoch:  49


0.14882490361059034 0.12344760341303689
Epoch:  50


0.14846815128584165 0.1236860773393086
Epoch:  51


0.14715157086784775 0.12354840763977595
Epoch    51: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  52


0.14931871278865919 0.12340871563979558
Epoch:  53


0.14722600458441554 0.1233249625989369
Epoch:  54


0.14801090512726758 0.12355505568640572
Epoch:  55


0.14687582689362602 0.1236401881490435
Epoch:  56


0.1474312181408341 0.12338582639183317
Epoch:  57


0.1479587043459351 0.12331441257681165
Epoch    57: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  58


0.14831243978964315 0.12355847018105644
Epoch:  59


0.14772675609266436 0.12348857309137072
Epoch:  60


0.14819385957073522 0.1232889752302851
Epoch:  61


0.14761052139707515 0.12355598488024302
Epoch:  62


0.1478676634865838 0.12360922460045133
Epoch:  63


0.14749310427420848 0.1235251362834658
Epoch:  64


0.14830016284375577 0.12346827877419335
Epoch:  65


0.14955022689458486 0.12350592230047498
Epoch:  66


0.1484628392232431 0.12348918829645429
Epoch    66: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  67


0.14845918884148468 0.12344042211771011
Epoch:  68


0.14700156853005691 0.12327264249324799
Epoch:  69


0.14747399937462163 0.12342125603130885
Epoch:  70


0.14765451365226023 0.1236023445214544
Epoch:  71


0.14680333918816335 0.12339424235480172
Epoch:  72


0.147609200429272 0.12340110008205686
Epoch:  73


0.14766552601311658 0.12343935987779073
Epoch:  74


0.14927656384738716 0.12359185623271125
Epoch:  75


0.14828059117536288 0.12361349271876472
Epoch:  76


0.14836616572496053 0.1235217856509345
Epoch:  77


0.1485502172160793 0.12320184601204735
Epoch:  78


0.14893691000100728 0.12380312702485494
Epoch:  79


0.14966369118239428 0.12324828015906471
Epoch:  80


0.14827499236609484 0.12345286032983235
Epoch:  81


0.14708218864492467 0.12358803089175906
Epoch:  82


0.14823952840792165 0.12334591043846947
Epoch:  83


0.14779572712408529 0.12347086944750377
Epoch:  84


0.14780514828256658 0.12352381646633148
Epoch:  85


0.1472391566714725 0.12324265816381999
Epoch:  86


0.14895942082276215 0.12359983367579323
Epoch:  87


0.1476330153039984 0.12342213307108198
Epoch:  88


0.14761925669940743 0.12337786704301834
Epoch:  89


0.14868207114773827 0.12322216480970383
Epoch:  90


0.14774273899761406 0.12370881651129041
Epoch:  91


0.14831369390358795 0.12318651697465352
Epoch:  92


0.14939004303635778 0.12325462060315269
Epoch:  93


0.1472532261867781 0.1234897928578513
Epoch:  94


0.14786830101464246 0.12346677375691277
Epoch:  95


0.14810575947568222 0.12335344510419029
Epoch:  96


0.14868810329888318 0.12353774798767907
Epoch:  97


0.14744886675396482 0.12341680484158653
Epoch:  98


0.14721151582292608 0.12348368231739316
Epoch:  99


0.14750173647661466 0.1233992480805942
