In [1]:
# Parameters
# until_x: index of the last block of the MobileNetV2 feature extractor that
# keeps its pretrained weights. Task5Model re-initialises every feature block
# whose index is greater than this value. The value is also embedded in the
# checkpoint file name written by the training loop.
until_x = 2


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, num_annotations=3, num_frames=635,
                 spec_dir='../../data/logmelspec'):
    """Build model inputs, targets and loss masks from an annotation table.

    The caller's ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. ``audio_filename`` must be available as a
        column after ``df.reset_index()``; every other column is treated as a
        label column.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the known label
        column(s) that must be ignored in the loss whenever the unknown
        column is greater than 0.5. The unknown columns themselves are
        dropped from the targets.
    num_annotations : int, optional
        Number of annotations kept per clip (the first ones in row order).
        Extra annotations are silently ignored. Defaults to 3.
    num_frames : int, optional
        Number of leading rows kept from each transposed spectrogram
        (assumed to be time frames). Defaults to 635.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` file per clip.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_clips, 1, num_frames, n_columns)``.
    y : numpy.ndarray
        Shape ``(n_clips, n_labels, num_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where a label contributes to the loss and 0
        where it is masked out.

    Raises
    ------
    ValueError
        If any clip has fewer than ``num_annotations`` annotation rows.
    """
    unknown_cols = list(unknown_to_known.keys())
    slots = list(range(1, num_annotations + 1))

    df = df.reset_index()
    # Number the annotations of every clip 1, 2, 3, ... in row order.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Fail early with a readable message instead of a shape mismatch below.
    rows_per_clip = df.groupby(level='audio_filename').size()
    if (rows_per_clip < num_annotations).any():
        raise ValueError(
            'every audio file needs at least {} annotation rows'.format(
                num_annotations))

    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from "everything counts" and switch off the known labels that
    # an annotator replaced by the matching "unknown" label.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so that every annotation slot lists
    # the clips in the same sorted order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    filenames = (df.loc[[1], :].index
                 .get_level_values('audio_filename').tolist())
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:num_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (constructed with its default arguments);
# AudioDataset.__getitem__ applies it as the last augmentation step.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with per-annotation targets and masks.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis; each item is expected to be a
        ``(1, rows, cols)`` tensor.
    y : torch.Tensor
        Targets, indexed along the first axis like ``X``.
    weights : torch.Tensor
        Loss masks/weights, indexed along the first axis like ``X``.
    transform : callable, optional
        Augmentation called as ``transform(image=array)`` and returning a
        dict with an ``'image'`` entry. When falsy, items are returned
        unchanged.
    """

    # Smallest value range used for min-max scaling; keeps constant samples
    # from producing a division by zero (and therefore NaNs).
    _MIN_SPAN = 1e-8

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Tensor -> PIL image converter used before the image augmentations.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of items (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weights)`` for item ``idx``.

        With a transform set, the sample is min-max scaled, cyclically
        shifted along axis 1 by a random offset, passed through the image
        transform and the module-level ``random_erasing``, and finally
        mapped back to its original value range.
        """
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            # A constant sample has max == min; clamping the span keeps the
            # scaling finite instead of filling the sample with NaNs.
            span = (this_max - this_min).clamp(min=self._MIN_SPAN)
            sample = (sample - this_min) / span

            # randomly cycle the file
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # The transform output is 2-D with its axes swapped relative to
            # the input; restore the (1, rows, cols) layout.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * span) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Re-initialise ``layer`` through its own ``reset_parameters`` method.

    Objects that do not expose ``reset_parameters`` are left untouched, which
    makes the function safe to pass to ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()


class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    A small stack of 1x1 convolutions turns the one-channel input into three
    channels, the MobileNetV2 ``features`` stack processes it, the result is
    max-pooled over its two trailing axes and a two-layer head produces
    ``num_classes`` outputs.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels with 1x1 kernels, after input batch norm.
        to_three_channels = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*to_three_channels)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Feature blocks with an index above the module-level ``until_x``
        # lose their pretrained weights and are re-initialised.
        for block_idx, block in enumerate(self.mv2.features.children()):
            if block_idx <= until_x:
                continue
            block.apply(weight_reset)

        head = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head)

    def forward(self, x):
        """Return one output per class for every item in the batch ``x``."""
        three_channel = self.bw2col(x)
        feature_map = self.mv2.features(three_channel)
        # Max over the last axis, then over the new last axis.
        pooled = feature_map.max(dim=-1).values.max(dim=-1).values
        return self.final(pooled)

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only open a metadata file
# that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy = metadata['taxonomy_df']
is_unknown = taxonomy.fine_id == 'X'
is_known = taxonomy.fine_id != 'X'

# Pair every "unknown" fine label (fine_id 'X') with the known fine labels
# that share its coarse category. The merge suffixes the two `fine` columns
# as fine_x (unknown side) and fine_y (known side).
unknown_known_pairs = pd.merge(
    taxonomy.loc[is_unknown, ['fine', 'coarse']],
    taxonomy.loc[is_known, ['fine', 'coarse']],
    on='coarse',
    how='inner',
).drop(columns='coarse')
unknown_to_known = (
    unknown_known_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict())
known_labels = taxonomy.loc[is_known].fine.tolist()

# Coarse and fine annotation tables side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Rows whose label values add up to 37 are reset to all zeros, in place.
# NOTE(review): 37 is presumably the total number of label columns, i.e. a
# row with every label set — confirm.
for labels_df in (train_df, valid_df):
    sums_to_37 = labels_df.sum(axis=1) == 37
    labels_df.loc[sums_to_37, :] = 0

In [6]:
# Build the input array, the targets and the loss masks for each split.
# prepare_data reads one spectrogram .npy file per clip from disk, so this
# cell needs the data directory to be present.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed on the training split only, one value per entry of
# the last axis, and then applied to both splits.
# NOTE: this cell overwrites train_X / valid_X; re-run the data preparation
# cell before running it a second time.
n_channels = train_X.shape[-1]  # taken from the data instead of hard-coding 128
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
# A channel with zero variance would otherwise turn into NaN/inf values.
channel_stds = np.where(channel_stds == 0, 1.0, channel_stds)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# AudioDataset.__getitem__ calls this pipeline as transform(image=array) on
# the NumPy image obtained from the PIL conversion of a min-max scaled
# sample, and reads the result from the 'image' key.
# NOTE(review): the limit arguments are presumably symmetric ranges (shift
# and scale as fractions, rotation in degrees) — confirm against the
# albumentations documentation for the installed version.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    # Converts the augmented image back to a torch tensor.
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation items are returned as is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(dataset=valid_dataset, batch_size=96, shuffle=False)
# Two independently shuffled views of the same training set; the training
# loop zips them to obtain the pairs of batches that mixup blends.
train_loader_1 = DataLoader(dataset=train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(dataset=train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is usable and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# The model gets 31 outputs and is moved to the selected device.
# NOTE(review): 31 presumably equals the number of label columns that remain
# after prepare_data drops the "unknown" columns (train_y.shape[1]) — confirm.
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# The training loop feeds the epoch validation loss to this scheduler.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    patience=5,
    verbose=True,
)
# Element-wise losses are kept so that the training loop can apply the
# per-label masks before reducing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
alpha = 1  # Beta(alpha, alpha) parameter used to draw the mixup coefficients
early_stop_patience = 15  # epochs without a new best validation loss
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE loss over all annotation sets.

    ``labels`` and ``masks`` carry one slice per annotation set on their last
    axis. The element-wise ``criterion`` is evaluated against every slice,
    weighted by the matching mask slice, summed, and divided by the sum of
    all mask values.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0.0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One coefficient per item; the same coefficient blends the inputs,
        # the labels and the masks of the paired batches.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        # .item() yields a plain Python float, so the running sum does not
        # depend on NumPy's scalar promotion rules.
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so that the last epoch is
    # logged as well.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stop_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6521854066848755 0.5822560310363769
Epoch:  1


0.4309770941734314 0.3126558601856232
Epoch:  2


0.22625182092189788 0.1883280873298645
Epoch:  3


0.18330811560153962 0.17510195970535278
Epoch:  4


0.1767992502450943 0.15772006213665007
Epoch:  5


0.17271261930465698 0.16955406665802003
Epoch:  6


0.1710117256641388 0.15991236567497252
Epoch:  7


0.17023927748203277 0.150660902261734
Epoch:  8


0.16691453218460084 0.1480327546596527
Epoch:  9


0.16715433001518248 0.16196925342082977
Epoch:  10


0.16387353956699371 0.14242788553237914
Epoch:  11


0.16247423350811005 0.14593563079833985
Epoch:  12


0.16221482932567596 0.14320239126682283
Epoch:  13


0.16017019093036652 0.14537941217422484
Epoch:  14


0.16063049912452698 0.13483766317367554
Epoch:  15


0.1589711129665375 0.14019994735717772
Epoch:  16


0.15867773771286012 0.1466401070356369
Epoch:  17


0.15762884497642518 0.1347535103559494
Epoch:  18


0.15840909898281097 0.13685569614171983
Epoch:  19


0.1564149832725525 0.13431403934955596
Epoch:  20


0.15617566108703612 0.13405018150806428
Epoch:  21


0.15682647109031678 0.13086087107658387
Epoch:  22


0.1544857567548752 0.13257291913032532
Epoch:  23


0.15488975882530212 0.13484843373298644
Epoch:  24


0.15565379619598388 0.13041550368070604
Epoch:  25


0.15458730101585388 0.1307341754436493
Epoch:  26


0.15438727378845216 0.1316344618797302
Epoch:  27


0.1547226071357727 0.13142624199390412
Epoch:  28


0.15446260273456575 0.1299199715256691
Epoch:  29


0.1531117796897888 0.1340964302420616
Epoch:  30


0.1533650940656662 0.13518451005220414
Epoch:  31


0.15186841607093812 0.12826205492019654
Epoch:  32


0.15305612206459046 0.13019149154424667
Epoch:  33


0.1534897530078888 0.12828124165534974
Epoch:  34


0.15224810361862182 0.13124027997255325
Epoch:  35


0.15189466178417205 0.1279750406742096
Epoch:  36


0.15252774238586425 0.12858833819627763
Epoch:  37


0.15176943361759185 0.12689449340105058
Epoch:  38


0.15023768365383147 0.13021322935819626
Epoch:  39


0.15334569454193114 0.12982620000839235
Epoch:  40


0.15054392874240874 0.12761698216199874
Epoch:  41


0.15142225444316865 0.12900580912828447
Epoch:  42


0.14919954001903535 0.1290869042277336
Epoch:  43


0.14909207999706267 0.1290661185979843
Epoch    43: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  44


0.14950319170951842 0.12479275315999985
Epoch:  45


0.1490427190065384 0.12518837600946425
Epoch:  46


0.14955606579780578 0.12515528947114946
Epoch:  47


0.14780801653862 0.12447333186864853
Epoch:  48


0.14823819518089296 0.12467252910137176
Epoch:  49


0.14842062652111054 0.12427377998828888
Epoch:  50


0.1480450475215912 0.12399738729000091
Epoch:  51


0.14674893260002136 0.12434176057577133
Epoch:  52


0.1477384662628174 0.12407976537942886
Epoch:  53


0.14725729405879975 0.12401725500822067
Epoch:  54


0.14748814582824707 0.12424307763576507
Epoch:  55


0.14761744022369386 0.12412195503711701
Epoch:  56


0.14664308726787567 0.12406691163778305
Epoch    56: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  57


0.1481398469209671 0.12400144040584564
Epoch:  58


0.14733923196792603 0.12391374260187149
Epoch:  59


0.14688200414180755 0.12380803823471069
Epoch:  60


0.14611486554145814 0.123955737054348
Epoch:  61


0.14722223103046417 0.12382982224225998
Epoch:  62


0.14744139730930328 0.12405849695205688
Epoch:  63


0.1461624151468277 0.12395346462726593
Epoch:  64


0.14612244427204132 0.12383700311183929
Epoch:  65


0.14618613719940185 0.12399670332670212
Epoch    65: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  66


0.14571299850940705 0.1238915592432022
Epoch:  67


0.1461608165502548 0.12386782616376876
Epoch:  68


0.1474409019947052 0.12374176532030105
Epoch:  69


0.14665946662425994 0.12378368377685547
Epoch:  70


0.1462584763765335 0.1237529993057251
Epoch:  71


0.14669287145137788 0.12405551075935364
Epoch:  72


0.1477050256729126 0.1239153727889061
Epoch:  73


0.14651399612426758 0.12423321306705475
Epoch:  74


0.14671824753284454 0.12386804819107056
Epoch    74: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  75


0.14677367508411407 0.12404572665691375
Epoch:  76


0.14686240673065185 0.12397892773151398
Epoch:  77


0.14661842107772827 0.1240443006157875
Epoch:  78


0.14757500171661378 0.12379592657089233
Epoch:  79


0.14608902275562285 0.123848956823349
Epoch:  80


0.14679610788822173 0.1239173486828804
Epoch    80: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  81


0.14595671653747558 0.12405903339385986
Epoch:  82


0.1471814638376236 0.12394913136959076
Epoch:  83
