In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec',
                 n_frames=635,
                 n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column; all remaining columns are label columns.
        Every clip is expected to have annotation rows 1..``n_annotations``
        (rows are numbered per clip in their order of appearance).
    unknown_to_known : dict
        Maps each "unknown" label column to the list of label columns that
        must be masked out whenever that unknown column is > 0.5.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` feature file per clip.
    n_frames : int, optional
        Number of leading frames kept from each (transposed) feature array.
    n_annotations : int, optional
        Number of annotation slots stacked along the last axis of the targets.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_clips, 1, frames, bins)``; each loaded array is transposed
        and cropped to its first ``n_frames`` rows.
    y : numpy.ndarray
        Shape ``(n_clips, n_labels, n_annotations)`` with the label values.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label contributes to the loss and 0
        where an "unknown" annotation hides it.
    """
    df = df.reset_index()
    # Number the annotations of each clip 1, 2, 3, ... in row order.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Separate the "unknown" columns: they only drive the mask, not the targets.
    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from an all-ones mask and zero out the labels hidden by an
    # active "unknown" annotation.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that .loc[[k]] selects the k-th
    # annotation of every clip, sorted by file name.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.concatenate(
        [df.loc[[k], :].values[..., np.newaxis] for k in annotation_ids],
        axis=2)
    y_mask = np.concatenate(
        [y_mask.loc[[k], :].values[..., np.newaxis] for k in annotation_ids],
        axis=2)

    # Load the features in the same clip order as the rows of y / y_mask.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (default arguments); it is applied by
# AudioDataset.__getitem__ to every sample that goes through augmentation.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of feature tensors with optional training-time augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis. Each item is sliced as a
        3-D tensor (presumably ``(1, frames, bins)`` -- confirm with caller).
    y : torch.Tensor
        Targets, indexed along the first axis like ``X``.
    weights : torch.Tensor
        Per-target loss weights / masks, indexed like ``X``.
    transform : callable or None
        Albumentations-style callable invoked as ``transform(image=...)`` and
        returning a dict with an ``'image'`` entry. When falsy, samples are
        returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converts the (scaled) tensor to a PIL image for the image transforms.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weights)`` for ``idx``.

        With a transform set, the sample is min-max scaled, cyclically
        shifted along axis 1, passed through the image transforms and the
        module-level ``random_erasing``, then mapped back to its original
        value range.
        """
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            value_range = sample.max() - this_min
            # A constant sample has zero range; dividing by it would fill the
            # sample with NaNs, so scale by 1 instead (the sample maps to 0
            # and is restored to its constant value below).
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on an image array)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the two trailing axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE(review): pickle.load executes whatever the file tells it to; only open
# a metadata file that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the "unknown" fine labels (fine_id == 'X') and all
# other fine labels, keeping only the columns needed for the join.
taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# For each unknown label, list the known fine labels sharing its coarse class.
label_pairs = (
    unknown_rows
    .merge(known_rows, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = (
    label_pairs
    .groupby('fine_x')['fine_y']
    .apply(lambda labels: list(labels))
    .to_dict())
known_labels = known_rows.fine.tolist()


def _side_by_side(coarse, fine):
    """Join the coarse and fine annotation tables column-wise on their index."""
    return pd.concat([coarse, fine], axis=1, sort=True)


train_df = _side_by_side(metadata['coarse_train'], metadata['fine_train'])
valid_df = _side_by_side(metadata['coarse_test'], metadata['fine_test'])

In [4]:
# manual correction for one data point
# Rows whose values add up to exactly 37 are reset to all zeros, in both the
# training and the validation table (modified in place).
for annotations in (train_df, valid_df):
    row_sums_to_37 = annotations.sum(axis=1) == 37
    annotations.loc[row_sums_to_37, :] = 0

In [5]:
# Build inputs (X), targets (y) and loss masks (y_mask) for both splits.
# prepare_data loads one .npy feature file per clip from disk.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics are computed per channel (last axis) on the training set only
# and then applied to both splits. The channel count is read from the data
# instead of being hard-coded.
n_channels = train_X.shape[-1]
train_flat = train_X.reshape(-1, n_channels)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# The pipeline is handed to AudioDataset, which calls it as
# transform(image=...) and reads the 'image' entry of the result.
albumentations_transform = Compose([
    # random shift / scale / rotation, each bounded by the given limit
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library's default settings
    GridDistortion(),
    # final conversion step; AudioDataset treats the output as a torch tensor
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
BATCH_SIZE = 64

train_tensors = [torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask)]

# Only the training set is augmented; validation samples are served as-is.
train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to get the pairs of batches that are blended by mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is actually usable; otherwise fall back to the
# CPU so the notebook still runs on a machine without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it provides ``reset_parameters``.

    Objects without a ``reset_parameters`` attribute are left untouched, so
    the function can be passed to ``Module.apply`` and visit every submodule.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default for the notebook parameter `until_x`: Task5Model re-initialises the
# MobileNetV2 feature blocks whose index is <= until_x. The next cell
# overrides this value.
until_x = None

In [12]:
# Parameters
# Overrides the default above (presumably injected by a notebook
# parameterisation tool -- confirm). The value also ends up in the
# checkpoint file name written by the training loop.
until_x = 6


In [13]:
class Task5Model(nn.Module):
    """Single-channel input -> MobileNetV2 features -> multi-label scores.

    Parameters
    ----------
    num_classes : int
        Size of the output layer.
    reset_until : int or None, optional
        Index of the last block of ``mv2.features`` whose weights are
        re-initialised (blocks ``0..reset_until`` inclusive lose their
        pretrained weights). When omitted, the module-level ``until_x`` is
        used, as before. If the resulting limit is ``None`` no block is reset.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # Maps the 1-channel input to the 3 channels MobileNetV2 expects,
        # using 1x1 convolutions.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        # Pretrained backbone; only its `features` part is used in forward().
        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classification head on the 1280 pooled backbone features.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2.
        # Fall back to the notebook-level parameter when no explicit limit is
        # given; a limit of None means "keep all pretrained weights" rather
        # than failing on the `i <= None` comparison.
        if reset_until is None:
            reset_until = until_x
        if reset_until is not None:
            for i, block in enumerate(self.mv2.features.children()):
                if i <= reset_until:
                    block.apply(weight_reset)

    def forward(self, x):
        """Return one raw score per class for every item of the batch."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling over the two trailing (spatial) axes.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 is num_classes, i.e. the size of the model's output layer; the model is
# moved to the device selected above.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Lowers the learning rate when the metric passed to scheduler.step() (the
# validation loss in the training loop) stops improving; verbose=True prints
# a message on every reduction.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def masked_bce_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over all unmasked label/annotation entries.

    ``outputs`` holds one score per label; ``labels`` and ``masks`` carry an
    extra trailing axis with one slice per set of annotations. The
    element-wise loss of the outputs against each slice is weighted by the
    matching mask slice, summed, and divided by the total mask weight.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs allowed without a new best validation loss
mixup_alpha = 1               # Beta(alpha, alpha) parameter of the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One blending weight per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(criterion, outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation (no augmentation, no mixup, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Both figures are means of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping; note that the losses of the stopping epoch are recorded
    # in the histories above but not printed.
    if epochs_without_new_lowest >= early_stopping_patience:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6346722309653824 0.4886224695614406
Epoch:  1


0.3346064106838123 0.2008644938468933
Epoch:  2


0.18821315467357635 0.16923268352236068
Epoch:  3


0.17706194721363686 0.16641149137701308
Epoch:  4


0.17351532546249596 0.16119491841111863
Epoch:  5


0.17401006374810193 0.15778728680951254
Epoch:  6


0.1711261441578736 0.15070314066750662
Epoch:  7


0.1672944958145554 0.15224005494798934
Epoch:  8


0.166593998670578 0.15115490768636977
Epoch:  9


0.16667044001656608 0.14698361498968943
Epoch:  10


0.16502567038342758 0.1437239476612636
Epoch:  11


0.16316065393589638 0.1403627853308405
Epoch:  12


0.16384731434487007 0.14145721388714655
Epoch:  13


0.16073206389272535 0.13784918827669962
Epoch:  14


0.16170636906817154 0.14264445645468576
Epoch:  15


0.16043692746677915 0.13930439416851317
Epoch:  16


0.15950284938554507 0.1365287144269262
Epoch:  17


0.15965068541668556 0.13547250096287047
Epoch:  18


0.15832950940003265 0.13397518119641713
Epoch:  19


0.15758386334857424 0.1330446249672345
Epoch:  20


0.15784265866150726 0.13536670484713145
Epoch:  21


0.15776319842080813 0.13435174099036626
Epoch:  22


0.15678320301545634 0.13659720016377314
Epoch:  23


0.15745376453206344 0.13135137728282384
Epoch:  24


0.1563000981066678 0.13046804070472717
Epoch:  25


0.1564434126422212 0.13646094926765986
Epoch:  26


0.15666842500905734 0.13375930381672724
Epoch:  27


0.15518231246922468 0.1295713803597859
Epoch:  28


0.15461932686535088 0.13654584331171854
Epoch:  29


0.15455299736680211 0.1323873262320246
Epoch:  30


0.1548977735880259 0.1336219310760498
Epoch:  31


0.1544828040374292 0.13260180290256227
Epoch:  32


0.15295448536808426 0.13112098830086844
Epoch:  33


0.15484411450656685 0.13031593710184097
Epoch    33: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  34


0.1529479425501179 0.12724964852843965
Epoch:  35


0.15154152504495672 0.12536394063915526
Epoch:  36


0.1510862055662516 0.1257558496935027
Epoch:  37


0.14974159445311572 0.12528091030461447
Epoch:  38


0.15067235320001035 0.12539210596254893
Epoch:  39


0.14977157357576731 0.12500970278467452
Epoch:  40


0.1498350058052991 0.1246886860047068
Epoch:  41


0.1495945920815339 0.12390785451446261
Epoch:  42


0.1495260169377198 0.12489771630082812
Epoch:  43


0.14926936578106237 0.12441337215048927
Epoch:  44


0.15051864409768903 0.12425212774957929
Epoch:  45


0.1497786008828395 0.12484112807682582
Epoch:  46


0.14996310383886904 0.12471108244998115
Epoch:  47


0.14938514458166585 0.12419529472078596
Epoch    47: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  48


0.1477960098434139 0.12411672621965408
Epoch:  49


0.14872686202461655 0.12438934189932686
Epoch:  50


0.14903253437699499 0.12400013634136745
Epoch:  51


0.14809277291233475 0.12399462184735707
Epoch:  52


0.1486076869674631 0.12416861525603703
Epoch:  53


0.14834781756272186 0.12404539755412511
Epoch    53: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  54


0.14949076763681463 0.12424222167049136
Epoch:  55


0.14861977744746852 0.12416298793894905
Epoch:  56


0.1492317250451526 0.12447324075869151
Epoch:  57


0.14948603228942767 0.12422653287649155
Epoch:  58


0.148534446149259 0.12390363748584475
Epoch:  59


0.14915220117246783 0.12380730999367577
Epoch:  60


0.14908977618088592 0.12426581659487315
Epoch:  61


0.14927860732014114 0.12389689151729856
Epoch:  62


0.14836665021406636 0.12404281858886991
Epoch:  63


0.1484107185859938 0.12377700954675674
Epoch:  64


0.14806711109908852 0.12421752618891853
Epoch:  65


0.1486268933560397 0.12396543047257833
Epoch:  66


0.14931373217621366 0.1240212938615254
Epoch:  67


0.14979522413498647 0.12421820099864687
Epoch:  68


0.14905969395830826 0.12404554550136838
Epoch:  69


0.14902483651766907 0.12390658578702382
Epoch    69: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  70


0.14794206095708384 0.12403139046260289
Epoch:  71


0.1487650557144268 0.12420182887996946
Epoch:  72


0.14821022748947144 0.12405278640133995
Epoch:  73


0.14878152955222773 0.1240387995328222
Epoch:  74


0.14840515199545268 0.12414638910974775
Epoch:  75


0.14846524918401563 0.12408011300223214
Epoch    75: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  76


0.14692815292525935 0.12363811582326889
Epoch:  77


0.14940876493582855 0.12391345309359687
Epoch:  78


0.1483384161382108 0.12395549778427396
Epoch:  79


0.14855569681605776 0.1238507724234036
Epoch:  80


0.14848128444439657 0.12387097307613917
Epoch:  81


0.148813835269696 0.12402727454900742
Epoch:  82


0.1476668320797585 0.12405270657369069
Epoch:  83


0.14796008773752162 0.12413598597049713
Epoch:  84


0.1483170510949315 0.12410842095102582
Epoch:  85


0.14921928539469437 0.12431488611868449
Epoch:  86


0.14811687292279424 0.1242378471153123
Epoch:  87


0.1493121880937267 0.12390289562089103
Epoch:  88


0.148707044688431 0.12413713868175234
Epoch:  89


0.14802006491132685 0.1239611314875739
Epoch:  90


0.1491678933034072 0.12391937736954008
Epoch:  91


0.14780758321285248 0.12373393880469459
Epoch:  92


0.14925796518454681 0.12409183915172305
Epoch:  93


0.14801528365225405 0.12403260916471481
Epoch:  94


0.14885261171572917 0.12418729066848755
Epoch:  95


0.1477240847574698 0.12384355068206787
Epoch:  96


0.14745761413831968 0.12386442188705717
Epoch:  97


0.14979684151507713 0.12396577958549772
Epoch:  98


0.14893115251450925 0.12401373258658818
Epoch:  99


0.1477363319010348 0.12385329284838267
