In [1]:
# Parameters
# Index threshold into the mobilenet_v2 feature blocks: Task5Model re-initialises
# every block of `mv2.features` whose index is greater than this value. The value
# is also embedded in the checkpoint filename written by the training loop.
until_x = 1


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known):
    """Build model inputs, targets and loss masks from an annotation table.

    Args:
        df: annotation table. After `reset_index()` it must expose an
            `audio_filename` column; rows of the same file are numbered
            1, 2, 3, ... in their existing order and only rows 1-3 are used.
            NOTE(review): presumably one row per annotator -- confirm.
        unknown_to_known: dict mapping a column name of `df` to the column
            label(s) that must be masked out whenever that column is > 0.5.
            The key columns themselves are removed from the targets.

    Returns:
        X: spectrograms loaded from `../../data/logmelspec/<audio_filename>.npy`,
            transposed and cut to at most 635 rows, stacked along a new first
            axis with a singleton second axis added.
        y: array of shape (n_files, n_remaining_columns, 3); the last axis
            indexes row numbers 1, 2, 3 of each file.
        y_mask: array shaped like `y`; 0 where an entry is masked, 1 elsewhere.
    """
    # Number the rows of each file 1, 2, 3, ... and index by (file, number).
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df.set_index(['audio_filename', 'slno'], inplace=True)

    # Split off the "unknown" columns; they only drive the mask, not the targets.
    df_unknown = df.copy().loc[:, list(unknown_to_known.keys())]
    df.drop(columns=list(unknown_to_known.keys()), inplace=True)

    # Start from an all-ones mask and zero the `known` columns on every row
    # where the corresponding `unknown` column is set (> 0.5).
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[
            df_unknown[unknown] > 0.5,
            known
        ] = 0

    # Put the row number first and sort, so `.loc[[k], :]` selects the k-th row
    # of every file, ordered by audio_filename.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()

    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    # Stack rows 1..3 along a new last axis. The concatenation requires the
    # three slices to have the same number of rows, i.e. every file needs
    # rows 1, 2 and 3.
    y = np.concatenate([
        df.loc[[1], :].values[..., np.newaxis],
        df.loc[[2], :].values[..., np.newaxis],
        df.loc[[3], :].values[..., np.newaxis]
    ], axis=2)

    y_mask = np.concatenate([
        y_mask.loc[[1], :].values[..., np.newaxis],
        y_mask.loc[[2], :].values[..., np.newaxis],
        y_mask.loc[[3], :].values[..., np.newaxis]
    ], axis=2)

    # Load one spectrogram per file, in the same filename order as the rows of
    # `y`; each array is transposed and truncated to its first 635 rows.
    X = np.concatenate([
        np.expand_dims(np.load('../../data/logmelspec/{}.npy'.format(x)).T[:635, :], axis=0)
        for x in df.loc[[1], :].reset_index(1).audio_filename.tolist()])
    # Add a singleton axis at position 1 (consumed as the channel axis by the
    # model's first BatchNorm2d(1) / Conv2d(1, ...) layers).
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing augmenter built with its default arguments; it is applied
# by AudioDataset.__getitem__ to every sample that goes through the transform path.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding `(sample, y, weights)` triples indexed along axis 0.

    When `transform` is given, every sample is augmented on the fly:
    min-max scaling, a random circular shift along axis 1, the albumentations
    `transform`, random erasing, and finally the inverse of the min-max
    scaling. When `transform` is falsy the stored sample is returned as is.

    Args:
        X: tensor of inputs; samples are taken as `X[idx, ...]`. The
            augmentation path indexes each sample with three axes.
        y: tensor of targets, indexed like `X`.
        weights: tensor of loss masks/weights, indexed like `X`.
        transform: optional callable invoked as `transform(image=array)` and
            returning a dict with an `'image'` entry.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converter used to hand samples to the albumentations pipeline.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of `X`)."""
        return self.X.shape[0]

    @staticmethod
    def _min_max_scale(sample):
        """Scale `sample` to [0, 1]; return `(scaled, minimum, value_range)`.

        A constant sample has a zero range, which would turn the whole sample
        into NaN on division. In that case the range is replaced by 1, so the
        scaled sample is all zeros and `scaled * value_range + minimum` still
        reproduces the original values.
        """
        minimum = sample.min()
        value_range = sample.max() - minimum
        if value_range == 0:
            value_range = torch.ones_like(value_range)
        return (sample - minimum) / value_range, minimum, value_range

    def __getitem__(self, idx):
        """Return `(sample, y[idx], weights[idx])`, augmenting if a transform is set."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation (guarded against constant samples)
            sample, this_min, value_range = self._min_max_scale(sample)

            # randomly cycle the file: circular shift along axis 1
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the leading singleton axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load the pickled metadata bundle (taxonomy table plus train/test annotations).
# NOTE: pickle.load can execute arbitrary code -- only open trusted files here.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# For every fine label whose fine_id is 'X', list the fine labels with a
# different fine_id that share the same coarse class.
unknown_to_known = (
    metadata['taxonomy_df']
    .loc[lambda t: t.fine_id == 'X', ['fine', 'coarse']]
    .merge(
        metadata['taxonomy_df'].loc[lambda t: t.fine_id != 'X', ['fine', 'coarse']],
        on='coarse',
        how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())

# Fine labels whose fine_id is not 'X'.
known_labels = metadata['taxonomy_df'].loc[lambda t: t.fine_id != 'X', 'fine'].tolist()

# Coarse and fine annotation columns side by side, for each split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1,
    sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1,
    sort=True)

In [5]:
# Manual correction: blank out every row whose label columns sum to exactly 37.
# NOTE(review): presumably a row with every label switched on -- confirm.
train_df.loc[train_df.sum(axis=1).eq(37), :] = 0
valid_df.loc[valid_df.sum(axis=1).eq(37), :] = 0

In [6]:
# Build the input arrays, targets and loss masks for both splits; each call
# also loads the spectrogram files of that split from disk.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization: one mean/std per position of the last axis,
# estimated on the training split only and then applied to both splits.
# The size of that axis is read from the data instead of being hard-coded.
# NOTE: this cell overwrites train_X / valid_X, so running it a second time
# would normalise already-normalised data.
channel_means = train_X.reshape(-1, train_X.shape[-1]).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, train_X.shape[-1]).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# (only the training dataset receives this pipeline; the validation dataset is
# built with transform=None).
albumentations_transform = Compose([
    # Random shift / scale / rotation with the given limits.
    # NOTE(review): rotate_limit=0.5 is presumably in degrees -- confirm against
    # the installed albumentations version.
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # Grid distortion with the library's default settings.
    GridDistortion(),
    # Hand the augmented image back as a tensor (AudioDataset permutes it next).
    ToTensor()
])

In [9]:
# Wrap the numpy arrays as tensors and build one dataset per split.
# Only the training split is augmented.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

# Validation is read in a fixed order. The two independently shuffled training
# loaders are zipped together by the training loop to form mixup pairs.
val_loader = DataLoader(dataset=valid_dataset, batch_size=96, shuffle=False)
train_loader_1 = DataLoader(dataset=train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(dataset=train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used: the first CUDA device when one is available,
# otherwise the CPU, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Call `layer.reset_parameters()` when the layer provides it; otherwise do nothing.

    Intended to be passed to `Module.apply`, which visits every submodule.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        # Nothing to reset on this object.
        return
    reset()

In [12]:
class Task5Model(nn.Module):
    """Pretrained MobileNetV2 feature extractor with a small classification head.

    A single-channel input is first mapped to three channels by two 1x1
    convolutions, passed through `mobilenet_v2.features`, max-pooled over the
    last two axes and classified by a two-layer head.

    Args:
        num_classes: number of output logits.
        reset_after: feature blocks of MobileNetV2 whose index is greater than
            this value are re-initialised (their pretrained weights are
            discarded). Defaults to the notebook-level `until_x` parameter, so
            existing `Task5Model(num_classes)` calls behave as before.
    """

    def __init__(self, num_classes, reset_after=None):

        super().__init__()

        if reset_after is None:
            # Fall back to the notebook parameter instead of silently
            # depending on it inside the loop below.
            reset_after = until_x

        # 1 -> 10 -> 3 channels with 1x1 convolutions.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Head applied to the 1280 pooled feature channels.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset every feature block after index `reset_after`
        for i, x in enumerate(self.mv2.features.children()):
            if i > reset_after:
                x.apply(weight_reset)

    def forward(self, x):
        """Return the `num_classes` logits for a batch of single-channel inputs."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis, then the (new) last axis.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [13]:
# Instantiate the model
# 31 is the number of output logits.
# NOTE(review): presumably the number of label columns kept in train_y -- confirm.
# Constructing the model also loads the pretrained mobilenet_v2 weights.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad variant over every parameter of the model.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Lowers the learning rate when the value passed to scheduler.step() -- the
# validation loss in the training loop -- stops improving; verbose=True prints
# the "reducing learning rate" lines seen in the training log.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them by the label masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def masked_bce(outputs, labels, masks):
    """Masked loss averaged over the total mask weight.

    `labels` and `masks` hold one slice per annotation set along their last
    axis. The same `outputs` are scored against every slice with the
    element-wise `criterion`, weighted by the matching mask slice, summed, and
    divided by `masks.sum()`.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
alpha = 1  # Beta(alpha, alpha) parameter of the mixup coefficients
early_stopping_patience = 25  # epochs without a new best validation loss
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup: one coefficient per sample, blending the batches of the two
        # independently shuffled loaders (inputs, labels and masks alike)
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.reshape(-1, 1, 1, 1)
        lam_targets = lam.reshape(-1, 1, 1)

        inputs = (lam_inputs * i1[0]) + ((1 - lam_inputs) * i2[0])
        labels = (lam_targets * i1[1]) + ((1 - lam_targets) * i2[1])
        masks = (lam_targets * i1[2]) + ((1 - lam_targets) * i2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no gradients, no parameter updates) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6537944531440735 0.5684651017189026
Epoch:  1


0.43667855381965637 0.2863760471343994
Epoch:  2


0.23068874776363374 0.189159819483757
Epoch:  3


0.18730640172958374 0.19939213395118713
Epoch:  4


0.17842639446258546 0.1733480930328369
Epoch:  5


0.1755499243736267 0.16626548171043395
Epoch:  6


0.1719481533765793 0.16895413696765899
Epoch:  7


0.16967013895511626 0.17173781096935273
Epoch:  8


0.1718115746974945 0.17943574786186217
Epoch:  9


0.1686219894886017 0.159161713719368
Epoch:  10


0.16859659910202027 0.15552860796451567
Epoch:  11


0.1659306091070175 0.1544829100370407
Epoch:  12


0.1654346454143524 0.14720951318740844
Epoch:  13


0.16591443240642548 0.14784944355487822
Epoch:  14


0.16419426023960113 0.15501455068588257
Epoch:  15


0.16517928063869478 0.15208386182785033
Epoch:  16


0.1619093370437622 0.1512719064950943
Epoch:  17


0.16349753022193908 0.14562704861164094
Epoch:  18


0.16324878871440887 0.14049959182739258
Epoch:  19


0.16135647475719453 0.1445743441581726
Epoch:  20


0.16011246860027314 0.1402737617492676
Epoch:  21


0.16088784277439117 0.13534463346004486
Epoch:  22


0.15951723158359526 0.13949187248945236
Epoch:  23


0.15700398564338683 0.1368791341781616
Epoch:  24


0.15825463712215423 0.13637946546077728
Epoch:  25


0.15805511832237243 0.13595810830593108
Epoch:  26


0.15902500212192536 0.1348387062549591
Epoch:  27


0.15737786948680876 0.13525698333978653
Epoch:  28


0.15744123756885528 0.13319691270589828
Epoch:  29


0.1562056654691696 0.13534144014120103
Epoch:  30


0.1566835767030716 0.13759399056434632
Epoch:  31


0.15781608879566192 0.13717738091945647
Epoch:  32


0.15622346580028534 0.13602936267852783
Epoch:  33


0.15527620613574983 0.13176248073577881
Epoch:  34


0.1561884480714798 0.13416125178337096
Epoch:  35


0.1553796112537384 0.13443365693092346
Epoch:  36


0.15465474665164947 0.13353383243083955
Epoch:  37


0.15396809875965117 0.13322529345750808
Epoch:  38


0.1546115618944168 0.1368766576051712
Epoch:  39


0.15593572080135346 0.13616099208593369
Epoch    39: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  40


0.15416715919971466 0.1289563626050949
Epoch:  41


0.15325696349143983 0.1287037193775177
Epoch:  42


0.15307840585708618 0.12814747989177705
Epoch:  43


0.15269907236099242 0.12686969637870787
Epoch:  44


0.15153813540935515 0.12700097262859344
Epoch:  45


0.15173736929893494 0.12718241214752196
Epoch:  46


0.15284441888332367 0.12716103047132493
Epoch:  47


0.15217188358306885 0.12706205248832703
Epoch:  48


0.15232418477535248 0.12701085954904556
Epoch:  49


0.15114355862140655 0.12648554444313048
Epoch:  50


0.15093586444854737 0.12694364488124849
Epoch:  51


0.15121486604213716 0.12685295790433884
Epoch:  52


0.1514064109325409 0.12673803567886352
Epoch:  53


0.15213705182075501 0.12662031203508378
Epoch:  54


0.15100910246372223 0.12697499096393586
Epoch:  55


0.1516590315103531 0.12684227377176285
Epoch    55: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  56


0.15014172077178956 0.12670717686414718
Epoch:  57


0.15081071436405183 0.12635007947683335
Epoch:  58


0.1504259103536606 0.12637637853622435
Epoch:  59


0.15092734217643738 0.1264600023627281
Epoch:  60


0.15171347677707672 0.1265190437436104
Epoch:  61


0.1505168604850769 0.12657159715890884
Epoch:  62


0.15099284589290618 0.12652585208415984
Epoch:  63


0.15158306658267975 0.1264270067214966
Epoch    63: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  64


0.15123923540115355 0.12639211416244506
Epoch:  65


0.15105698585510255 0.1261588916182518
Epoch:  66


0.15023880004882811 0.1264320731163025
Epoch:  67


0.14980554819107056 0.12628928571939468
Epoch:  68


0.15081504940986634 0.12648507803678513
Epoch:  69


0.14920592486858367 0.12636008262634277
Epoch:  70


0.1511746346950531 0.12608998268842697
Epoch:  71


0.1503926223516464 0.1263471230864525
Epoch:  72


0.15103795409202575 0.12621119916439055
Epoch:  73


0.15171145141124726 0.1262631595134735
Epoch:  74


0.14933591425418855 0.12631429880857467
Epoch:  75


0.15065973937511445 0.12627648711204528
Epoch:  76


0.15086518585681916 0.1262725591659546
Epoch    76: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  77


0.14996601819992064 0.1262020856142044
Epoch:  78


0.15115482449531556 0.12628642916679383
Epoch:  79


0.1498010402917862 0.12633707672357558
Epoch:  80


0.1496543687582016 0.12641917467117308
Epoch:  81


0.1495860105752945 0.1262933596968651
Epoch:  82


0.150932919383049 0.12614931762218476
Epoch    82: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  83


0.15021284759044648 0.12622748762369157
Epoch:  84


0.15084387183189393 0.12638289630413055
Epoch:  85


0.15090230286121367 0.12646857053041458
Epoch:  86


0.15068381011486054 0.1261053964495659
Epoch:  87


0.15193451285362244 0.12643562704324723
Epoch:  88


0.15103623330593108 0.12637490034103394
Epoch:  89


0.15037926256656647 0.12634607255458832
Epoch:  90


0.15069901883602144 0.12643101811408997
Epoch:  91


0.15265438854694366 0.12620247304439544
Epoch:  92


0.14982978999614716 0.12637498378753662
Epoch:  93


0.15021432757377626 0.1262641206383705
Epoch:  94


0.15116310775279998 0.12632406651973724
Epoch:  95
