In [1]:
# Parameters
# until_x: index threshold into the children of ``mobilenet_v2().features``.
# Task5Model re-initialises every child whose index is strictly greater than this
# value, and the training loop embeds it in the checkpoint filename.
# NOTE(review): this cell looks like an injected parameters cell (e.g. papermill) — confirm.
until_x = 16


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, n_annotations=3, n_frames=635,
                 spec_dir='../../data/logmelspec'):
    """Turn an annotation table into spectrogram inputs, labels and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (file, annotation). After ``reset_index()`` it must expose an
        ``audio_filename`` column (i.e. the index is expected to be named
        ``audio_filename``). Columns are label indicators.
    unknown_to_known : dict
        Maps an "unknown" label column to the list of "known" label columns that
        must be masked out whenever the unknown label is active (> 0.5).
    n_annotations : int, default 3
        Number of annotation slots kept per file (serial numbers 1..n_annotations).
        Every file must have at least this many rows; extra rows are ignored.
    n_frames : int, default 635
        Number of leading time frames kept from every spectrogram.
    spec_dir : str, default '../../data/logmelspec'
        Directory holding one ``<audio_filename>.npy`` array per file.

    Returns
    -------
    X : numpy.ndarray, shape (n_files, 1, n_frames, n_bins)
        Transposed and cropped spectrograms with a singleton channel axis.
    y : numpy.ndarray, shape (n_files, n_known_labels, n_annotations)
        Label values with the "unknown" columns removed.
    y_mask : numpy.ndarray, same shape as ``y``
        1 where the label contributes to the loss, 0 where it is masked.
    """
    df = df.reset_index()
    # Number the annotations of each file 1, 2, 3, ... in row order.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split off the "unknown" columns; they only drive the mask, not the targets.
    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from an all-ones mask and zero the known labels that an active
    # "unknown" label makes ambiguous.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so one slot can be selected for all files
    # at once; sorting keeps the file order identical across slots.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    # File order of the first slot defines the row order of X.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (class imported from the project-local
# ``RandomErasing`` module one directory up), built with its default arguments.
# AudioDataset.__getitem__ calls it on every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, labels, weights)`` triples for one index.

    Parameters
    ----------
    X : torch.Tensor
        Input samples, indexed along the first axis. The augmentation path
        concatenates along ``dim=1`` and slices three axes, so each sample is
        expected to be 3-D (channel, time, frequency).
    y : torch.Tensor
        Targets, indexed along the first axis in step with ``X``.
    weights : torch.Tensor
        Per-target weights/masks, indexed along the first axis in step with ``X``.
    transform : callable or None
        Augmentation called as ``transform(image=array)`` and expected to return a
        mapping with an ``'image'`` tensor. When falsy, samples are returned as is.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the (optionally augmented) sample with its labels and weights."""
        sample = self.X[idx, ...]

        if self.transform:
            # Min-max scale so the image conversion below receives values in [0, 1].
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would otherwise divide by zero and fill the
                # output with NaNs; a unit range keeps the round trip finite.
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # Randomly cycle the file: rotate along axis 1 by a random offset.
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # Apply the albumentations transforms on an image array.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # Re-add the leading axis and swap the last two axes back.
            sample = sample[None, :, :].permute(0, 2, 1)

            # Apply random erasing on a detached copy.
            sample = random_erasing(sample.clone().detach())

            # Revert the min-max transformation.
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code; only load a metadata
# file that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map every fine label whose fine_id is 'X' to the list of fine labels (fine_id
# != 'X') that share its coarse class. The self-merge on 'coarse' yields the
# columns fine_x (the 'X' label) and fine_y (its non-'X' siblings).
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# All fine labels whose fine_id is not 'X'.
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Place the coarse and fine annotation tables side by side (column-wise concat).
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose columns add up to exactly 37 is blanked (all columns set to 0).
# NOTE(review): 37 presumably means "every label set at once" — confirm.
train_df.loc[train_df.sum(axis=1).eq(37), :] = 0
valid_df.loc[valid_df.sum(axis=1).eq(37), :] = 0

In [6]:
# Build inputs (X), per-annotation labels (y) and loss masks (y_mask) for both
# splits; the spectrogram arrays are read from disk inside prepare_data.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean and standard deviation are computed per entry of the last axis, on the
# training set only, and the same statistics are applied to the validation set.
# The bin count is read from the data instead of being hard-coded.
channel_means = train_X.reshape(-1, train_X.shape[-1]).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, train_X.shape[-1]).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Pipeline applied by AudioDataset to training samples only: a small random
# shift/scale/rotate, a grid distortion, then conversion to a torch tensor.
# NOTE(review): per the albumentations docs rotate_limit is expressed in degrees,
# so 0.5 is an almost negligible rotation — confirm this is intended.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Training samples are augmented; validation samples are served untouched.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(dataset=valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the same training set; the training loop
# zips them together to obtain the pairs of batches it mixes.
train_loader_1 = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when PyTorch can see one, otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Call ``layer.reset_parameters()`` when the layer has one; otherwise no-op.

    Written to be passed to ``Module.apply`` so that every sub-module exposing
    ``reset_parameters`` is re-initialised.
    """
    _absent = object()
    reset = getattr(layer, 'reset_parameters', _absent)
    if reset is not _absent:
        reset()

In [12]:
class Task5Model(nn.Module):
    """Single-channel input -> MobileNetV2 feature extractor -> linear head.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_after : int or None, default None
        Children of ``mv2.features`` whose index is strictly greater than this
        value are re-initialised (their pretrained weights are discarded).
        ``None`` falls back to the notebook-level ``until_x`` parameter, which is
        the original behaviour.
    """

    def __init__(self, num_classes, reset_after=None):

        super().__init__()

        # Learnable 1x1 convolutions mapping the single input channel to the
        # three channels the MobileNetV2 feature extractor takes.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classification head; its input width (1280) must match the channel
        # count produced by ``mv2.features``.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset after ith layer of mv2
        if reset_after is None:
            reset_after = until_x
        for i, block in enumerate(self.mv2.features.children()):
            if i > reset_after:
                block.apply(weight_reset)

    def forward(self, x):
        """Return raw logits of shape (batch, num_classes) for a 4-D input batch."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last two axes one after the other.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [13]:
# Instantiate the model
# 31 output logits. NOTE(review): presumably the number of label columns left
# after prepare_data drops the "unknown" ones — confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over every model parameter.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Lower the learning rate once the validation loss passed to scheduler.step()
# has stopped improving for more than `patience` epochs (the recorded output
# shows tenfold reductions).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the label masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def masked_bce(outputs, labels, masks):
    """Masked binary cross-entropy averaged over the unmasked label entries.

    ``outputs`` holds one logit per (sample, class); ``labels`` and ``masks``
    carry an extra trailing axis with one entry per set of annotations. The
    element-wise loss of every annotation set is multiplied by its mask, the
    results are summed and then divided by the total mask weight.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


epochs = 100
early_stop_patience = 25  # stop after this many epochs without a new best validation loss
alpha = 1  # Beta(alpha, alpha) parameter of the mixup coefficients
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One mixing coefficient per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Average of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stop check so the final epoch is logged as well.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stop_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6088473821008528 0.42725219471114023
Epoch:  1


0.27827730251325145 0.19239966358457292
Epoch:  2


0.17349419400498672 0.14264111753020967
Epoch:  3


0.1654756375261255 0.1382554450205394
Epoch:  4


0.1600395932390883 0.13792011780398233
Epoch:  5


0.15736361411777702 0.13473768212965556
Epoch:  6


0.15759929813243248 0.1360893611397062
Epoch:  7


0.15648970773091186 0.13527493391718184
Epoch:  8


0.15492114706619367 0.1291902427162443
Epoch:  9


0.1542158622193981 0.1377613523176738
Epoch:  10


0.1547666002769728 0.13209758486066545
Epoch:  11


0.15293320288529266 0.13015634885856084
Epoch:  12


0.15219538195713148 0.13352188361542566
Epoch:  13


0.15168129068774147 0.12858484366110393
Epoch:  14


0.15127092559595365 0.12751650490931102
Epoch:  15


0.15210098310096845 0.13074749601738794
Epoch:  16


0.14980039161604805 0.12743964684861048
Epoch:  17


0.15071933132571144 0.13890423945018224
Epoch:  18


0.1515758593340178 0.13028130680322647
Epoch:  19


0.14929071914505315 0.13959965535572597
Epoch:  20


0.1495900492410402 0.12530903092452458
Epoch:  21


0.1492691257515469 0.1296114410672869
Epoch:  22


0.1484498619227796 0.13451035427195684
Epoch:  23


0.1477811481501605 0.12400962518794197
Epoch:  24


0.14853878319263458 0.1362137964793614
Epoch:  25


0.1484690328707566 0.12851378640958241
Epoch:  26


0.1464119400527026 0.12772673155580247
Epoch:  27


0.14665082580334432 0.12569808321339743
Epoch:  28


0.14755910030893377 0.12892337569168635
Epoch:  29


0.14701478263816317 0.1251669643180711
Epoch    29: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  30


0.14513168665202889 0.12226921213524682
Epoch:  31


0.14348581836030289 0.12230288769517626
Epoch:  32


0.14424835910668243 0.12234923030648913
Epoch:  33


0.144929485546576 0.12190308634723936
Epoch:  34


0.14279476452518153 0.1217457162482398
Epoch:  35


0.14259328834108403 0.12153297130550657
Epoch:  36


0.14242629102758458 0.12150161074740547
Epoch:  37


0.14343423819219744 0.12223403368677412
Epoch:  38


0.14187351151092634 0.12165975996426173
Epoch:  39


0.14195855062555623 0.12192688243729728
Epoch:  40


0.14149533816286036 0.12177771010569163
Epoch:  41


0.14236189304171382 0.12203006339924675
Epoch:  42


0.1407776652155696 0.12192239399467196
Epoch    42: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  43


0.14126404757435257 0.12149639853409358
Epoch:  44


0.14139015247692932 0.12155786475964955
Epoch:  45


0.14274486093907743 0.12166391951697213
Epoch:  46


0.14150615358674848 0.12154083698987961
Epoch:  47


0.14154158693713112 0.12151465777839933
Epoch:  48


0.14162522232210314 0.12129867609058108
Epoch:  49


0.14108928635313706 0.12149709675993238
Epoch:  50


0.13999833347829613 0.12148040107318334
Epoch:  51


0.14110534537482905 0.12151612660714559
Epoch:  52


0.14128373442469416 0.1214876047202519
Epoch:  53


0.14142857411423246 0.12141977889197213
Epoch:  54


0.14142295919560097 0.12124336617333549
Epoch:  55


0.14221543355568037 0.12127288218055453
Epoch:  56


0.1415786380703385 0.12131243944168091
Epoch:  57


0.14132440492913528 0.1214174936924662
Epoch:  58


0.14159002980670413 0.12127602419682912
Epoch:  59


0.14023321866989136 0.1214088329247066
Epoch:  60


0.14098961168044322 0.12121508909123284
Epoch:  61


0.14208983650078644 0.12153837723391396
Epoch:  62


0.140187876852783 0.12125286992107119
Epoch:  63


0.14102548683011853 0.12146464096648353
Epoch:  64


0.14089022657355746 0.12121536156960896
Epoch:  65


0.14070390124578733 0.12115373356001717
Epoch:  66


0.14163194395400383 0.12143392967326301
Epoch:  67


0.1410004399918221 0.12167046006236758
Epoch:  68


0.14101006171187838 0.12163662591150828
Epoch:  69


0.1407148656812874 0.12128360250166484
Epoch:  70


0.1408311752854167 0.1214238458446094
Epoch:  71


0.14195240812527166 0.12124308943748474
Epoch    71: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  72


0.14109560726462184 0.12139391260487693
Epoch:  73


0.14147794770227895 0.12128572804587227
Epoch:  74


0.14195676671492086 0.12120862624474935
Epoch:  75


0.142237635480391 0.12121223977633885
Epoch:  76


0.14019873214734568 0.1212855013353484
Epoch:  77


0.13970002008451 0.12154791504144669
Epoch    77: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  78


0.14116990163519577 0.12153931920017515
Epoch:  79


0.14039001029890938 0.12141944893768855
Epoch:  80


0.14278181179149732 0.12135178702218193
Epoch:  81


0.14152864266086268 0.12130833097866603
Epoch:  82


0.14098467577148127 0.12108446338347026
Epoch:  83


0.1401558200249801 0.12122882370437894
Epoch:  84


0.1413333460285857 0.12119616887399129
Epoch:  85


0.1398975166517335 0.12123845624072212
Epoch:  86


0.1413712408897039 0.12117593841893333
Epoch:  87


0.1406575485661223 0.12127022870949336
Epoch:  88


0.1401086099244453 0.12129169596093041
Epoch    88: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  89


0.14092084402973587 0.12136492346014295
Epoch:  90


0.14140029795266487 0.1211803097810064
Epoch:  91


0.14102673127844528 0.12135417759418488
Epoch:  92


0.14085068150952057 0.12131634673901967
Epoch:  93


0.14132919343742165 0.12144179003579277
Epoch:  94


0.1418128802969649 0.12142499962023326
Epoch:  95


0.14119394486014908 0.12117443340165275
Epoch:  96


0.1409871654736029 0.12149342255932945
Epoch:  97


0.14099053998251218 0.12154633338962283
Epoch:  98


0.1414721229591885 0.12142313697508403
Epoch:  99


0.14048367094349218 0.12118949847561973
