In [1]:
# Parameters
# until_x: inclusive index of the last child of the MobileNetV2 `features` stack whose
# weights Task5Model re-initializes; it is also embedded in the saved checkpoint's filename.
until_x = 16


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec',
                 n_frames=635,
                 n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Args:
        df: DataFrame with one row per annotation. After ``reset_index()`` it
            must expose an 'audio_filename' column; every other column is
            treated as a label column. The caller's frame is not modified.
        unknown_to_known: dict mapping the name of an "unknown" label column to
            the list of known label columns it should mask.
        logmelspec_dir: directory holding one ``<audio_filename>.npy`` array
            per file.
        n_frames: number of leading rows kept from each transposed array.
        n_annotations: number of annotations used per file (annotations are
            numbered 1..k per file in row order; only the first
            ``n_annotations`` are used).

    Returns:
        (X, y, y_mask) as numpy arrays:
        X has shape (n_files, 1, n_frames, array_rows);
        y has shape (n_files, n_labels, n_annotations), with the unknown
        columns removed from the label axis;
        y_mask has the same shape as y and is 0 for a known label on an
        annotation whose corresponding unknown column exceeds 0.5, else 1.
        Files are ordered by sorted 'audio_filename' in all three outputs.
    """
    # Number each file's annotations 1, 2, 3, ... in row order.
    df = df.reset_index()
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split off the "unknown" columns; they only drive the mask.
    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns]
    df = df.drop(columns=unknown_columns)

    # Start with every label contributing to the loss, then switch off the
    # known labels wherever the matching unknown label was marked.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so each annotation slot can be sliced out.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slot], :].values for slot in annotation_slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in annotation_slots], axis=2)

    # Load the spectrograms in the same file order as the rows of y.
    filenames = df.loc[[1], :].reset_index(level=1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)  # add the single channel axis

    return X, y, y_mask


# Single module-level RandomErasing instance (default constructor arguments);
# AudioDataset.__getitem__ looks this name up as a global when a transform is set.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with per-item targets and loss weights.

    Without a transform, items are returned exactly as stored. With a
    transform, each item is augmented on the fly: min-max scaled, cyclically
    shifted along dim 1, passed through ``transform``, random-erased, and
    finally mapped back to its original value range.

    Args:
        X: tensor indexed along dim 0. Each ``X[idx]`` must be 3-D
            (presumably (1, time, mel) in this notebook; confirm with caller).
        y: targets, indexed along dim 0 in step with ``X``.
        weights: per-target weights/masks, indexed along dim 0 in step with ``X``.
        transform: optional callable invoked as ``transform(image=ndarray)``
            and returning a mapping with an 'image' tensor.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of items (size of X along dim 0)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample if a transform is set."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would otherwise give 0/0 = NaN; with a unit
                # range it scales to all zeros and is restored exactly below.
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along dim 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on an image array)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # Re-add the leading axis and swap the last two axes.
            # NOTE(review): assumes the transform's tensor conversion returns the
            # 2-D image transposed, so this restores the stored layout; confirm.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level `random_erasing` instance)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Re-initialize `layer` via its `reset_parameters()` method, if it has one.

    Objects without a `reset_parameters` attribute are left untouched, which
    makes this safe to pass to `Module.apply`.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()


class Task5Model(nn.Module):
    """Classifier for single-channel inputs built on MobileNetV2 features.

    The input goes through a small 1x1-convolution stem that produces three
    channels, then through the MobileNetV2 feature extractor, is max-pooled
    over its last two axes, and is finally mapped to `num_classes` logits.
    """

    def __init__(self, num_classes):
        """Build the network; `num_classes` is the width of the output layer."""
        super().__init__()

        # Stem: normalize the single input channel, then expand it to the
        # three channels the MobileNetV2 feature extractor takes.
        stem_layers = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*stem_layers)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Re-initialize the pretrained feature blocks 0..until_x (inclusive);
        # blocks after that keep their pretrained weights.
        for position, block in enumerate(self.mv2.features.children()):
            if position > until_x:
                break
            block.apply(weight_reset)

        head_layers = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the logits produced for the batch `x`."""
        three_channel = self.bw2col(x)
        feature_maps = self.mv2.features(three_channel)
        # Global max pooling: reduce the last axis, then the (new) last axis.
        reduced_last, _ = feature_maps.max(dim=-1)
        pooled, _ = reduced_last.max(dim=-1)
        return self.final(pooled)

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# Split the taxonomy into "unknown" fine labels (fine_id == 'X') and known ones.
taxonomy_df = metadata['taxonomy_df']
unknown_fine = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# Pair every unknown fine label with the known fine labels of the same coarse
# class, giving {unknown fine label: [known fine labels]}.
fine_pairs = (
    pd.merge(unknown_fine, known_fine, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = fine_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_fine.fine.tolist()

# Put the coarse and fine annotation columns side by side for each split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point:
# any row whose label values sum to 37 is cleared to all zeros (in place)
for labels_df in (train_df, valid_df):
    suspicious_rows = labels_df.sum(axis=1) == 37
    labels_df.loc[suspicious_rows, :] = 0

In [6]:
# For each split prepare_data returns numpy arrays:
#   X      -> (n_files, 1, frames, array_rows), the loaded log-mel arrays
#   y      -> (n_files, n_labels, 3), one slice per annotation
#   y_mask -> same shape as y; 0 where a known label is masked by an "unknown" one
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# The statistics come from the training split only: the array is flattened to
# rows of 128 values, reduced over the rows, and shaped to broadcast over the
# last axis. The validation split is scaled with the same training statistics.
channel_means, channel_stds = (
    reduce_fn(train_X.reshape(-1, 128), axis=0).reshape(1, 1, 1, -1)
    for reduce_fn in (np.mean, np.std))

train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in order: a small random shift/scale/rotate, a grid distortion, and
# finally the conversion of the image array to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation items are served as stored.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=96, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to get pairs of batches for mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is actually available and fall back to the CPU
# otherwise, so the notebook still runs on a machine without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 is passed as num_classes, i.e. the width of the model's output layer.
# NOTE(review): presumably this equals the number of label columns in train_y,
# since the loss compares the outputs with labels[:, :, k]; confirm.
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# Cut the learning rate tenfold after 5 epochs without a lower monitored value.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=5, verbose=True)
# Element-wise losses are kept so they can be masked before being reduced.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass: every step mixes one batch from each shuffled loader ----
    this_epoch_train_loss = 0
    for (x_a, y_a, m_a), (x_b, y_b, m_b) in zip(train_loader_1, train_loader_2):

        # mixup: one Beta(alpha, alpha) coefficient per example, shared by the
        # inputs, the labels and the masks of that example
        alpha = 1
        n_examples = x_a.shape[0]
        mixup_vals = np.random.beta(alpha, alpha, n_examples)

        lam_inputs = torch.Tensor(mixup_vals.reshape(n_examples, 1, 1, 1))
        inputs = (lam_inputs * x_a) + ((1 - lam_inputs) * x_b)

        lam_targets = torch.Tensor(mixup_vals.reshape(n_examples, 1, 1))
        labels = (lam_targets * y_a) + ((1 - lam_targets) * y_b)
        masks = (lam_targets * m_a) + ((1 - lam_targets) * m_b)

        inputs, labels, masks = (t.to(device) for t in (inputs, labels, masks))

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            model = model.train()
            outputs = model(inputs)
            # masked loss against each of the three annotation slices,
            # normalized by the total mask weight
            masked_total = sum(
                (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
                for k in range(3))
            loss = masked_total / masks.sum()
            loss.backward()
            optimizer.step()
            this_epoch_train_loss += loss.detach().cpu().numpy()

    # ---- validation pass: same masked loss, no gradients, no mixup ----
    this_epoch_valid_loss = 0
    for inputs, labels, masks in val_loader:
        inputs, labels, masks = (t.to(device) for t in (inputs, labels, masks))
        optimizer.zero_grad()
        with torch.set_grad_enabled(False):
            model = model.eval()
            outputs = model(inputs)
            masked_total = sum(
                (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
                for k in range(3))
            loss = masked_total / masks.sum()
            this_epoch_valid_loss += loss.detach().cpu().numpy()

    # Average the per-batch losses over the number of batches.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint on every new best validation loss; otherwise count the stall.
    improved = this_epoch_valid_loss < lowest_val_loss
    if improved:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping after 15 consecutive epochs without improvement.
    if epochs_without_new_lowest >= 15:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6560975289344788 0.6297950029373169
Epoch:  1


0.4412185752391815 0.3143851101398468
Epoch:  2


0.23282768785953523 0.19712331295013427
Epoch:  3


0.18847896099090578 0.19449265897274018
Epoch:  4


0.18146521508693694 0.16782787144184114
Epoch:  5


0.17850462436676026 0.18522151708602905
Epoch:  6


0.17708697736263276 0.18399662971496583
Epoch:  7


0.17552177369594574 0.17466894388198853
Epoch:  8


0.17479762434959412 0.15460993647575377
Epoch:  9


0.1735765242576599 0.1706139475107193
Epoch:  10


0.1708746701478958 0.17796797156333924
Epoch:  11


0.17044443607330323 0.1561892032623291
Epoch:  12


0.1702052026987076 0.15573412477970122
Epoch:  13


0.16723655819892883 0.15794740617275238
Epoch:  14


0.16588938355445862 0.17062893211841584
Epoch    14: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  15


0.16520608484745025 0.1470714569091797
Epoch:  16


0.1623122888803482 0.14613014757633208
Epoch:  17


0.1635418117046356 0.14427826702594757
Epoch:  18


0.16380039751529693 0.14511988461017608
Epoch:  19


0.1635438859462738 0.14406073689460755
Epoch:  20


0.1626653963327408 0.14378919303417206
Epoch:  21


0.16282196938991547 0.14367855191230774
Epoch:  22


0.1637500196695328 0.14285314083099365
Epoch:  23


0.16359178721904755 0.1431365430355072
Epoch:  24


0.16252641379833221 0.14300524890422822
Epoch:  25


0.16268851935863496 0.14274137318134308
Epoch:  26


0.16321045637130738 0.14144776463508607
Epoch:  27


0.16137439727783204 0.14129977822303771
Epoch:  28


0.16098426759243012 0.1422731101512909
Epoch:  29


0.16195698142051695 0.14113753736019136
Epoch:  30


0.16084987938404083 0.1426514834165573
Epoch:  31


0.16224482476711274 0.1402804285287857
Epoch:  32


0.16147976279258727 0.14017179906368255
Epoch:  33


0.1609448504447937 0.14003827869892121
Epoch:  34


0.16115243673324586 0.13826013803482057
Epoch:  35


0.16214608371257783 0.14058474004268645
Epoch:  36


0.16086752593517303 0.139088237285614
Epoch:  37


0.16141029357910155 0.13955227732658387
Epoch:  38


0.16077641665935516 0.13923372626304625
Epoch:  39


0.1607452666759491 0.14024514555931092
Epoch:  40


0.16072264552116394 0.13687388151884078
Epoch:  41


0.1603986495733261 0.13779098391532899
Epoch:  42


0.16015786528587342 0.1384838879108429
Epoch:  43


0.1604411244392395 0.13756951987743377
Epoch:  44


0.16057476580142974 0.1365949898958206
Epoch:  45


0.159254989027977 0.136471489071846
Epoch:  46


0.16058909714221956 0.13701556473970414
Epoch:  47


0.1600024914741516 0.13734745234251022
Epoch:  48


0.15923323214054108 0.13539289683103561
Epoch:  49


0.1593913608789444 0.1364779770374298
Epoch:  50


0.1578104442358017 0.13495955169200896
Epoch:  51


0.159589746594429 0.1361492931842804
Epoch:  52


0.158570756316185 0.13449207395315171
Epoch:  53


0.15918072402477265 0.13468897193670273
Epoch:  54


0.15942831516265868 0.13541620671749116
Epoch:  55


0.158219535946846 0.13356161415576934
Epoch:  56


0.1579710626602173 0.1342471033334732
Epoch:  57


0.15692796111106871 0.13386068344116211
Epoch:  58


0.15730728089809418 0.1331954225897789
Epoch:  59


0.1580379229784012 0.13316056579351426
Epoch:  60


0.15726010084152223 0.13353291302919387
Epoch:  61


0.1570662796497345 0.1342912197113037
Epoch:  62


0.15749404549598695 0.13336091488599777
Epoch:  63


0.1576952278614044 0.1320314735174179
Epoch:  64


0.15816606998443603 0.13317207545042037
Epoch:  65


0.15758903741836547 0.13325745463371277
Epoch:  66


0.15671428322792053 0.13375889956951142
Epoch:  67


0.15684689819812775 0.13272693902254104
Epoch:  68


0.15723816752433778 0.13223954290151596
Epoch:  69


0.15629835724830626 0.1334873393177986
Epoch    69: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  70


0.15630550920963288 0.13198536187410354
Epoch:  71


0.15666367053985597 0.1313524603843689
Epoch:  72


0.15649840772151946 0.13116552233695983
Epoch:  73


0.15540282309055328 0.13089639991521834
Epoch:  74


0.15690652906894684 0.1311912417411804
Epoch:  75


0.15649473369121553 0.1314787283539772
Epoch:  76


0.15533622860908508 0.13101306259632112
Epoch:  77


0.15628064215183257 0.13079431056976318
Epoch:  78


0.15523598968982696 0.13090264648199082
Epoch:  79


0.15704080522060393 0.13135166913270951
Epoch:  80


0.1548744523525238 0.13131712526082992
Epoch:  81


0.15570680379867555 0.1311594232916832
Epoch:  82


0.15499960899353027 0.13113412857055665
Epoch:  83


0.15673680245876312 0.13101127743721008
Epoch    83: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  84


0.1568961399793625 0.130901700258255
Epoch:  85


0.15570335149765013 0.13108640313148498
Epoch:  86


0.15535182893276214 0.13130382150411607
Epoch:  87


0.15610536873340608 0.13069396913051606
Epoch:  88


0.15592992782592774 0.13054622411727906
Epoch:  89


0.15618594884872436 0.13118189424276352
Epoch:  90


0.15598358809947968 0.13120633959770203
Epoch:  91


0.1565346336364746 0.13081878870725633
Epoch:  92


0.15568435072898865 0.13072258234024048
Epoch:  93


0.1554033064842224 0.1311298355460167
Epoch:  94


0.15758196651935577 0.13098485320806502
Epoch    94: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  95


0.15578145980834962 0.1308947116136551
Epoch:  96


0.1550251615047455 0.13105497360229493
Epoch:  97


0.1568107169866562 0.1306564539670944
Epoch:  98


0.15628524422645568 0.13100335747003555
Epoch:  99


0.15773591339588167 0.1313306510448456
