In [1]:
# Parameters
# until_x: MobileNetV2 feature blocks whose index is <= until_x have their weights
# re-initialised when Task5Model is built; the value is also embedded in the file
# name of the checkpoint written by the training loop.
until_x = 15


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec', max_frames=635):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. Resetting the index must yield an
        ``audio_filename`` column; all other columns are label columns.
        Every file is expected to contribute rows for annotation slots 1, 2 and 3.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be masked out whenever that unknown column is > 0.5.
        The unknown columns themselves are removed from the targets.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
        Defaults to the location the notebook originally hard-coded.
    max_frames : int, optional
        Number of leading rows kept from each transposed array (default 635).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, rows, cols)`` where ``rows`` is at most ``max_frames``.
    y : numpy.ndarray
        Shape ``(n_files, n_labels, 3)``; the last axis is the annotation slot.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target counts towards the loss and
        0 where it is masked out.
    """
    df = df.reset_index()
    # Number each file's annotation rows 1, 2, 3, ... in their original order.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns away from the real targets.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that an annotator
    # covered with the corresponding "unknown" label.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Re-key as (slno, audio_filename) and sort so that each annotation slot
    # lists the files in the same order.
    df = df.swaplevel(0, 1).sort_index()
    y_mask = y_mask.swaplevel(0, 1).sort_index()

    annotation_slots = (1, 2, 3)
    y = np.stack([df.loc[[slot], :].values for slot in annotation_slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in annotation_slots], axis=2)

    # Load the arrays in the same file order as the targets.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:max_frames, :]
        for name in filenames])
    # Insert the singleton channel axis expected by the model.
    X = X[:, np.newaxis]

    return X, y, y_mask


# Module-level RandomErasing instance built with its default arguments.
# AudioDataset.__getitem__ applies it to each sample when a transform is configured.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Indexable dataset of (input, target, weight) triples.

    When ``transform`` is given, each input is augmented on access:
    min-max scaling, a random circular shift along axis 1, the
    albumentations-style ``transform``, random erasing, and finally the
    inverse of the min-max scaling. Without a transform the stored input
    is returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        """Store the tensors and the optional augmentation callable.

        ``transform`` is called as ``transform(image=...)`` and must return a
        mapping with an ``'image'`` entry.
        """
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of items, i.e. the size of the first axis of ``X``."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting if configured."""
        sample = self.X[idx, ...]

        # No augmentation configured: hand back the stored tensors as they are.
        if not self.transform:
            return sample, self.y[idx, ...], self.weights[idx, ...]

        # Scale into [0, 1]; the bounds are kept so the scaling can be undone.
        lo = sample.min()
        hi = sample.max()
        span = hi - lo
        sample = (sample - lo) / span

        # Rotate the sample circularly so it starts at a random offset on axis 1.
        offset = np.random.randint(sample.shape[1])
        sample = torch.roll(sample, shifts=-offset, dims=1)

        # Run the image-based augmentations on the PIL-converted sample.
        image = np.array(self.pil(sample))
        augmented = self.transform(image=image)['image']
        # Restore the leading singleton axis and swap the last two axes back.
        sample = augmented.unsqueeze(0).transpose(1, 2)

        # Random erasing operates on a detached copy.
        sample = random_erasing(sample.clone().detach())

        # Undo the min-max scaling.
        sample = sample * span + lo

        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Re-initialise ``layer`` via its ``reset_parameters`` method, if it has one.

    Intended for use with ``Module.apply``; objects without a
    ``reset_parameters`` attribute are left untouched.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()


class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    A small 1x1-convolution stem maps the single input channel to three
    channels, the MobileNetV2 feature stack processes the result, the
    feature maps are max-pooled over both spatial axes and a two-layer
    head produces ``num_classes`` logits.
    """

    def __init__(self, num_classes):
        """Build the network.

        The pretrained MobileNetV2 is loaded first; every feature block
        whose index is <= the module-level ``until_x`` is then re-initialised.
        """
        super().__init__()

        # 1 -> 10 -> 3 channels using pointwise convolutions only.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, kernel_size=1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, kernel_size=1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Discard the pretrained weights of the leading feature blocks.
        for block_idx, block in enumerate(self.mv2.features.children()):
            if block_idx > until_x:
                break
            block.apply(weight_reset)

        # Classification head on top of the 1280 pooled features.
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Return the raw logits for a batch ``x``."""
        rgb_like = self.bw2col(x)
        feature_maps = self.mv2.features(rgb_like)
        # Global max pooling: reduce the last axis, then the new last axis.
        pooled_last, _ = feature_maps.max(dim=-1)
        pooled, _ = pooled_last.max(dim=-1)
        return self.final(pooled)

In [4]:
# Load and prepare data
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the fine labels whose fine_id is 'X' (treated here as
# the "unknown" labels) and all remaining, known fine labels.
taxonomy_df = metadata['taxonomy_df']
unknown_fine = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# Pair every unknown label with the known labels of the same coarse class,
# then collect those known labels into one list per unknown label.
unknown_known_pairs = (
    unknown_fine
    .merge(known_fine, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = {
    unknown: list(known)
    for unknown, known in unknown_known_pairs.groupby('fine_x')['fine_y']}
known_labels = taxonomy_df.loc[taxonomy_df.fine_id != 'X', 'fine'].tolist()

# Place the coarse and fine label columns side by side for each split.
train_df = pd.concat(
    [metadata[key] for key in ('coarse_train', 'fine_train')], axis=1, sort=True)
valid_df = pd.concat(
    [metadata[key] for key in ('coarse_test', 'fine_test')], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose label values sum to 37 is reset to all zeros
# (presumably a row with every label set -- confirm against the label schema).
for labels_df in (train_df, valid_df):
    saturated_rows = labels_df.sum(axis=1) == 37
    labels_df.loc[saturated_rows, :] = 0

In [6]:
# Build the arrays for both splits. Per prepare_data: X has shape
# (n_files, 1, T, F) with T capped at 635, while y and y_mask have shape
# (n_files, n_labels, 3), the last axis indexing each file's annotation rows 1-3.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean and standard deviation are estimated per position of the last axis using
# the training split only, then applied to both splits so validation data is
# scaled with training statistics. The channel count is read from the data
# instead of being hard-coded, so a different last-axis size cannot be
# silently mis-grouped by the reshape.
n_channels = train_X.shape[-1]
train_flat = train_X.reshape(-1, n_channels)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Pipeline used by AudioDataset for training samples only (the validation
# dataset is created with transform=None): a random shift/scale/rotate with
# limits 0.1 / 0.1 / 0.5, a grid distortion with default settings, and finally
# conversion of the augmented image to a tensor.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are served as stored.
train_tensors = [torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask)]
train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size=96, shuffle=False)
# Two independently shuffled views of the same training set; the training loop
# zips them together to obtain the sample pairs used for mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is actually available and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 output logits; presumably the number of label columns that remain after
# prepare_data drops the "unknown" columns -- TODO confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad variant enabled, starting from a learning rate of 1e-3.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped once per epoch with the validation loss by the training loop;
# patience=5 is passed as the plateau tolerance and verbose=True makes the
# scheduler print a message whenever it lowers the learning rate.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the annotation masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
# Stop once this many consecutive epochs fail to improve the validation loss.
early_stopping_patience = 15
# Beta(alpha, alpha) parameter used to draw the per-sample mixup coefficients.
alpha = 1

train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Mask-weighted mean of the element-wise criterion over all annotation sets.

    ``outputs`` holds one logit per label; ``labels`` and ``masks`` carry an
    extra trailing axis with one slice per annotation set. Each slice is
    scored against the same outputs, multiplied by its mask, and the summed
    loss is divided by the total mask weight.
    """
    per_annotation = [
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2])]
    return sum(per_annotation) / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently, which yields a
    # random partner for every sample.
    for (inputs_1, labels_1, masks_1), (inputs_2, labels_2, masks_2) in zip(
            train_loader_1, train_loader_2):

        # mixup: one coefficient per sample, shared by inputs, labels and masks
        mixup_vals = np.random.beta(alpha, alpha, inputs_1.shape[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam * inputs_1) + ((1 - lam) * inputs_2)

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1))
        labels = (lam * labels_1) + ((1 - lam) * labels_2)
        masks = (lam * masks_1) + ((1 - lam) * masks_2)

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Average the per-batch losses over the number of batches.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Log before the early-stopping check so the final epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6550556111335755 0.5869999766349793
Epoch:  1


0.444218715429306 0.286295485496521
Epoch:  2


0.2355368560552597 0.19906940162181855
Epoch:  3


0.18840612947940827 0.17619036436080932
Epoch:  4


0.18238052368164062 0.1751379817724228
Epoch:  5


0.1803999000787735 0.5147569715976715
Epoch:  6


0.17834421634674072 0.17035117745399475
Epoch:  7


0.17874594926834106 0.16253333687782287
Epoch:  8


0.17709487318992614 0.17343420088291167
Epoch:  9


0.17617089807987213 0.16142005920410157
Epoch:  10


0.1754740858078003 0.16734130680561066
Epoch:  11


0.17429473757743835 0.16150318682193757
Epoch:  12


0.17302814602851868 0.15876488089561464
Epoch:  13


0.17279582619667053 0.1584412395954132
Epoch:  14


0.17227415025234222 0.15673872828483582
Epoch:  15


0.17108115494251253 0.15767802298069
Epoch:  16


0.16958865642547608 0.15722126960754396
Epoch:  17


0.16987562358379363 0.15117580592632293
Epoch:  18


0.17078534603118897 0.15687435567379
Epoch:  19


0.1670522028207779 0.15103902220726012
Epoch:  20


0.16755573332309723 0.16090647876262665
Epoch:  21


0.16699568450450897 0.1467751145362854
Epoch:  22


0.1647111451625824 0.14527035653591155
Epoch:  23


0.1652908754348755 0.14504560828208923
Epoch:  24


0.16393752932548522 0.1439393162727356
Epoch:  25


0.16397081613540648 0.14511452615261078
Epoch:  26


0.16274801313877105 0.1483340382575989
Epoch:  27


0.16352280497550964 0.14640657007694244
Epoch:  28


0.16240388691425323 0.14259989261627198
Epoch:  29


0.16281619668006897 0.14135486185550689
Epoch:  30


0.16015150010585785 0.14059706181287765
Epoch:  31


0.16114240229129792 0.13904089331626893
Epoch:  32


0.15967290937900544 0.14257192313671113
Epoch:  33


0.16106339633464814 0.14433516263961793
Epoch:  34


0.1594795900583267 0.13646938949823378
Epoch:  35


0.1603846448659897 0.14220738410949707
Epoch:  36


0.15935587406158447 0.1396637737751007
Epoch:  37


0.1583084762096405 0.14376424849033356
Epoch:  38


0.15857636868953706 0.14052113592624665
Epoch:  39


0.15629201292991637 0.13865346908569337
Epoch:  40


0.157936714887619 0.1359566032886505
Epoch:  41


0.15611593544483185 0.14162738919258117
Epoch:  42


0.15610808968544007 0.13587018698453904
Epoch:  43


0.15632813274860383 0.14111341238021852
Epoch:  44


0.15747612535953523 0.1360668957233429
Epoch:  45


0.15577075958251954 0.13199030607938766
Epoch:  46


0.15506964981555937 0.1346582517027855
Epoch:  47


0.15536829054355622 0.1423032283782959
Epoch:  48


0.15577836334705353 0.13818913996219634
Epoch:  49


0.1564112877845764 0.14206449389457704
Epoch:  50


0.15498779594898224 0.13472423553466797
Epoch:  51


0.15505568265914918 0.1310584470629692
Epoch:  52


0.1540858793258667 0.1313883677124977
Epoch:  53


0.15350817680358886 0.14770518541336058
Epoch:  54


0.15319080889225006 0.13214110285043718
Epoch:  55


0.15471030294895172 0.14076015651226043
Epoch:  56


0.154973566532135 0.1307120755314827
Epoch:  57


0.1547157061100006 0.14580101668834686
Epoch:  58


0.1534421533346176 0.13102518320083617
Epoch:  59


0.15314020872116088 0.1310612827539444
Epoch:  60


0.15275072932243347 0.13186316937208176
Epoch:  61


0.1517501002550125 0.13800232857465744
Epoch:  62


0.15230247735977173 0.13467466980218887
Epoch    62: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  63


0.15157224714756012 0.1265041172504425
Epoch:  64


0.150166876912117 0.12614311873912812
Epoch:  65


0.1499348211288452 0.12592779099941254
Epoch:  66


0.14839258551597595 0.12584008872509003
Epoch:  67


0.1508983701467514 0.12570113241672515
Epoch:  68


0.15080344796180725 0.1251526802778244
Epoch:  69


0.14873468697071077 0.1253503069281578
Epoch:  70


0.1500337564945221 0.12604270577430726
Epoch:  71


0.1497017824649811 0.1252389132976532
Epoch:  72


0.14927159428596495 0.12542959302663803
Epoch:  73


0.14953287005424498 0.1255842223763466
Epoch:  74


0.14932915925979615 0.12519460320472717
Epoch    74: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  75


0.1497218257188797 0.12521470934152604
Epoch:  76


0.14769111156463624 0.1252966746687889
Epoch:  77


0.14840566337108613 0.1251058831810951
Epoch:  78


0.14856432020664215 0.1251928836107254
Epoch:  79


0.14868506491184236 0.1250568300485611
Epoch:  80


0.14905371367931367 0.12487314045429229
Epoch:  81


0.14899802803993226 0.12503194212913513
Epoch:  82


0.14819430232048034 0.1250985622406006
Epoch:  83


0.14915349543094636 0.12477730065584183
Epoch:  84


0.14827511191368103 0.1250227004289627
Epoch:  85


0.14846921980381012 0.1248783990740776
Epoch:  86


0.14800243079662323 0.12517262399196624
Epoch:  87


0.14888040840625763 0.1250287964940071
Epoch:  88


0.14849087953567505 0.12483466565608978
Epoch:  89


0.15029819130897523 0.12493345886468887
Epoch    89: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  90


0.1484017127752304 0.12505025416612625
Epoch:  91


0.1471996170282364 0.12483837008476258
Epoch:  92


0.14768488764762877 0.12476620376110077
Epoch:  93


0.1485350376367569 0.1249268740415573
Epoch:  94


0.14962165653705597 0.12516676485538483
Epoch:  95


0.14805747389793397 0.1250947058200836
Epoch    95: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  96


0.14822797179222108 0.12500067055225372
Epoch:  97


0.148159983754158 0.12498299926519393
Epoch:  98


0.14848218917846678 0.12480638027191163
Epoch:  99


0.14823972404003144 0.1248590186238289
