In [1]:
# Parameters
# Index of the last child of `mv2.features` that Task5Model re-initializes:
# children 0..until_x get fresh weights, later children keep the pretrained
# ones. Also used as the suffix of the saved checkpoint file name.
until_x = 17


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec', n_frames=635):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table whose index is named ``audio_filename``, with one row
        per annotation and one column per label. The code selects annotation
        slots 1, 2 and 3 for every file, so each file is assumed to have
        exactly three rows -- TODO confirm against the metadata.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that are masked out (mask = 0) in rows where the unknown label is
        greater than 0.5. The unknown columns themselves are dropped from the
        targets.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
        Defaults to the location the notebook has always used.
    n_frames : int, optional
        Number of leading rows kept from each transposed array. Files with
        fewer rows make the final stacking fail.

    Returns
    -------
    X : numpy.ndarray
        Stacked arrays with shape ``(n_files, 1, n_frames, n_columns)``.
    y : numpy.ndarray
        Label values with shape ``(n_files, n_labels, 3)``; the last axis is
        the annotation slot.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target counts towards the loss, 0
        where it is masked.
    """
    # Number the annotations of each file 1, 2, 3, ... in order of appearance.
    df = df.reset_index()
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" columns off; they only drive the mask.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start with every target enabled, then disable the known labels of a
    # group in every row where the matching unknown label was chosen.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Make the annotation slot the outer index level so that `.loc[[k]]`
    # returns the k-th annotation of every file, files in sorted order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = (1, 2, 3)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    # Load the arrays in the same file order as the rows of y / y_mask.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    # Add the singleton channel axis expected by the convolutional model.
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (constructed with its default arguments);
# AudioDataset.__getitem__ applies it to augmented samples.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of sample tensors with per-sample targets and loss weights.

    When ``transform`` is given, every sample is augmented on the fly:
    min-max scaled, cyclically shifted along dim 1, passed through the
    albumentations pipeline and the module-level ``random_erasing``, and
    finally mapped back to its original value range. Without a transform the
    stored sample is returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        """Store the tensors and the optional augmentation pipeline.

        Args:
            X: tensor of samples, indexed along dim 0.
            y: tensor of targets, indexed along dim 0.
            weights: tensor of loss weights / masks, indexed along dim 0.
            transform: optional callable invoked as ``transform(image=...)``
                that returns a dict with an ``'image'`` entry.
        """
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converts a sample tensor to a PIL image before it is handed to the
        # albumentations pipeline.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of dim 0 of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weights)`` for index ``idx``."""
        sample = self.X[idx, ...]

        if self.transform:
            # Min-max scale the sample. A constant sample has a zero value
            # range; fall back to a unit range so the division below cannot
            # produce NaNs (the sample then simply stays constant).
            this_min = sample.min()
            value_range = sample.max() - this_min
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # Randomly cycle the sample along dim 1: position `shift` becomes
            # the new start and the leading part wraps around to the end.
            shift = np.random.randint(sample.shape[1])
            sample = torch.roll(sample, shifts=-shift, dims=1)

            # Apply the albumentations transforms on an image view of the
            # sample.
            # NOTE(review): ToPILImage presumably quantizes the scaled sample
            # to 8 bits -- confirm this precision loss is acceptable.
            image = np.array(self.pil(sample))
            sample = self.transform(image=image)['image']
            # Restore the leading channel axis and swap the last two axes
            # back (presumably undoing the axis order produced by ToTensor
            # -- confirm).
            sample = sample[None, :, :].permute(0, 2, 1)

            # Apply random erasing on a detached copy.
            sample = random_erasing(sample.clone().detach())

            # Revert the min-max transformation.
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Re-initialize ``layer`` if it exposes ``reset_parameters``; otherwise do nothing."""
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()


class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel inputs.

    A small stack of 1x1 convolutions maps the 1-channel input to 3 channels,
    the MobileNetV2 feature extractor processes the result, the feature map
    is max-pooled over its last two dimensions, and a two-layer head produces
    ``num_classes`` logits.
    """

    def __init__(self, num_classes):
        """Build the network.

        Args:
            num_classes: number of output logits.
        """
        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions, preceded by batch norm.
        colorize_layers = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, kernel_size=1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, kernel_size=1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*colorize_layers)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Re-initialize the feature blocks with index 0..until_x (module-level
        # parameter); blocks after that keep their pretrained weights.
        for block_idx, block in enumerate(self.mv2.features.children()):
            if block_idx > until_x:
                break
            block.apply(weight_reset)

        head_layers = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the logits for a batch ``x`` of single-channel inputs."""
        three_channel = self.bw2col(x)
        feature_map = self.mv2.features(three_channel)
        # Global max pooling: reduce the last dimension twice.
        pooled, _ = feature_map.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code -- only open metadata files
# that come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy = metadata['taxonomy_df']
label_cols = ['fine', 'coarse']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', label_cols]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', label_cols]

# For every fine label with fine_id 'X', collect the other fine labels that
# share its coarse class: {unknown fine label: [known fine labels]}.
unknown_known_pairs = pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
unknown_known_pairs = unknown_known_pairs.drop(columns='coarse')
unknown_to_known = unknown_known_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()

known_labels = taxonomy.loc[taxonomy.fine_id != 'X'].fine.tolist()

# Put the coarse and fine annotation columns of each split side by side.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose values sum to 37 is zeroed out, in both splits
# (37 is presumably the total number of label columns, i.e. every label
# was marked -- TODO confirm).
for _split_df in (train_df, valid_df):
    _row_sums_to_37 = _split_df.sum(axis=1) == 37
    _split_df.loc[_row_sums_to_37, :] = 0

In [6]:
# Build the inputs (X), per-annotation targets (y) and loss masks (y_mask)
# for the training and validation splits.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean and std are computed per position of the last axis (128 wide) on the
# training set only, then applied to both splits.
_train_flat = train_X.reshape(-1, 128)
channel_means = _train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = _train_flat.std(axis=0).reshape(1, 1, 1, -1)
del _train_flat  # do not keep the un-normalized training data alive


def _standardize(batch):
    """Subtract the training means and divide by the training stds."""
    return (batch - channel_means) / channel_stds


train_X = _standardize(train_X)
valid_X = _standardize(valid_X)

In [8]:
# Define the data augmentation transformations
# Passed to the training AudioDataset only; the validation dataset is built
# with transform=None.
albumentations_transform = Compose([
    # Small random shift / scale / rotation (rotate_limit=0.5 is presumably
    # in degrees -- confirm in the albumentations docs).
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # Grid distortion with the library's default settings.
    GridDistortion(),
    # Convert the augmented image to a torch tensor.
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
BATCH_SIZE = 96

# Only the training dataset gets the augmentation pipeline.
train_dataset = AudioDataset(
    *(torch.Tensor(array) for array in (train_X, train_y, train_y_mask)),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    *(torch.Tensor(array) for array in (valid_X, valid_y, valid_y_mask)),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Two independently shuffled loaders over the same training set; the
# training loop zips them to draw the sample pairs that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when one is available; otherwise fall back to the CPU so
# the notebook still runs (slowly) instead of crashing on `.to(device)`.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 output logits -- presumably the number of label columns left after
# prepare_data drops the unknown ones; TODO confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped once per epoch with the validation loss in the training loop.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
early_stopping_patience = 15  # epochs without a new best validation loss
mixup_alpha = 1               # Beta(alpha, alpha) parameter of the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked binary cross-entropy averaged over the mask total.

    Args:
        outputs: logits, one column per label.
        labels: targets whose last axis indexes the annotation set.
        masks: same shape as ``labels``; weights applied to the element-wise
            losses (0 removes an entry from the loss).

    Returns:
        Scalar tensor: sum of the masked losses over all annotation sets
        divided by ``masks.sum()``.
    """
    per_annotation = (
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return sum(per_annotation) / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup: blend each sample of batch_a with the sample at the same
        # position of the independently shuffled batch_b.
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0])

        lam_inputs = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam_inputs * batch_a[0]) + ((1 - lam_inputs) * batch_b[0])

        # Targets and masks have one axis fewer than the inputs.
        lam_targets = lam_inputs.reshape(-1, 1, 1)
        labels = (lam_targets * batch_a[1]) + ((1 - lam_targets) * batch_b[1])
        masks = (lam_targets * batch_a[2]) + ((1 - lam_targets) * batch_b[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            loss = masked_bce_loss(model(inputs), labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            loss = masked_bce_loss(model(inputs), labels, masks)
            this_epoch_valid_loss += loss.item()

    # Average of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the final epoch is shown too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping.
    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6516565179824829 0.6672799468040467
Epoch:  1


0.4312399673461914 0.29291815757751466
Epoch:  2


0.23021785140037537 0.18589355945587158
Epoch:  3


0.18933925688266753 0.1803312122821808
Epoch:  4


0.18293803811073303 0.16598188877105713
Epoch:  5


0.1790076607465744 0.1664512813091278
Epoch:  6


0.17902911901474 0.17297112345695495
Epoch:  7


0.17677468955516815 0.1615406960248947
Epoch:  8


0.17626385629177094 0.16877431571483612
Epoch:  9


0.1739164900779724 0.16192878782749176
Epoch:  10


0.1736091762781143 0.16506998538970946
Epoch:  11


0.1737418633699417 0.8843846201896668
Epoch:  12


0.17322859644889832 0.16933578848838807
Epoch:  13


0.17186494290828705 0.16031500697135925
Epoch:  14


0.17202531278133393 0.1595684379339218
Epoch:  15


0.17120808303356172 0.16000906527042388
Epoch:  16


0.17183995962142945 0.15281476080417633
Epoch:  17


0.17068984270095824 0.15126369893550873
Epoch:  18


0.1699280858039856 0.1551258772611618
Epoch:  19


0.17018908739089966 0.17420855164527893
Epoch:  20


0.1691812425851822 0.14754674434661866
Epoch:  21


0.16707943379878998 0.1490993469953537
Epoch:  22


0.16763619542121888 0.15132131278514863
Epoch:  23


0.16674548506736755 0.15296690464019774
Epoch:  24


0.16702865600585937 0.1487545281648636
Epoch:  25


0.1650189006328583 0.14759005308151246
Epoch:  26


0.16422832727432252 0.15082938075065613
Epoch    26: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  27


0.1646816152334213 0.14220994412899018
Epoch:  28


0.1635361832380295 0.14152538180351257
Epoch:  29


0.1634868347644806 0.1405959963798523
Epoch:  30


0.16199497759342193 0.14073630571365356
Epoch:  31


0.1625322949886322 0.14039029479026793
Epoch:  32


0.16137019217014312 0.14066495895385742
Epoch:  33


0.1615407258272171 0.14034690260887145
Epoch:  34


0.16231490731239318 0.13971685469150544
Epoch:  35


0.16301092743873596 0.13973985612392426
Epoch:  36


0.16117610454559325 0.13970888257026673
Epoch:  37


0.16203074634075165 0.14117491543292998
Epoch:  38


0.16071299731731414 0.13915874063968658
Epoch:  39


0.16089588403701782 0.13995009958744048
Epoch:  40


0.16040797472000123 0.13928678929805755
Epoch:  41


0.16019586026668547 0.14625259041786193
Epoch:  42


0.16279079377651215 0.13944964706897736
Epoch:  43


0.16108007192611695 0.13899512887001036
Epoch:  44


0.16036885738372803 0.13877954781055452
Epoch:  45


0.16120970368385315 0.13938502073287964
Epoch:  46


0.1610042804479599 0.13845776617527009
Epoch:  47


0.16099408030509949 0.13855283856391906
Epoch:  48


0.16035122632980348 0.1376028761267662
Epoch:  49


0.16066213190555573 0.13872527480125427
Epoch:  50


0.1604445105791092 0.13788994699716567
Epoch:  51


0.16009533286094665 0.1374215990304947
Epoch:  52


0.1612776255607605 0.13726864606142045
Epoch:  53


0.16020553827285766 0.13779034614562988
Epoch:  54


0.16169297337532043 0.1366847962141037
Epoch:  55


0.16109555780887605 0.13802171647548675
Epoch:  56


0.15996274828910828 0.13845290541648864
Epoch:  57


0.16031421482563019 0.13673669397830962
Epoch:  58


0.15910627961158752 0.13827879428863527
Epoch:  59


0.1600336390733719 0.13701408952474595
Epoch:  60


0.15919476747512817 0.13720586597919465
Epoch    60: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  61


0.15939586102962494 0.13682182878255844
Epoch:  62


0.16076990008354186 0.13672395646572114
Epoch:  63


0.15948351323604584 0.13618920147418975
Epoch:  64


0.15935729205608368 0.13626639991998674
Epoch:  65


0.1573786622285843 0.13665837198495864
Epoch:  66


0.1596325832605362 0.13647595494985582
Epoch:  67


0.15787784695625307 0.13638421297073364
Epoch:  68


0.15942548096179962 0.1366203859448433
Epoch:  69


0.15993067622184753 0.1366836056113243
Epoch    69: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  70


0.15939262509346008 0.13653322905302048
Epoch:  71


0.1583003121614456 0.1365234687924385
Epoch:  72


0.15899104058742522 0.1364775463938713
Epoch:  73


0.1602531450986862 0.1364331364631653
Epoch:  74


0.1590472024679184 0.13625125586986542
Epoch:  75


0.16075328409671782 0.1369232341647148
Epoch    75: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  76


0.1595356982946396 0.13666609078645706
Epoch:  77


0.15972568929195405 0.1365098163485527
Epoch:  78
