In [1]:
# Parameters
until_x = 1


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known):
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df.set_index(['audio_filename', 'slno'], inplace=True)

    df_unknown = df.copy().loc[:, list(unknown_to_known.keys())]
    df.drop(columns=list(unknown_to_known.keys()), inplace=True)

    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[
            df_unknown[unknown] > 0.5,
            known
        ] = 0

    df = df.swaplevel(i=1, j=0, axis=0).sort_index()

    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    y = np.concatenate([
        df.loc[[1], :].values[..., np.newaxis],
        df.loc[[2], :].values[..., np.newaxis],
        df.loc[[3], :].values[..., np.newaxis]
    ], axis=2)

    y_mask = np.concatenate([
        y_mask.loc[[1], :].values[..., np.newaxis],
        y_mask.loc[[2], :].values[..., np.newaxis],
        y_mask.loc[[3], :].values[..., np.newaxis]
    ], axis=2)

    X = np.concatenate([
        np.expand_dims(np.load('../../data/logmelspec/{}.npy'.format(x)).T[:635, :], axis=0)
        for x in df.loc[[1], :].reset_index(1).audio_filename.tolist()])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


random_erasing = RandomErasing()


class AudioDataset(Dataset):

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            sample = (sample - this_min) / (this_max - this_min)

            # randomly cycle the file
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * (this_max - this_min)) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    if hasattr(layer, 'reset_parameters'):
        layer.reset_parameters()


class Task5Model(nn.Module):

    def __init__(self, num_classes):

        super().__init__()

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)
        
        # Reset after ith layer of mv2
        for i, x in enumerate(self.mv2.features.children()):
            if i > until_x:
                x.apply(weight_reset)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

    def forward(self, x):
        x = self.bw2col(x)
        x = self.mv2.features(x)
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [4]:
# Load and prepare data
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
train_df.loc[(train_df.sum(axis=1) == 37).copy(), :] = 0
valid_df.loc[(valid_df.sum(axis=1) == 37).copy(), :] = 0

In [6]:
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
channel_means = train_X.reshape(-1, 128).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, 128).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
train_dataset = AudioDataset(torch.Tensor(train_X),
                             torch.Tensor(train_y),
                             torch.Tensor(train_y_mask),
                             albumentations_transform)
valid_dataset = AudioDataset(torch.Tensor(valid_X),
                             torch.Tensor(valid_y),
                             torch.Tensor(valid_y_mask),
                             None)

val_loader = DataLoader(valid_dataset, 96, shuffle=False)
train_loader_1 = DataLoader(train_dataset, 96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, 96, shuffle=True)

In [10]:
# Define the device to be used
cuda = True
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            model = model.train()
            outputs = model(inputs)
            # calculate loss for each set of annotations
            loss_0 = criterion(outputs, labels[:, :, 0]) * masks[:, :, 0]
            loss_1 = criterion(outputs, labels[:, :, 1]) * masks[:, :, 1]
            loss_2 = criterion(outputs, labels[:, :, 2]) * masks[:, :, 2]
            loss = (loss_0.sum() + loss_1.sum() + loss_2.sum()) / masks.sum()
            loss.backward()
            optimizer.step()
            this_epoch_train_loss += loss.detach().cpu().numpy()

    this_epoch_valid_loss = 0
    for inputs, labels, masks in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        optimizer.zero_grad()
        with torch.set_grad_enabled(False):
            model = model.eval()
            outputs = model(inputs)
            loss_0 = criterion(outputs, labels[:, :, 0]) * masks[:, :, 0]
            loss_1 = criterion(outputs, labels[:, :, 1]) * masks[:, :, 1]
            loss_2 = criterion(outputs, labels[:, :, 2]) * masks[:, :, 2]
            loss = (loss_0.sum() + loss_1.sum() + loss_2.sum()) / masks.sum()
            this_epoch_valid_loss += loss.detach().cpu().numpy()

    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= 15:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6489894247055054 0.5720526576042175
Epoch:  1


0.4296478235721588 0.2715633809566498
Epoch:  2


0.22772520244121552 0.17947062849998474
Epoch:  3


0.18552123069763182 0.17197493612766265
Epoch:  4


0.17680976808071136 0.18789510428905487
Epoch:  5


0.1757171607017517 0.16708610951900482
Epoch:  6


0.17364707887172698 0.165547177195549
Epoch:  7


0.171232128739357 0.1689281702041626
Epoch:  8


0.1700538569688797 0.1599130302667618
Epoch:  9


0.1683540564775467 0.1693756252527237
Epoch:  10


0.1673583847284317 0.1522935152053833
Epoch:  11


0.16717419564723968 0.14745262563228606
Epoch:  12


0.16502814173698424 0.14599752724170684
Epoch:  13


0.16401835799217224 0.14605234265327455
Epoch:  14


0.16319584965705872 0.15900597274303435
Epoch:  15


0.1614484131336212 0.14401543140411377
Epoch:  16


0.161279274225235 0.13970788717269897
Epoch:  17


0.1607289171218872 0.13676985651254653
Epoch:  18


0.16103184342384338 0.14047250747680665
Epoch:  19


0.16032703518867492 0.13449939489364623
Epoch:  20


0.15933684825897218 0.13987676203250884
Epoch:  21


0.1593652194738388 0.15619206428527832
Epoch:  22


0.15687362968921661 0.13928310573101044
Epoch:  23


0.1578880000114441 0.1320806235074997
Epoch:  24


0.1585401052236557 0.1325435996055603
Epoch:  25


0.15824542939662933 0.13263405859470367
Epoch:  26


0.15715806186199188 0.13838013410568237
Epoch:  27


0.15608572721481323 0.13196926265954972
Epoch:  28


0.15664737403392792 0.1303403005003929
Epoch:  29


0.15585252940654754 0.13881980180740355
Epoch:  30


0.15552015125751495 0.13679813593626022
Epoch:  31


0.15563742637634279 0.13638291656970977
Epoch:  32


0.15409413814544679 0.13573411405086516
Epoch:  33


0.15689437389373778 0.13509840965270997
Epoch:  34


0.15420886516571045 0.145575413107872
Epoch    34: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  35


0.15423185884952545 0.12799694687128066
Epoch:  36


0.15373129308223724 0.1278686821460724
Epoch:  37


0.1533648133277893 0.12723840028047562
Epoch:  38


0.15163076281547547 0.12680411487817764
Epoch:  39


0.1528064626455307 0.12752795666456224
Epoch:  40


0.1525387805700302 0.12691202759742737
Epoch:  41


0.15181885063648223 0.12726743668317794
Epoch:  42


0.15215690195560455 0.12685247361660004
Epoch:  43


0.15192690670490264 0.12730568200349807
Epoch:  44


0.15103300750255585 0.12714900970458984
Epoch    44: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  45


0.15068814694881438 0.1266968861222267
Epoch:  46


0.1503717839717865 0.1268640160560608
Epoch:  47


0.1520116239786148 0.12652883529663086
Epoch:  48


0.15145110845565796 0.12627285420894624
Epoch:  49


0.15136381208896638 0.1266998365521431
Epoch:  50


0.15180091381073 0.12647010535001754
Epoch:  51


0.15162917852401733 0.12655202448368072
Epoch:  52


0.15204802334308623 0.12639233618974685
Epoch:  53


0.1514504635334015 0.12643300741910934
Epoch:  54


0.15212657034397126 0.12612471580505372
Epoch:  55


0.1514825189113617 0.12629021555185319
Epoch:  56


0.15170234322547912 0.12652243226766585
Epoch:  57


0.15161075532436372 0.12631494849920272
Epoch:  58


0.15089996814727782 0.12662922590970993
Epoch:  59


0.15281920731067658 0.12623369097709655
Epoch:  60


0.15233078062534333 0.12631100863218309
Epoch    60: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  61


0.15075381338596344 0.1265715941786766
Epoch:  62


0.15070194661617278 0.12631111294031144
Epoch:  63


0.1518961036205292 0.1261897549033165
Epoch:  64


0.15131975293159486 0.12666709423065187
Epoch:  65


0.15089443802833558 0.1265757218003273
Epoch:  66


0.1513318169116974 0.12658334225416185
Epoch    66: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  67


0.15184075772762298 0.1266726791858673
Epoch:  68


0.1511041498184204 0.12645985931158066
Epoch:  69
