In [1]:
# Parameters
# Position threshold into the MobileNetV2 feature blocks: Task5Model re-initialises
# every block of `mv2.features` whose position is greater than this value.
# The value is also embedded in the checkpoint filename written by the training loop.
until_x = 15


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, n_annotations=3, n_frames=635,
                 spec_dir='../../data/logmelspec'):
    """Build model inputs, per-annotation labels and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table whose index is named ``audio_filename`` and which holds
        one row per annotation of a recording. Its columns are label columns,
        including every key and every value listed in ``unknown_to_known``.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps an "unknown" label column to the list of known label columns that
        must be masked out whenever that unknown column is above 0.5.
    n_annotations : int, optional
        Number of annotation rows (numbered 1..n_annotations) stacked per
        recording. Defaults to 3, the value previously hard-coded.
    n_frames : int, optional
        Number of leading time frames kept from every spectrogram
        (defaults to 635, the value previously hard-coded).
    spec_dir : str, optional
        Directory containing one ``<audio_filename>.npy`` spectrogram per recording.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_recordings, 1, n_frames, n_channels)``; each loaded array is
        transposed and cropped to its first ``n_frames`` rows.
    y : numpy.ndarray
        Shape ``(n_recordings, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label contributes to the loss, 0 where
        the matching unknown column was above 0.5 for that annotation.

    Raises
    ------
    ValueError
        If ``n_annotations`` is smaller than 1.
    """
    if n_annotations < 1:
        raise ValueError('n_annotations must be at least 1')

    df = df.reset_index()
    # Number the annotation rows of every recording 1, 2, 3, ...
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" columns off; only the known labels become targets.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that an active
    # unknown label makes ambiguous.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that one annotation round can be
    # selected at a time, with recordings in sorted order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in annotation_ids], axis=2)

    # Load the spectrograms in the same recording order as the labels.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (class imported from the parent directory);
# AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, labels, weights)`` triples, optionally augmented.

    Parameters
    ----------
    X : tensor indexed along its first axis to obtain one sample. The
        augmentation path uses ``torch.cat`` on it, so it must be a torch tensor
        when ``transform`` is given.
    y : labels, indexed along the first axis like ``X``.
    weights : per-label weights / masks, indexed along the first axis like ``X``.
    transform : optional callable invoked as ``transform(image=array)`` that
        returns a mapping with an ``'image'`` entry. When falsy, samples are
        returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converter used to hand the sample to the image-based transform.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples, i.e. the size of the first axis of ``X``."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``; ``sample`` is augmented when a transform is set."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation to [0, 1]
            this_min = sample.min()
            this_max = sample.max()
            scale = this_max - this_min
            if scale == 0:
                # A constant sample has a zero range; dividing by it would fill
                # the sample with NaN. Use a unit scale so it maps to all zeros
                # and is restored exactly by the inverse transformation below.
                scale = torch.ones_like(scale)
            sample = (sample - this_min) / scale

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on an image array)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # NOTE(review): the permute presumably restores the original
            # (1, axis1, axis2) layout after the transform - confirm against
            # the ToTensor implementation in use.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level `random_erasing` instance)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * scale) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code - only point this at a
# metadata file from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# For every "unknown" fine label (fine_id == 'X'), collect the known fine labels
# that belong to the same coarse category.
unknown_to_known = (
    metadata['taxonomy_df']
    .loc[lambda t: t.fine_id == 'X', ['fine', 'coarse']]
    .merge(
        metadata['taxonomy_df'].loc[lambda t: t.fine_id != 'X', ['fine', 'coarse']],
        on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = metadata['taxonomy_df'].loc[lambda t: t.fine_id != 'X', 'fine'].tolist()

# Coarse and fine annotations side by side, one table per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis='columns', sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis='columns', sort=True)

In [5]:
# manual correction for one data point
# Any row whose label values add up to exactly 37 is zeroed out, in both splits.
train_df.loc[train_df.sum(axis='columns').eq(37), :] = 0
valid_df.loc[valid_df.sum(axis='columns').eq(37), :] = 0

In [6]:
# Turn each annotation table into (inputs, labels, loss mask) arrays.
# The spectrograms are read from disk inside prepare_data.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean and standard deviation are computed per channel (last axis) on the
# training split only and reused for the validation split. The channel count is
# read from the data rather than hard-coded to 128, so other spectrogram
# resolutions are handled correctly.
# NOTE: this cell overwrites train_X / valid_X; re-running it without re-running
# the data preparation cell normalises the data a second time.
channel_means = train_X.reshape(-1, train_X.shape[-1]).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, train_X.shape[-1]).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in order: a small random shift / scale / rotation, a grid distortion,
# and finally conversion of the image array to a tensor.
albumentations_transform = Compose(
    [
        ShiftScaleRotate(rotate_limit=0.5, scale_limit=0.1, shift_limit=0.1),
        GridDistortion(),
        ToTensor(),
    ]
)

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; the validation dataset gets no transform.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=96, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain the pairs of batches it mixes together.
train_loader_1 = DataLoader(train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` through its ``reset_parameters`` method, if it has one.

    Objects without that attribute are left untouched. Intended to be passed to
    ``Module.apply`` so that it visits every sub-module.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel inputs.

    A small 1x1-convolution adapter maps the single input channel to three
    channels, the MobileNetV2 feature extractor processes the result, a global
    max over the two trailing axes pools the feature map, and a two-layer head
    produces ``num_classes`` logits.

    The MobileNetV2 blocks positioned after the module-level ``until_x`` index
    are re-initialised with ``weight_reset``; earlier blocks keep the weights
    loaded by ``mobilenet_v2(pretrained=True)``.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Batch-norm followed by two 1x1 convolutions: 1 -> 10 -> 3 channels.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classification head on top of the 1280 pooled features.
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Reset after ith layer of mv2
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits produced for the input batch ``x``."""
        features = self.mv2.features(self.bw2col(x))
        # Global max pooling: reduce the last axis, then the new last axis.
        reduced, _ = features.max(dim=-1)
        pooled, _ = reduced.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 output logits; the module is then moved to the selected device.
model = Task5Model(num_classes=31)
model = model.to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# reduction='none' keeps the loss per element so the training loop can apply
# its masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# Lower the learning rate once the monitored value has stopped improving for
# more than `patience` epochs; verbose=True prints a message on each reduction.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)

In [15]:
epochs = 100
early_stopping_patience = 25  # stop after this many epochs without a new best validation loss
mixup_alpha = 1               # both shape parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked loss averaged over every annotation set.

    ``labels`` and ``masks`` carry the annotation sets along their last axis.
    The element-wise ``criterion`` is evaluated against each set, multiplied by
    the matching mask, summed, and divided by the total mask weight. The divisor
    is clamped to a tiny positive value so that a fully masked batch yields 0
    instead of NaN.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum().clamp(min=1e-8)


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0.0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently drawn batches, one coefficient per sample
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, i1[0].shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.view(-1, 1, 1, 1)   # broadcasts over the 4-D inputs
        lam_targets = lam.view(-1, 1, 1)     # broadcasts over the 3-D labels / masks

        inputs = (lam_inputs * i1[0]) + ((1 - lam_inputs) * i2[0])
        labels = (lam_targets * i1[1]) + ((1 - lam_targets) * i2[1])
        masks = (lam_targets * i1[2]) + ((1 - lam_targets) * i2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Average of the per-batch losses (every batch counts equally).
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the final epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if this_epoch_valid_loss < lowest_val_loss:
        # New best model: checkpoint it and restart the patience counter.
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6564452075958251 0.5581199526786804
Epoch:  1


0.44025668144226077 0.2852416694164276
Epoch:  2


0.23174314320087433 0.18745485544204712
Epoch:  3


0.1853332805633545 0.16940159499645233
Epoch:  4


0.17614602148532868 0.16585945785045625
Epoch:  5


0.17195101022720338 0.1570933371782303
Epoch:  6


0.1661398822069168 0.1441195160150528
Epoch:  7


0.16198508977890014 0.15155563354492188
Epoch:  8


0.1587488520145416 0.1414591431617737
Epoch:  9


0.1574672454595566 0.13933906853199005
Epoch:  10


0.15709045886993409 0.13532189130783082
Epoch:  11


0.15478389084339142 0.1343149095773697
Epoch:  12


0.15379953384399414 0.13162679672241212
Epoch:  13


0.1531067156791687 0.12960563451051713
Epoch:  14


0.15311632573604583 0.13111613988876342
Epoch:  15


0.15286378920078278 0.128971566259861
Epoch:  16


0.15290819764137267 0.12737710177898406
Epoch:  17


0.15102763772010802 0.23349905908107757
Epoch:  18


0.15259870231151582 0.13391075432300567
Epoch:  19


0.1505396068096161 0.1273389518260956
Epoch:  20


0.15086459934711458 0.12739102691411971
Epoch:  21


0.14957997500896453 0.1316737025976181
Epoch:  22


0.14965664088726044 0.1309351444244385
Epoch:  23


0.14923249661922455 0.13019734174013137
Epoch:  24


0.15013642966747284 0.12825121283531188
Epoch:  25


0.14927144289016725 0.13467285931110382
Epoch    25: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  26


0.14608486175537108 0.12352975457906723
Epoch:  27


0.14576980471611023 0.12393403202295303
Epoch:  28


0.14661798238754273 0.12454126477241516
Epoch:  29


0.14611902236938476 0.12333807200193406
Epoch:  30


0.1444278007745743 0.12303628474473953
Epoch:  31


0.14451866924762727 0.12277914881706238
Epoch:  32


0.14412283658981323 0.12282641530036927
Epoch:  33


0.14479143500328065 0.12259203940629959
Epoch:  34


0.14399354994297028 0.12326778322458268
Epoch:  35


0.14431554555892945 0.12347114086151123
Epoch:  36


0.1443569666147232 0.12283232063055038
Epoch:  37


0.14443445086479187 0.12329246252775192
Epoch:  38


0.14348836362361908 0.12269651293754577
Epoch:  39


0.14353301167488097 0.12281745076179504
Epoch    39: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  40


0.1432807058095932 0.12247338443994522
Epoch:  41


0.14332347691059114 0.12266038805246353
Epoch:  42


0.1439572322368622 0.12242476791143417
Epoch:  43


0.1429147493839264 0.12252413630485534
Epoch:  44


0.14344919800758363 0.12222243845462799
Epoch:  45


0.1438560175895691 0.12253777831792831
Epoch:  46


0.14324066936969757 0.1224992886185646
Epoch:  47


0.1432688421010971 0.12242473810911178
Epoch:  48


0.14433666110038756 0.12244372665882111
Epoch:  49


0.14351682126522064 0.12274406254291534
Epoch:  50


0.1420600777864456 0.12270556837320327
Epoch    50: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  51


0.14424376130104066 0.12233799397945404
Epoch:  52


0.14468765139579773 0.12245388627052307
Epoch:  53


0.14391543507575988 0.1225020632147789
Epoch:  54


0.14427772641181946 0.12254464775323867
Epoch:  55


0.14236481070518495 0.1225854590535164
Epoch:  56


0.14233509302139283 0.12250276505947114
Epoch    56: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  57


0.1443253666162491 0.12283555567264556
Epoch:  58


0.14285583436489105 0.12266493737697601
Epoch:  59


0.1443869948387146 0.1226981520652771
Epoch:  60


0.14434558391571045 0.12260215282440186
Epoch:  61


0.14260613262653352 0.12231182008981704
Epoch:  62


0.14385069131851197 0.12240555435419083
Epoch    62: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  63


0.1437619560956955 0.122392076253891
Epoch:  64


0.1438641679286957 0.12256511002779007
Epoch:  65


0.14394259810447693 0.12232578098773957
Epoch:  66


0.14398300170898437 0.12268430292606354
Epoch:  67


0.14318494856357575 0.12247035354375839
Epoch:  68


0.14326138079166412 0.12247539460659027
Epoch:  69
