In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 num_frames=635, num_annotators=3):
    """Build model inputs, per-annotator targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table whose index is named ``audio_filename`` (the index is
        reset and the rows are grouped by that column). The k-th row of a file,
        in table order, is treated as its k-th annotation. The caller's frame
        is not modified.
    unknown_to_known : dict
        Maps the name of each "unknown" column to the list of "known" columns
        it invalidates. The unknown columns are removed from the targets.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    num_frames : int, optional
        Number of leading rows kept from each transposed array.
    num_annotators : int, optional
        Number of annotations per file stacked along the last axis of ``y``
        and ``y_mask``. Extra annotations beyond this count are ignored.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, frames, bins)``: the transposed, cropped arrays
        with a singleton axis inserted at position 1.
    y : numpy.ndarray
        Shape ``(n_files, n_known_columns, num_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the matching unknown column of that
        annotation exceeds 0.5, 1 elsewhere.

    Raises
    ------
    ValueError
        If any file has fewer than ``num_annotators`` annotation rows.
    """
    df = df.reset_index()

    # Fail early with a readable message; otherwise the per-annotator slices
    # built below have different lengths and stacking them fails obscurely.
    annotations_per_file = df.groupby('audio_filename').size()
    if (annotations_per_file < num_annotators).any():
        raise ValueError(
            'every audio file needs at least {} annotation rows'.format(num_annotators))

    # Number the annotations of each file 1, 2, 3, ... in table order.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from an all-ones mask and zero the known columns of every
    # annotation whose corresponding unknown column is switched on.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that .loc[[k]] selects the k-th
    # annotation of every file, ordered by file name.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_numbers = range(1, num_annotators + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_numbers], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in annotation_numbers], axis=2)

    # File order follows the sorted index, i.e. the row order of y and y_mask.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:num_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing augmenter, built with its default arguments;
# AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (input, target, loss-weight) triples with optional augmentation.

    ``X``, ``y`` and ``weights`` are all indexed along their first axis with
    the same integer. When ``transform`` is falsy the stored sample is
    returned as is; otherwise it goes through the chain in ``_augment``.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        # Converter used to hand samples to the image-based transform.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx, ...]
        if self.transform:
            sample = self._augment(sample)
        return sample, self.y[idx, ...], self.weights[idx, ...]

    def _augment(self, sample):
        """Return an augmented copy of ``sample`` on its original value scale."""
        # assumes `sample` is a torch tensor laid out as (1, time, bins);
        # torch.cat and ToPILImage are applied to it directly -- TODO confirm

        # min-max transformation
        lowest = sample.min()
        highest = sample.max()
        span = highest - lowest
        scaled = (sample - lowest) / span

        # randomly cycle the file: rotate along axis 1 by a random offset
        offset = np.random.randint(scaled.shape[1])
        head = scaled[:, offset:, :]
        tail = scaled[:, :offset, :]
        cycled = torch.cat([head, tail], dim=1)

        # apply albumentations transforms (they operate on a numpy image)
        image = np.array(self.pil(cycled))
        augmented = self.transform(image=image)['image']
        # restore the leading singleton axis and swap the last two axes back
        augmented = augmented[None, :, :].permute(0, 2, 1)

        # apply random erasing
        erased = random_erasing(augmented.clone().detach())

        # revert min-max transformation
        return (erased * span) + lowest

In [3]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code -- only open metadata files you trust.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# Split the taxonomy into the "unknown" placeholder entries (fine_id == 'X')
# and the regular, known fine labels.
taxonomy_df = metadata['taxonomy_df']
unknown_rows = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# For every unknown label, collect the known fine labels that share its
# coarse category: {unknown fine label: [known fine labels]}.
unknown_to_known = (
    unknown_rows
    .merge(known_rows, on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_rows.fine.tolist()

# Put the coarse and fine annotation columns side by side for each split.
train_df = pd.concat(
    [metadata[key] for key in ('coarse_train', 'fine_train')], axis=1, sort=True)
valid_df = pd.concat(
    [metadata[key] for key in ('coarse_test', 'fine_test')], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Any row whose columns add up to 37 is overwritten with zeros, in both splits.
# NOTE: this mutates train_df / valid_df in place.
train_rows_to_clear = train_df.sum(axis=1) == 37
train_df.loc[train_rows_to_clear, :] = 0

valid_rows_to_clear = valid_df.sum(axis=1) == 37
valid_df.loc[valid_rows_to_clear, :] = 0

In [5]:
# Turn each annotation table into (inputs, targets, loss masks) arrays.
train_X, train_y, train_y_mask = prepare_data(
    df=train_df, unknown_to_known=unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(
    df=valid_df, unknown_to_known=unknown_to_known)

In [6]:
# Channel wise normalization
# One mean/std per entry of the last axis, computed over the training set only
# and then applied to both splits. The size of that axis is read from the data
# instead of being hard-coded.
# NOTE: train_X / valid_X are overwritten, so running this cell twice
# normalises the data twice.
n_channels = train_X.shape[-1]
train_flat = train_X.reshape(-1, n_channels)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Small random shift / scale / rotation, followed by a grid distortion, and
# finally conversion of the numpy image to a torch tensor.
shift_scale_rotate = ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5)
grid_distortion = GridDistortion()
to_tensor = ToTensor()

albumentations_transform = Compose([shift_scale_rotate, grid_distortion, to_tensor])

In [8]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served untouched.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the same training set; the training
# loop zips them to obtain the pairs of batches that mixup blends.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is actually usable and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` via its ``reset_parameters`` method, if it has one.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function suitable for ``nn.Module.apply``.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        return
    reset()

In [11]:
# Default for the notebook parameter `until_x` (read by Task5Model.__init__);
# the "Parameters" cell that follows overrides it.
until_x = None

In [12]:
# Parameters
# Task5Model re-initialises the children of mv2.features whose index is <= until_x.
until_x = 3


In [13]:
class Task5Model(nn.Module):
    """Single-channel input classifier built on MobileNetV2 features.

    ``forward`` applies, in order: a 1x1-convolution stem that maps the
    1-channel input to 3 channels, the pretrained ``mobilenet_v2`` feature
    extractor, a max over the last two axes of the feature map, and a small
    fully connected head that outputs ``num_classes`` raw scores.

    The module-level ``until_x`` selects which pretrained layers are
    re-initialised at construction time: every child of ``mv2.features`` with
    index ``<= until_x`` has its parameters reset. ``until_x = None`` keeps
    all pretrained weights (previously this raised ``TypeError`` on the
    ``i <= None`` comparison).
    """

    def __init__(self, num_classes):

        super().__init__()

        # BatchNorm followed by two 1x1 convolutions: 1 -> 10 -> 3 channels.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # 1280 input features: presumably the channel count produced by
        # mv2.features at the default width -- verify if the backbone changes.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2
        reset_until = until_x
        if reset_until is not None:
            for i, x in enumerate(self.mv2.features.children()):
                if i <= reset_until:
                    x.apply(weight_reset)

    def forward(self, x):
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis, then the (new) last axis.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output scores; the model is moved to the selected device before training.
model = Task5Model(num_classes=31)
model = model.to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Element-wise loss (reduction='none') so the training loop can weight each
# entry with its mask before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

# AMSGrad variant of Adam over every model parameter.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)

# Lowers the learning rate once the monitored value (the validation loss in
# the training loop) has not improved for more than `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)

In [16]:
epochs = 100
early_stopping_patience = 25  # stop after this many epochs without a new best validation loss
mixup_alpha = 1               # both parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked loss of one batch, shared by the training and validation loops.

    ``outputs`` is scored against every slice ``labels[:, :, k]`` with the
    element-wise ``criterion``, each result is weighted by ``masks[:, :, k]``,
    and the grand total is divided by ``masks.sum()``.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0.0
    for i1, i2 in zip(train_loader_1, train_loader_2):
        inputs_1, labels_1, masks_1 = i1
        inputs_2, labels_2, masks_2 = i2

        # mixup: blend two independently shuffled batches with one random
        # coefficient per sample; labels and masks use the same coefficient.
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, inputs_1.shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.view(-1, 1, 1, 1)
        lam_targets = lam.view(-1, 1, 1)

        inputs = (lam_inputs * inputs_1) + ((1 - lam_inputs) * inputs_2)
        labels = (lam_targets * labels_1) + ((1 - lam_targets) * labels_2)
        masks = (lam_targets * masks_1) + ((1 - lam_targets) * masks_2)

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        # .item() yields a plain Python float, so the running sum has the same
        # precision regardless of the installed NumPy version.
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Per-epoch loss = mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Printed before the early-stopping check so the final epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint the weights whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6331690968693914 0.5323441880089896
Epoch:  1


0.33448610515207855 0.2073485553264618
Epoch:  2


0.1877749115228653 0.16523653907435282
Epoch:  3


0.17398841679096222 0.15211742903505052
Epoch:  4


0.17006518993828748 0.16510988559041703
Epoch:  5


0.16701521382138534 0.14686200767755508
Epoch:  6


0.16573731883152112 0.14092401415109634
Epoch:  7


0.16415021266486193 0.1417047796504838
Epoch:  8


0.16212771148295016 0.14090978779963084
Epoch:  9


0.16102756519575376 0.1450000639472689
Epoch:  10


0.15955602075602557 0.1348505967429706
Epoch:  11


0.15887663533558716 0.136941773550851
Epoch:  12


0.15878644303695574 0.15284630869116103
Epoch:  13


0.15837258462970322 0.13255693124873297
Epoch:  14


0.15698915639439145 0.13424716464110784
Epoch:  15


0.15569363413630305 0.13535865928445542
Epoch:  16


0.15682493271054448 0.13156310256038392
Epoch:  17


0.15637181981189832 0.13651444230760848
Epoch:  18


0.15603889726303719 0.13912472235304968
Epoch:  19


0.1555971160933778 0.13021509562219893
Epoch:  20


0.15466002556117805 0.13079773847545897
Epoch:  21


0.15393376592043284 0.13293121755123138
Epoch:  22


0.15327532146428083 0.13099785255534308
Epoch:  23


0.15386972435422847 0.1283903962799481
Epoch:  24


0.1532518054182465 0.1285746842622757
Epoch:  25


0.15248067838114662 0.12853773470435823
Epoch:  26


0.15187934927038244 0.12887039354869298
Epoch:  27


0.15236441348050092 0.12699180096387863
Epoch:  28


0.15208853740949888 0.12977820209094457
Epoch:  29


0.1519483641998188 0.1284776127764157
Epoch:  30


0.15242699110830152 0.1354343369603157
Epoch:  31


0.15014330962219755 0.1309695371559688
Epoch:  32


0.1512639409786946 0.13305597539458955
Epoch:  33


0.1510428697676272 0.12707104427473886
Epoch    33: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  34


0.15069183707237244 0.1241155628647123
Epoch:  35


0.14805920260983543 0.12368409761360713
Epoch:  36


0.1469782965408789 0.12354374038321632
Epoch:  37


0.147595950075098 0.12326181360653468
Epoch:  38


0.14782123146830378 0.12330812960863113
Epoch:  39


0.14735773286303958 0.12285877232040678
Epoch:  40


0.14672608593025724 0.12318952062300273
Epoch:  41


0.1449894168086954 0.12292720058134624
Epoch:  42


0.14586371143121976 0.12293166454349246
Epoch:  43


0.14622171746717916 0.12289440738303321
Epoch:  44


0.14581083204295184 0.12301447774682726
Epoch:  45


0.1465946568830593 0.12405915558338165
Epoch    45: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  46


0.14505931130937627 0.12370840247188296
Epoch:  47


0.14390160143375397 0.12302137698446002
Epoch:  48


0.1456789185066481 0.12298590902771268
Epoch:  49


0.14492802362184268 0.12273194321564265
Epoch:  50


0.14510939612581925 0.12276794229234968
Epoch:  51


0.1453679691295366 0.12277587077447347
Epoch:  52


0.14485315897980253 0.1228124669619969
Epoch:  53


0.14509410511802984 0.12242893342460905
Epoch:  54


0.14550895425113472 0.12286877632141113
Epoch:  55


0.14532143199766004 0.12289424985647202
Epoch:  56


0.14581383523103353 0.12273461584533964
Epoch:  57


0.14476659007974574 0.12260303007704872
Epoch:  58


0.14577610831002932 0.12258819703544889
Epoch:  59


0.14478045057606054 0.12261090853384563
Epoch    59: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  60


0.14558374801197568 0.1225727189864431
Epoch:  61


0.14593026766905914 0.12237994905029025
Epoch:  62


0.14526284949199572 0.12259942186730248
Epoch:  63


0.14289322294093468 0.12262589378016335
Epoch:  64


0.1448157484064231 0.12255162958587919
Epoch:  65


0.14578204904053663 0.1225545225398881
Epoch:  66


0.14414112149058161 0.12251076421567372
Epoch:  67


0.1442990850757908 0.12252921611070633
Epoch    67: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  68


0.1444746159218453 0.12263648424829755
Epoch:  69


0.14500684351534457 0.12237616309097835
Epoch:  70


0.1455504648588799 0.12266059645584651
Epoch:  71


0.14472482091671712 0.12264439570052284
Epoch:  72


0.14525625270766182 0.12246258237532207
Epoch:  73


0.14411189789707596 0.12272225533212934
Epoch    73: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  74


0.14583929406630025 0.12235215732029506
Epoch:  75


0.1450950652360916 0.12257372800792966
Epoch:  76


0.1442752013335357 0.12233910070998329
Epoch:  77


0.14481062301107356 0.12240419643265861
Epoch:  78


0.14562204117710525 0.1225244722196034
Epoch:  79


0.1455016655696405 0.122430862060615
Epoch:  80


0.14552209063156232 0.12244933098554611
Epoch:  81


0.1457163184075742 0.12234306761196681
Epoch:  82


0.14502816989615158 0.12255184458834785
Epoch:  83


0.14490355551242828 0.12285450207335609
Epoch:  84


0.1446431315428502 0.12234692914145333
Epoch:  85


0.14472904801368713 0.12245513924530574
Epoch:  86


0.14455831332786664 0.12244356104305812
Epoch:  87


0.14527183972500465 0.1226416751742363
Epoch:  88


0.14448132667992566 0.12235076193298612
Epoch:  89


0.14487851672881358 0.12256736201899392
Epoch:  90


0.1455795990454184 0.12263742195708412
Epoch:  91


0.1455383091359525 0.12239853718451091
Epoch:  92


0.14613004834265322 0.12221345518316541
Epoch:  93


0.14555995730129448 0.12278038795505251
Epoch:  94


0.144111711431194 0.12230596159185682
Epoch:  95


0.14600337639048294 0.12249324470758438
Epoch:  96


0.1446061363896808 0.12235727586916514
Epoch:  97


0.14452511837353577 0.1223718768783978
Epoch:  98


0.1441697522595122 0.1225477893437658
Epoch:  99


0.14543923856438817 0.12259917386940547
