In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table, expected to be indexed by ``audio_filename`` with one
        row per annotation of a file. Rows of the same file are numbered
        1, 2, ... in their order of appearance and only the first
        ``n_annotators`` of them are used. The caller's frame is not modified.
    unknown_to_known : dict
        Maps each "unknown" label column of ``df`` to the list of known label
        columns it stands for. The unknown columns are removed from the
        targets; wherever an unknown column is > 0.5 the listed known columns
        of that row are masked out.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` feature file per clip.
    n_frames : int, optional
        Each transposed feature array is truncated to at most this many rows.
        No padding is applied, so every file must yield the same shape.
    n_annotators : int, optional
        Number of annotation rows per file stacked on the last axis.

    Returns
    -------
    X : numpy.ndarray
        Features, shape ``(n_files, 1, n_frames, n_features)``.
    y : numpy.ndarray
        Targets, shape ``(n_files, n_known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target takes part in the loss and 0
        where it was masked because of an "unknown" annotation.

    Raises
    ------
    ValueError
        From numpy when the per-annotator slices or the feature arrays do not
        share a common shape (e.g. a file with fewer than ``n_annotators``
        rows).
    """
    df = df.reset_index()
    # Number the rows of every file 1..k so repeated annotations of the same
    # clip can be addressed individually.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" columns off: they only drive the mask.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that an "unknown"
    # annotation makes undecidable.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first and sort, so that the rows selected for
    # each annotation number are ordered by file name and therefore aligned.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_numbers = range(1, n_annotators + 1)
    y = np.concatenate(
        [df.loc[[k], :].values[..., np.newaxis] for k in annotation_numbers],
        axis=2)
    y_mask = np.concatenate(
        [y_mask.loc[[k], :].values[..., np.newaxis] for k in annotation_numbers],
        axis=2)

    # Load the features in the same (sorted) file order as the targets.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    # Add the singleton channel axis expected by the convolutional model.
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance, constructed with its default arguments.
# AudioDataset.__getitem__ applies it as the last augmentation step.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of feature tensors with optional training-time augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Features; the first axis indexes samples. Each sample is presumably
        ``(1, frames, channels)`` -- it is cycled along its second axis and
        converted to a PIL image when augmenting.
    y : torch.Tensor
        Targets, indexed along the first axis like ``X``.
    weights : torch.Tensor
        Per-target weights / masks, indexed along the first axis like ``X``.
    transform : callable, optional
        Augmentation pipeline called as ``transform(image=array)`` and returning
        a mapping with an ``'image'`` tensor. When falsy, samples are returned
        untouched.
    erasing : callable, optional
        Final augmentation applied to the tensor. Defaults to the module-level
        ``random_erasing`` instance, looked up lazily at item access.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for ``idx``, augmenting if enabled."""
        sample = self.X[idx, ...]

        if self.transform:
            # Min-max scale to [0, 1] so the sample can be handled as an image.
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would give 0 / 0 = NaN; use a unit range so
                # it maps to zeros and is restored exactly afterwards.
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # Randomly cycle the sample along its second axis.
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([sample[:, i:, :], sample[:, :i, :]], dim=1)

            # Apply the albumentations pipeline on a numpy image.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # Restore the leading channel axis and swap the last two axes back.
            sample = sample[None, :, :].permute(0, 2, 1)

            # Apply random erasing (injected instance or the module default).
            erase = self.erasing if self.erasing is not None else random_erasing
            sample = erase(sample.clone().detach())

            # Revert the min-max scaling.
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Read the pickled metadata bundle.
# NOTE: pickle executes arbitrary code on load -- only use with trusted files.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the "unknown" fine labels (fine_id == 'X') and the
# known ones.
taxonomy_df = metadata['taxonomy_df']
unknown_rows = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# Pair every unknown fine label with the known fine labels that share its
# coarse class, then collect them as {unknown_label: [known_label, ...]}.
label_pairs = (
    pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = label_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_rows.fine.tolist()

# Coarse and fine annotations side by side; the validation frame is built from
# the '*_test' entries of the metadata.
train_parts = [metadata['coarse_train'], metadata['fine_train']]
valid_parts = [metadata['coarse_test'], metadata['fine_test']]
train_df = pd.concat(train_parts, axis=1, sort=True)
valid_df = pd.concat(valid_parts, axis=1, sort=True)

In [4]:
# Manual correction for one data point: every row whose values sum to exactly
# 37 is zeroed out, in both the training and the validation frame.
for labels_df in (train_df, valid_df):
    is_bad_row = labels_df.sum(axis=1) == 37
    labels_df.loc[is_bad_row, :] = 0

In [5]:
# Build the features (X), targets (y) and loss masks (y_mask) for both splits,
# using the same unknown -> known label mapping.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel-wise standardisation: one mean / std per entry of the last axis
# (128 of them), estimated on the training set only and applied to both splits.
# Note that this cell overwrites train_X / valid_X, so running it twice
# standardises already standardised data.
n_channels = 128
train_flat = train_X.reshape(-1, n_channels)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X, valid_X = (
    (features - channel_means) / channel_stds
    for features in (train_X, valid_X))

In [7]:
# Define the data augmentation transformations.
# The pipeline is handed to the training AudioDataset, which calls it as
# transform(image=...) and reads the 'image' entry of the result.
albumentations_transform = Compose([
    # Random shift / scale / rotation with limits 0.1 / 0.1 / 0.5
    # (NOTE(review): rotate_limit is presumably in degrees -- confirm in the albumentations docs).
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # Grid distortion with the library's default parameters.
    GridDistortion(),
    # Final conversion step; AudioDataset treats its output as a torch tensor.
    ToTensor()
])

In [8]:
# Wrap the numpy arrays as torch tensors and build the datasets: augmentation
# is enabled for training only.
batch_size = 64

train_dataset = AudioDataset(
    *(torch.Tensor(array) for array in (train_X, train_y, train_y_mask)),
    albumentations_transform)
valid_dataset = AudioDataset(
    *(torch.Tensor(array) for array in (valid_X, valid_y, valid_y_mask)),
    None)

# The validation loader keeps a fixed order. The two training loaders shuffle
# the same dataset independently so that the training loop can zip them and
# mix up pairs of batches.
val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [9]:
# Define the device to be used: the first GPU when CUDA is available, the CPU
# otherwise (a hard-coded cuda flag breaks the notebook on GPU-less machines).
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place when it offers ``reset_parameters()``.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function suitable as the callback of ``nn.Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default value of the notebook parameter `until_x`; the "Parameters" cell
# below overrides it. Task5Model reads this global to decide which children of
# mv2.features get their weights reset.
until_x = None

In [12]:
# Parameters
# NOTE(review): presumably injected by a notebook-parameterisation tool (e.g. papermill) -- confirm.
# With this value Task5Model resets children 0..18 (inclusive) of mv2.features.
until_x = 18


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2-based multi-label classifier for single-channel inputs.

    The network consists of
    * ``bw2col``: batch norm plus two 1x1 convolutions mapping the single input
      channel to the 3 channels fed to MobileNetV2,
    * ``mv2.features``: the feature extractor of a pretrained
      ``torchvision`` MobileNetV2,
    * a global max over the two spatial axes,
    * ``final``: a two-layer head producing ``num_classes`` raw scores
      (no sigmoid is applied here).

    The module-level ``until_x`` controls re-initialisation: children of
    ``mv2.features`` with index ``<= until_x`` have their parameters reset,
    the remaining ones keep their pretrained weights. ``until_x = None`` keeps
    every pretrained weight.
    """

    def __init__(self, num_classes):
        """Build the network.

        Parameters
        ----------
        num_classes : int
            Number of output scores per sample.
        """
        super().__init__()

        # Learnable 1 -> 10 -> 3 channel mapping with 1x1 convolutions.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classification head on the 1280 pooled feature channels.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2. Comparing an int with None raises a
        # TypeError, so the unset default is treated as "reset nothing".
        if until_x is not None:
            for i, x in enumerate(self.mv2.features.children()):
                if i <= until_x:
                    x.apply(weight_reset)

    def forward(self, x):
        """Return raw class scores of shape ``(batch, num_classes)``.

        ``x`` needs one channel on axis 1 (``(batch, 1, H, W)``), as required
        by the ``BatchNorm2d(1)`` / ``Conv2d(1, ...)`` front end.
        """
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis, then the new last axis.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 is the number of output scores; the training loop compares them
# element-wise with labels[:, :, k], so it has to equal the size of the label
# axis of the targets.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch and lowers
# the learning rate after `patience` epochs without improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them with the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
# Training configuration and bookkeeping.
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1               # Beta(alpha, alpha) parameter of the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE loss averaged over all annotation sets.

    ``labels`` and ``masks`` carry one slice per annotation set on their last
    axis. The element-wise loss of ``outputs`` against every slice is
    multiplied with the matching mask slice, summed, and divided by the total
    mask weight.
    """
    per_annotation = [
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2])]
    return sum(per_annotation) / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently; zipping them
    # yields random pairs of batches to mix.
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup: one mixing weight per sample, shared by inputs, labels, masks
        mixup_vals = torch.Tensor(
            np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0]))

        lam = mixup_vals.view(-1, 1, 1, 1)
        inputs = (lam * batch_a[0]) + ((1 - lam) * batch_b[0])

        lam = mixup_vals.view(-1, 1, 1)
        labels = (lam * batch_a[1]) + ((1 - lam) * batch_b[1])
        masks = (lam * batch_a[2]) + ((1 - lam) * batch_b[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Log before the early-stopping check so the last epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.632585622168876 0.45391171744891573
Epoch:  1


0.33035989387615305 0.2108511200972966
Epoch:  2


0.19410489016288035 0.18584948778152466
Epoch:  3


0.17952418367604953 0.1848430016211101
Epoch:  4


0.1770321493213241 0.15880420165402548
Epoch:  5


0.17518012870002436 0.1677812933921814
Epoch:  6


0.1734035937367259 0.1611397521836417
Epoch:  7


0.17189805572097366 0.15789334901741572
Epoch:  8


0.171398900650643 0.15854323974677495
Epoch:  9


0.1695495461289947 0.15356094070843287
Epoch:  10


0.16838143000731598 0.1546617852790015
Epoch:  11


0.16737502813339233 0.15612312512738363
Epoch:  12


0.1671583447101954 0.15365496916430338
Epoch:  13


0.16794524643872236 0.15096611636025564
Epoch:  14


0.16854664401428118 0.15127358266285487
Epoch:  15


0.16649768280016408 0.1465184869510787
Epoch:  16


0.16591137969816053 0.14931018544094904
Epoch:  17


0.1648484259038358 0.14762201692376817
Epoch:  18


0.16519814649143735 0.14323575901133673
Epoch:  19


0.1644984685085915 0.14089619261877878
Epoch:  20


0.16269097255693898 0.14357806316443852
Epoch:  21


0.16244921490952774 0.13699557100023543
Epoch:  22


0.16180061810725443 0.1398314322744097
Epoch:  23


0.16101856046431773 0.13969629045043672
Epoch:  24


0.16086622226882624 0.1378829585654395
Epoch:  25


0.16039293521159403 0.1380706142101969
Epoch:  26


0.15970178270662153 0.13873046849455153
Epoch:  27


0.158829656807152 0.1386834510735103
Epoch    27: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  28


0.15739319533915133 0.13216212072542735
Epoch:  29


0.15791860383910103 0.1321216513003622
Epoch:  30


0.15761459357029683 0.13103802182844707
Epoch:  31


0.15737267482925105 0.13077758359057562
Epoch:  32


0.15732388117828885 0.1314619928598404
Epoch:  33


0.15634310366334142 0.13120546404804503
Epoch:  34


0.15605439004060384 0.13120233161108835
Epoch:  35


0.1561409951867284 0.13091611223561422
Epoch:  36


0.15611561407914032 0.13066055199929646
Epoch:  37


0.15684816603725021 0.13098886821951186
Epoch:  38


0.1562090487899007 0.1315262530531202
Epoch:  39


0.15683341187399788 0.13114000963313238
Epoch:  40


0.15686830113062988 0.13068441514457976
Epoch:  41


0.15655655192362294 0.13047109650714056
Epoch:  42


0.15528416673879367 0.12999373355082103
Epoch:  43


0.15544145816081278 0.13156536434377944
Epoch:  44


0.15517401654978055 0.129965677857399
Epoch:  45


0.15558614803327098 0.12996047735214233
Epoch:  46


0.15632730480786916 0.13002394778387888
Epoch:  47


0.1555435065482114 0.1309081284063203
Epoch:  48


0.15513200655176834 0.13115547065223968
Epoch:  49


0.15355345324890032 0.1305584620152201
Epoch:  50


0.1544643680791597 0.1301186265689986
Epoch    50: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  51


0.15479187667369843 0.12960715698344366
Epoch:  52


0.15506101419796814 0.13018997332879476
Epoch:  53


0.1537585427632203 0.13001598417758942
Epoch:  54


0.15455682535429258 0.13022108376026154
Epoch:  55


0.15436239862764203 0.1303162021296365
Epoch:  56


0.15490547950203354 0.12953066932303564
Epoch:  57


0.15383449357909126 0.13008012090410506
Epoch:  58


0.15438571773670814 0.1301200624023165
Epoch:  59


0.15471592062228434 0.12959712211574828
Epoch:  60


0.1548916290740709 0.13019760165895736
Epoch:  61


0.15464328591888016 0.12948663106986455
Epoch:  62


0.15396876713714083 0.12983221347842896
Epoch:  63


0.15420368998437314 0.12967144485030854
Epoch:  64


0.15495593523657 0.1303551963397435
Epoch:  65


0.15301462323278994 0.1296389326453209
Epoch:  66


0.15484795538154808 0.129263379744121
Epoch:  67


0.15455931304274378 0.13011909701994487
Epoch:  68


0.15403657950259544 0.13039445664201463
Epoch:  69


0.15417671767441002 0.12980906878198897
Epoch:  70


0.15507879007507014 0.1298450572150094
Epoch:  71


0.1549621200239336 0.12983148757900512
Epoch:  72


0.15508014124792976 0.1294534376689366
Epoch    72: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  73


0.15364260568812088 0.12977808926786696
Epoch:  74


0.15449203107808088 0.13018336466380528
Epoch:  75


0.1551916945624996 0.1294666220034872
Epoch:  76


0.15382766924999855 0.13007476393665587
Epoch:  77


0.15324963145964854 0.13009149368320191
Epoch:  78


0.15418148362958753 0.12930102114166533
Epoch    78: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  79


0.15425442723003593 0.13009876970733916
Epoch:  80


0.15433690555997798 0.12966257759502955
Epoch:  81


0.15475831080127406 0.12979845383337565
Epoch:  82


0.1546922817423537 0.1302397165979658
Epoch:  83


0.1544920685323509 0.12957357721669333
Epoch:  84


0.15361774974578135 0.12995504587888718
Epoch    84: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  85


0.1535872277375814 0.12959341066224234
Epoch:  86


0.15489809537256086 0.12969063435282027
Epoch:  87


0.153163670285328 0.12974381659712111
Epoch:  88


0.15498097603385513 0.1296979593379157
Epoch:  89


0.15365609284993764 0.1302158609032631
Epoch:  90


0.15517933666706085 0.1297507562807628
Epoch:  91
