In [1]:
# Parameters
# until_x: index of the last child of MobileNetV2's `features` whose weights
# are re-initialised in Task5Model (children 0..until_x are reset). The value
# is also embedded in the checkpoint filename written by the training loop.
until_x = 18


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec', n_frames=635):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table whose index is named ``audio_filename``. Each
        recording is expected to contribute rows for annotation slots 1, 2
        and 3 (rows are numbered per recording in their order of appearance).
        The frame passed in is not modified.
    unknown_to_known : dict
        Maps each "unknown" column of ``df`` to the list of known columns
        that must be masked out whenever that unknown column is > 0.5.
        The unknown columns themselves are dropped from the targets.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_recordings, 1, rows, cols)`` where ``rows`` is at most
        ``n_frames``. All cropped arrays must share one shape.
    y : numpy.ndarray
        Shape ``(n_recordings, n_known_columns, 3)``; the last axis indexes
        annotation slots 1..3.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the entry is masked, 1 elsewhere.
    """
    def _stack_annotation_slots(frame):
        # One (n_recordings, n_columns) slice per slot, stacked on a new last axis.
        return np.concatenate(
            [frame.loc[[slot], :].values[..., np.newaxis] for slot in (1, 2, 3)],
            axis=2)

    # reset_index returns a new frame, so the caller's table is left untouched.
    df = df.reset_index()
    # Number the annotation rows of every recording 1, 2, 3, ... in row order.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from an all-ones mask, then zero the known columns of every row
    # in which the corresponding unknown column is set.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the slot number first and sort, so that selecting a slot yields the
    # recordings in one consistent (sorted) order for X, y and y_mask.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    y = _stack_annotation_slots(df)
    y_mask = _stack_annotation_slots(y_mask)

    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :],
            axis=0)
        for name in filenames])
    # Add the singleton channel axis expected by the 2-D convolutions.
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance built with its default arguments;
# AudioDataset.__getitem__ passes every augmented sample through it.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Indexable collection of (sample, target, weight) triples.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When a
    ``transform`` is supplied, every fetched sample is augmented on the fly;
    otherwise it is returned exactly as stored.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        n_items = self.X.shape[0]
        return n_items

    def _augment(self, sample):
        """Return a randomly augmented copy of ``sample`` on its original scale."""
        # min-max scale so the sample can be handled as an image
        low, high = sample.min(), sample.max()
        span = high - low
        scaled = (sample - low) / span

        # cyclic shift by a random offset along axis 1
        offset = int(np.random.randint(scaled.shape[1]))
        rolled = torch.roll(scaled, shifts=-offset, dims=1)

        # albumentations works on numpy images and hands back a dict
        image = np.array(self.pil(rolled))
        augmented = self.transform(image=image)['image']
        # restore the leading singleton axis and swap the last two axes back
        augmented = augmented[None, :, :].permute(0, 2, 1)

        erased = random_erasing(augmented.clone().detach())

        # undo the min-max scaling
        return (erased * span) + low

    def __getitem__(self, idx):
        sample = self.X[idx, ...]
        if self.transform:
            sample = self._augment(sample)
        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Re-initialise ``layer`` in place when it offers ``reset_parameters``.

    Objects without that attribute are left untouched. Intended to be passed
    to ``nn.Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()


class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    A small 1x1-convolution stem maps the single input channel to three
    channels, the MobileNetV2 ``features`` stack processes the result, the
    feature map is max-pooled over its last two axes and a two-layer head
    produces ``num_classes`` logits.

    The backbone is created with pretrained weights; the children of
    ``features`` at positions ``0..until_x`` (module-level ``until_x``) are
    then re-initialised through ``weight_reset``.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions only
        stem_layers = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*stem_layers)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Re-initialise every feature block up to and including until_x
        for position, block in enumerate(self.mv2.features.children()):
            if position > until_x:
                break
            block.apply(weight_reset)

        head_layers = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head_layers)

    def forward(self, x):
        feature_map = self.mv2.features(self.bw2col(x))
        # global max pooling: reduce the last axis, then the new last axis
        reduced_once, _ = feature_map.max(dim=-1)
        pooled, _ = reduced_once.max(dim=-1)
        return self.final(pooled)

In [4]:
# Load the pickled metadata and derive the label tables used for training.
# NOTE: pickle.load can execute arbitrary code; only point this at a
# metadata file that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
unknown_rows = taxonomy_df.loc[lambda t: t.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy_df.loc[lambda t: t.fine_id != 'X', ['fine', 'coarse']]

# Pair every fine label with id 'X' with the other fine labels that share its
# coarse label; the merge suffixes the two `fine` columns as fine_x / fine_y.
fine_pairs = (
    pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = fine_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_rows.fine.tolist()

# Coarse and fine annotation tables side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point: zero out every annotation row whose
# label values add up to exactly 37
for labels_df in (train_df, valid_df):
    flagged_rows = labels_df.sum(axis=1) == 37
    labels_df.loc[flagged_rows, :] = 0

In [6]:
# Build inputs, targets and loss masks for both splits. Per prepare_data:
#   X      -> (n_recordings, 1, rows, cols), rows cropped to at most 635
#   y      -> (n_recordings, n_known_labels, 3), one slice per annotation slot
#   y_mask -> same shape as y; 0 where a label is masked out, 1 elsewhere
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# One mean / std per position of the last axis, estimated on the training
# set only and reused unchanged for the validation set. The size of that
# axis is read from the data instead of being hard-coded.
# NOTE: this cell rebinds train_X / valid_X, so run it once per kernel.
n_channels = train_X.shape[-1]
train_rows = train_X.reshape(-1, n_channels)
channel_means = train_rows.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_rows.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# The pipeline is called by AudioDataset.__getitem__ as transform(image=...)
# on each training sample rendered as an image array, and its 'image' entry
# is read back: shift/scale/rotate, then grid distortion, then conversion to
# a tensor.
# NOTE(review): the limits are in albumentations' own units - confirm that
# rotate_limit=0.5 is the intended range for the installed version.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# The numpy arrays are wrapped as tensors; only the training set is augmented.
train_tensors = [torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask)]

train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

# Two independently shuffled loaders over the same training set: the training
# loop draws one batch from each and mixes the pair.
val_loader = DataLoader(valid_dataset, batch_size=96, shuffle=False)
train_loader_1 = DataLoader(train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used: the first GPU when CUDA is available,
# otherwise the CPU (so the notebook still runs on a machine without a GPU).
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 is the number of output logits. It has to equal the label dimension
# (axis 1) of train_y / valid_y, because the loss compares `outputs` with
# `labels[:, :, k]` element by element.
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
# - Adam with amsgrad=True over all model parameters, initial lr 1e-3.
# - The scheduler is stepped with the validation loss once per epoch in the
#   training loop; verbose=True makes it print each learning-rate reduction.
# - reduction='none' keeps the per-element losses so the training loop can
#   multiply them by the masks before summing.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
early_stopping_patience = 15  # epochs allowed without a new best validation loss
mixup_alpha = 1               # both parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss averaged over the total mask weight.

    ``labels`` and ``masks`` have shape (batch, n_labels, n_annotation_sets);
    every slice along the last axis is scored against the same ``outputs``
    (batch, n_labels). The per-element losses are weighted by the masks,
    summed, and divided by ``masks.sum()``.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass -------------------------------------------------
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches with one
        # Beta-distributed coefficient per example
        batch_size = i1[0].shape[0]
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_size)

        lam = torch.Tensor(mixup_vals.reshape(batch_size, 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(batch_size, 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.detach().cpu().numpy()

    # ---- validation pass (no augmentation, no gradients) ---------------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.detach().cpu().numpy()

    # Average of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so that the losses of the final
    # epoch are printed as well.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6562768173217773 0.6161645174026489
Epoch:  1


0.4370847237110138 0.2806579232215881
Epoch:  2


0.23093206226825713 0.1892976224422455
Epoch:  3


0.18625712990760804 0.1763742595911026
Epoch:  4


0.17888176560401917 0.16595175564289094
Epoch:  5


0.1768622052669525 0.17395542562007904
Epoch:  6


0.17546755731105804 0.1605554461479187
Epoch:  7


0.1739413034915924 0.15805999040603638
Epoch:  8


0.17329422533512115 0.16180354952812195
Epoch:  9


0.17072989881038667 0.1595110148191452
Epoch:  10


0.17101286232471466 0.16476224064826966
Epoch:  11


0.17028816282749176 0.1573818027973175
Epoch:  12


0.1679885232448578 0.15790770947933197
Epoch:  13


0.1688559252023697 0.15316179096698762
Epoch:  14


0.16768935441970825 0.1597711145877838
Epoch:  15


0.16681672513484955 0.14889006316661835
Epoch:  16


0.16512403070926665 0.1469319760799408
Epoch:  17


0.16445660710334778 0.1466627836227417
Epoch:  18


0.16401737153530122 0.1426139384508133
Epoch:  19


0.16517365872859954 0.14408724308013915
Epoch:  20


0.16302334368228913 0.14252055585384368
Epoch:  21


0.1626340341567993 0.1408834308385849
Epoch:  22


0.1615436589717865 0.1474404215812683
Epoch:  23


0.16026464879512786 0.14291455447673798
Epoch:  24


0.16092090547084809 0.1401343375444412
Epoch:  25


0.16067736864089965 0.13885963559150696
Epoch:  26


0.1601550978422165 0.13746855407953262
Epoch:  27


0.15955016314983367 0.1409740224480629
Epoch:  28


0.15905398905277252 0.136400406062603
Epoch:  29


0.15919257402420045 0.1377938136458397
Epoch:  30


0.156842879652977 0.13536951690912247
Epoch:  31


0.15820756554603577 0.14270743131637573
Epoch:  32


0.15772614419460296 0.1400398224592209
Epoch:  33


0.15731013119220733 0.13691619634628296
Epoch:  34


0.15667508780956269 0.13803136497735977
Epoch:  35


0.15588893592357636 0.13319190889596938
Epoch:  36


0.15561381578445435 0.13646795600652695
Epoch:  37


0.15524866163730622 0.13338766545057296
Epoch:  38


0.15641310691833496 0.1365352302789688
Epoch:  39


0.15662606179714203 0.13905747532844542
Epoch:  40


0.15574761033058165 0.13450322300195694
Epoch:  41


0.15600634336471558 0.13377696126699448
Epoch    41: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  42


0.1554808008670807 0.12916690558195115
Epoch:  43


0.15187424659729004 0.12921957522630692
Epoch:  44


0.15312544941902162 0.12843959778547287
Epoch:  45


0.1535011625289917 0.12849744856357576
Epoch:  46


0.1521363765001297 0.1289546012878418
Epoch:  47


0.1531101667881012 0.12844092398881912
Epoch:  48


0.15315363824367523 0.12864254117012025
Epoch:  49


0.15219644129276275 0.12862122654914857
Epoch:  50


0.15156426250934601 0.12852822691202165
Epoch    50: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  51


0.1531256604194641 0.12813535034656526
Epoch:  52


0.15238796710968017 0.1278412476181984
Epoch:  53


0.15284623324871063 0.12807199656963347
Epoch:  54


0.15233181536197662 0.1280707150697708
Epoch:  55


0.1518877911567688 0.12817275077104567
Epoch:  56


0.1522700482606888 0.1279693976044655
Epoch:  57


0.15199259042739868 0.1277916759252548
Epoch:  58


0.15121935665607453 0.12775323688983917
Epoch:  59


0.15341015994548798 0.12736887037754058
Epoch:  60


0.15195684731006623 0.1276797577738762
Epoch:  61


0.1517563706636429 0.12784067094326018
Epoch:  62


0.151653453707695 0.127472922205925
Epoch:  63


0.15226879000663757 0.127527591586113
Epoch:  64


0.1527747941017151 0.12729474753141404
Epoch:  65


0.15284866631031035 0.12750039249658585
Epoch:  66


0.15274294555187226 0.12757018208503723
Epoch:  67


0.15233131110668183 0.1276438355445862
Epoch:  68


0.1510656952857971 0.12779653668403626
Epoch:  69


0.15218459606170653 0.1277070462703705
Epoch:  70


0.152321937084198 0.1275744378566742
Epoch    70: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  71


0.1512747412919998 0.12772440314292907
Epoch:  72


0.15266298234462738 0.12767735123634338
Epoch:  73


0.15215718865394592 0.12775457203388213
Epoch:  74


0.15093603610992432 0.12755753397941588
Epoch:  75


0.15101199984550476 0.12740440368652345
Epoch:  76


0.15087835967540741 0.1275704711675644
Epoch    76: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  77


0.15110994517803192 0.1276913031935692
Epoch:  78


0.15271204829216004 0.12755495607852935
Epoch:  79
