In [1]:
# Parameters
# until_x is read in two places further down:
#   * Task5Model.__init__ re-initialises every child of `mv2.features` whose
#     index is <= until_x (the bound is inclusive);
#   * the training loop embeds it in the checkpoint filename
#     './model_system1_until_{until_x}'.
# NOTE(review): the bare "# Parameters" header looks like a papermill-injected
# parameters cell - confirm before editing the value by hand.
until_x = 8


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Turn an annotation table into model inputs, targets and loss masks.

    The rows of ``df`` are numbered per ``audio_filename`` in order of
    appearance (1, 2, ...). The row numbered ``k`` of every file fills slice
    ``k - 1`` of the last axis of ``y`` and ``y_mask``. Every file needs at
    least ``n_annotators`` rows for the slices to line up; rows numbered above
    ``n_annotators`` are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table whose index becomes a column named ``audio_filename`` after
        ``reset_index()``. It must contain every key of ``unknown_to_known`` as
        a column. The caller's frame is not modified.
    unknown_to_known : dict
        Maps an "unknown" column name to the list of label columns that must be
        masked out (mask value 0) on rows where that unknown column is > 0.5.
        The unknown columns themselves are removed from the targets.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Each loaded array is transposed and cut to its first ``n_frames`` rows.
        Arrays must all yield the same shape, otherwise stacking fails.
    n_annotators : int, optional
        Number of per-file annotation rows stacked on the last axis.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_features)``; files are ordered by
        sorted ``audio_filename``.
    y : numpy.ndarray
        Shape ``(n_files, n_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label takes part in the loss, 0 where
        an unknown column suppressed it.
    """
    df = df.reset_index()
    # Running count per file: 1 for its first row, 2 for the second, ...
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns away from the real targets.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the labels an unknown flag hides.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so one `.loc[[k]]` returns slot k of
    # every file, sorted by filename.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotators + 1)
    y = np.stack([df.loc[[k], :].values for k in slots], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in slots], axis=2)

    # Load the features in the same (sorted) file order used for y / y_mask.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    # Insert the singleton channel axis expected by the 2-D conv front end.
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared augmentation object built with RandomErasing's default arguments.
# AudioDataset.__getitem__ looks this name up at module level and calls it on
# every sample that goes through the transform branch.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Indexable dataset of (input, target, weight) triples.

    Without a ``transform`` the stored tensors are returned untouched. With
    one, each input is augmented on the fly: min-max scaled, cyclically
    shifted along axis 1, converted to an image array, passed through
    ``transform``, passed through the module-level ``random_erasing``, and
    finally mapped back to its original value range.
    """

    def __init__(self, X, y, weights, transform=None):
        """Store the tensors and the optional augmentation callable.

        ``transform`` is called as ``transform(image=array)`` and must return
        a mapping with an ``'image'`` entry.
        """
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples, i.e. the size of the first axis of ``X``."""
        n_samples = self.X.shape[0]
        return n_samples

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting if configured."""
        sample = self.X[idx, ...]

        # Fast path: nothing to augment.
        if not self.transform:
            return sample, self.y[idx, ...], self.weights[idx, ...]

        # Scale into [0, 1]; remember the range so it can be undone below.
        lo = sample.min()
        hi = sample.max()
        span = hi - lo
        scaled = (sample - lo) / span

        # Cut the sample at a random position along axis 1 and swap the halves.
        offset = np.random.randint(scaled.shape[1])
        head = scaled[:, offset:, :]
        tail = scaled[:, :offset, :]
        cycled = torch.cat([head, tail], dim=1)

        # The albumentations pipeline works on image arrays.
        image = np.array(self.pil(cycled))
        augmented = self.transform(image=image)['image']
        # Re-add the leading axis and swap the last two axes back.
        augmented = augmented[None, :, :].permute(0, 2, 1)

        erased = random_erasing(augmented.clone().detach())

        # Undo the min-max scaling.
        restored = (erased * span) + lo

        return restored, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Call ``layer.reset_parameters()`` when the layer provides it.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function safe to pass to ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()


class Task5Model(nn.Module):
    """Single-channel front end + MobileNetV2 feature extractor + MLP head.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_until : int, optional
        Children of ``mv2.features`` with index ``<= reset_until`` are
        re-initialised via ``weight_reset``; later children keep the weights
        the backbone was loaded with. When omitted, the notebook-level
        ``until_x`` parameter is used, which is what the constructor always
        did before this argument existed.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        if reset_until is None:
            # Backward-compatible default: fall back to the notebook parameter.
            reset_until = until_x

        # Maps the 1-channel input to the 3 channels the backbone consumes,
        # using two 1x1 convolutions.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        # NOTE(review): newer torchvision releases replace `pretrained=` with
        # `weights=` - confirm against the installed version before changing.
        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Reset every feature block up to and including index `reset_until`.
        for index, block in enumerate(self.mv2.features.children()):
            if index > reset_until:
                break
            block.apply(weight_reset)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

    def forward(self, x):
        """Return one logit per class for a batch of single-channel inputs."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: maximum over the last axis, then over the
        # remaining spatial axis, leaving (batch, channels).
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [4]:
# Load and prepare data
# NOTE: unpickling can execute arbitrary code - only point this at a trusted file.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
# Rows with fine_id == 'X' are the "unknown" entries; the rest are known labels.
unknown_rows = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# Pair each unknown entry with the known entries sharing its coarse value:
# {unknown fine name: [known fine names]}.
paired = pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
paired = paired.drop(columns='coarse')
unknown_to_known = paired.groupby('fine_x')['fine_y'].apply(list).to_dict()

known_labels = known_rows.fine.tolist()

# Put the coarse and fine label tables side by side for each split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose label values add up to 37 is zeroed out, in place, in both splits.
for frame in (train_df, valid_df):
    row_total_is_37 = frame.sum(axis=1) == 37
    frame.loc[row_total_is_37, :] = 0

In [6]:
# Build inputs, targets and loss masks for both splits with prepare_data.
# X gets a singleton channel axis there; y and y_mask carry the three
# per-file annotation rows on their last axis.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are taken from the training split only (over every axis except
# the last, which has 128 entries) and then applied to both splits.
train_flat = train_X.reshape(-1, 128)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X, valid_X = [
    (split - channel_means) / channel_stds
    for split in (train_X, valid_X)]

In [8]:
# Define the data augmentation transformations
# The pipeline is handed only to the training dataset; the validation dataset
# is created with transform=None and therefore sees unaugmented inputs.
# AudioDataset calls it as transform(image=array) and reads the 'image' entry,
# which it then treats as a tensor (it calls .permute on it).
albumentations_transform = Compose([
    # Limits passed here: shift 0.1, scale 0.1, rotate 0.5.
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # Library defaults.
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset gets the augmentation pipeline.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=96, shuffle=False)
# Two independently shuffled views of the same training set; the training
# loop zips them to obtain the two batches it blends for mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when PyTorch can see one; otherwise fall back to the CPU so
# the notebook still runs end-to-end on a machine without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 is the number of output logits; the training loop compares them against
# labels[:, :, k], so it has to equal the number of label columns returned by
# prepare_data - TODO confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped once per epoch with the validation loss in the training loop;
# verbose=True is what prints the "reducing learning rate" lines in the log.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps one loss value per (sample, label) so the training
# loop can multiply by the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
early_stopping_patience = 15  # epochs without a new best validation loss
alpha = 1                     # Beta(alpha, alpha) parameter for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss averaged over all unmasked entries.

    ``outputs`` holds one logit per label. ``labels`` and ``masks`` have one
    extra trailing axis with one slice per annotation row; every slice is
    scored against the same ``outputs``. Element-wise losses are multiplied by
    the mask, summed over all slices and divided by ``masks.sum()``.
    Relies on the module-level ``criterion`` (built with reduction='none').
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training -------------------------------------------------------
    model.train()
    this_epoch_train_loss = 0
    for (x1, y1, m1), (x2, y2, m2) in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches with one
        # Beta-distributed coefficient per sample
        mixup_vals = torch.Tensor(np.random.beta(alpha, alpha, x1.shape[0]))

        lam = mixup_vals.view(-1, 1, 1, 1)
        inputs = (lam * x1) + ((1 - lam) * x2)

        lam = mixup_vals.view(-1, 1, 1)
        labels = (lam * y1) + ((1 - lam) * y2)
        masks = (lam * m1) + ((1 - lam) * m2)

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation -----------------------------------------------------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Both figures are means of per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the final epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the best validation loss so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6534232068061828 0.558876895904541
Epoch:  1


0.43489922285079957 0.2904267251491547
Epoch:  2


0.22648949682712555 0.1895519107580185
Epoch:  3


0.18137162685394287 0.16955569982528687
Epoch:  4


0.17692088067531586 0.16309782564640046
Epoch:  5


0.1738186025619507 0.18009729385375978
Epoch:  6


0.1708570623397827 0.16239748299121856
Epoch:  7


0.1671050602197647 0.15077361166477204
Epoch:  8


0.16567228436470033 0.15755980014801024
Epoch:  9


0.1639898681640625 0.15817436575889587
Epoch:  10


0.1638450437784195 0.15305557548999787
Epoch:  11


0.16135983288288117 0.14885089695453643
Epoch:  12


0.16143644332885743 0.14329812824726104
Epoch:  13


0.1608588707447052 0.1838100254535675
Epoch:  14


0.15965061247348786 0.14047938138246535
Epoch:  15


0.16068529486656188 0.18599194586277007
Epoch:  16


0.15765282213687898 0.13586144894361496
Epoch:  17


0.1584901136159897 0.14334672093391418
Epoch:  18


0.15863910853862762 0.15709432661533357
Epoch:  19


0.15777961075305938 0.1694615215063095
Epoch:  20


0.15853477656841278 0.17896791994571687
Epoch:  21


0.1572092866897583 0.23834748566150665
Epoch:  22


0.1558329927921295 0.13271697759628295
Epoch:  23


0.15736717045307158 0.14194383919239045
Epoch:  24


0.1576363503932953 0.1394012302160263
Epoch:  25


0.15601468026638032 0.13751984089612962
Epoch:  26


0.1544927316904068 0.13653684258461
Epoch:  27


0.15625076949596406 0.1679440915584564
Epoch:  28


0.15490927815437316 0.2649148166179657
Epoch    28: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  29


0.1534012109041214 0.12735316306352615
Epoch:  30


0.1527678167819977 0.12773545533418657
Epoch:  31


0.151774645447731 0.12709202021360397
Epoch:  32


0.15128210365772246 0.1269117370247841
Epoch:  33


0.15250163674354553 0.12822368890047073
Epoch:  34


0.15240304827690124 0.12689151763916015
Epoch:  35


0.1518423342704773 0.12572572231292725
Epoch:  36


0.1525116193294525 0.1261025682091713
Epoch:  37


0.1510206550359726 0.13180155158042908
Epoch:  38


0.15196088552474976 0.12641819268465043
Epoch:  39


0.15054284036159515 0.13051448315382003
Epoch:  40


0.1516183638572693 0.12618274986743927
Epoch:  41


0.1508338749408722 0.12636227756738663
Epoch    41: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  42


0.1514047998189926 0.1253862664103508
Epoch:  43


0.15016536355018617 0.12540520280599593
Epoch:  44


0.15038806498050689 0.12520421147346497
Epoch:  45


0.14991003334522246 0.1254235103726387
Epoch:  46


0.1505860483646393 0.12504118978977202
Epoch:  47


0.15099058270454407 0.1252495214343071
Epoch:  48


0.15076130151748657 0.1252213418483734
Epoch:  49


0.15188967287540436 0.1252353832125664
Epoch:  50


0.14974218666553496 0.12538570463657378
Epoch:  51


0.1512591779232025 0.12499679625034332
Epoch:  52


0.15069855570793153 0.12513919025659562
Epoch:  53


0.1514271938800812 0.12529736906290054
Epoch:  54


0.150477557182312 0.12863394618034363
Epoch:  55


0.1511200910806656 0.12677357643842696
Epoch:  56


0.14969509303569795 0.12486898899078369
Epoch:  57


0.14929808735847472 0.125491926074028
Epoch:  58


0.15242132067680358 0.12558311969041824
Epoch:  59


0.15118664383888245 0.12523186206817627
Epoch:  60


0.15021250426769256 0.12524236291646956
Epoch:  61


0.14997972965240478 0.12497018277645111
Epoch:  62


0.1504289186000824 0.1253214880824089
Epoch    62: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  63


0.1502736657857895 0.1256526604294777
Epoch:  64


0.14997250020503997 0.12558106034994126
Epoch:  65


0.1503986597061157 0.12522020190954208
Epoch:  66


0.15045671045780182 0.12532892674207688
Epoch:  67


0.14928150177001953 0.12548425644636155
Epoch:  68


0.1514043253660202 0.12548518180847168
Epoch    68: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  69


0.1503501558303833 0.1250056281685829
Epoch:  70


0.14969905018806456 0.12512952983379363
Epoch:  71
