In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known,
                 path_template='../../data/logmelspec/{}.npy',
                 n_frames=635, n_annotators=3):
    """Turn an annotation table into model inputs, targets and loss masks.

    Every ``audio_filename`` is expected to appear ``n_annotators`` times in
    ``df`` (one row per set of annotations). Rows of the same file are numbered
    1..k in order of appearance (``slno``) and the k-th rows of all files are
    stacked along the last axis of the returned target / mask arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table. After ``reset_index()`` it must expose an
        ``audio_filename`` column; all remaining columns are label columns.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known label
        columns it makes ambiguous. The unknown columns are removed from the
        targets; wherever an unknown column is > 0.5 the mask of the associated
        known columns is set to 0 for that row.
    path_template : str, optional
        ``str.format`` template that receives the audio filename and yields the
        path of the stored spectrogram. Defaults to the location that used to
        be hard-coded.
    n_frames : int, optional
        Maximum number of rows kept from each transposed spectrogram
        (default 635).
    n_annotators : int, optional
        Number of annotation sets per file that are stacked (default 3).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, frames, bins)``: the transposed, cropped
        spectrograms with a singleton channel axis. All files must yield the
        same shape after cropping.
        NOTE(review): assumes the stored arrays are laid out (bins, frames) - confirm.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target is usable, 0 where it is masked.

    Files are ordered by sorted ``audio_filename`` in all three outputs.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the rows of every file 1, 2, 3, ... in order of appearance.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns from the actual targets.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and switch off the known labels that an
    # active "unknown" label makes unreliable.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put slno first so that each annotation set is one contiguous, identically
    # ordered (sorted by filename) slice.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slnos = range(1, n_annotators + 1)
    y = np.stack([df.loc[[s], :].values for s in slnos], axis=2)
    y_mask = np.stack([y_mask.loc[[s], :].values for s in slnos], axis=2)

    # One spectrogram per file, in the same order as the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack(
        [np.load(path_template.format(name)).T[:n_frames, :] for name in filenames],
        axis=0)
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance built with its default arguments.
# AudioDataset.__getitem__ looks this name up and applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with optional on-the-fly augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis; each item is treated as a
        ``(1, frames, bins)`` tensor by the augmentation path.
    y : torch.Tensor
        Targets, indexed along the first axis in step with ``X``.
    weights : torch.Tensor
        Per-target weights / masks, indexed along the first axis in step with ``X``.
    transform : callable or None
        Augmentation pipeline called as ``transform(image=array)`` and expected
        to return a mapping with an ``'image'`` tensor. When falsy, samples are
        returned unmodified.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Tensor -> PIL converter used to hand samples to the image augmentations.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for ``idx``, augmenting if configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole sample into NaN. Use a unit range instead so the sample
            # normalises to zeros and is restored to its constant value below.
            if float(value_range) == 0.0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            # NOTE(review): the PIL round-trip presumably quantises the values - confirm.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # Re-add the leading channel axis and swap the last two axes.
            # NOTE(review): presumably undoes an axis swap made by the tensor
            # conversion in the transform - confirm.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level `random_erasing` instance)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code; only open metadata files you trust.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# Split the taxonomy into the "unknown" fine labels (fine_id == 'X') and the
# regular ones, keeping only the columns needed for the join.
taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# For every unknown fine label, collect the known fine labels that share its
# coarse class: {unknown fine label: [known fine labels]}.
unknown_to_known = (
    unknown_rows
    .merge(known_rows, on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict()
)
known_labels = known_rows['fine'].tolist()

# Coarse and fine annotations side by side, one frame per split.
train_df = pd.concat(
    [metadata[key] for key in ('coarse_train', 'fine_train')], axis=1, sort=True)
valid_df = pd.concat(
    [metadata[key] for key in ('coarse_test', 'fine_test')], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Rows whose label values add up to exactly 37 are wiped to all zeros, in both splits.
# NOTE(review): 37 presumably means "every label column is set" - confirm.
for labels_df in (train_df, valid_df):
    sums_to_37 = labels_df.sum(axis=1) == 37
    labels_df.loc[sums_to_37, :] = 0

In [5]:
# Build the inputs (X), the stacked annotation targets (y) and the matching loss
# masks (y_mask) for the training and the validation split.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics are computed per entry of the last axis, over every other axis of the
# TRAINING data only, and then applied to both splits.
n_channels = train_X.shape[-1]  # derived from the data instead of a hard-coded 128
flat_train = train_X.reshape(-1, n_channels)
channel_means = flat_train.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = flat_train.std(axis=0).reshape(1, 1, 1, -1)
# A channel with zero variance would otherwise produce inf/NaN after the division.
channel_stds[channel_stds == 0] = 1
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Applied in order by AudioDataset.__getitem__ to the image form of each training sample.
augmentation_steps = [
    # small random shift / rescale / rotation
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library defaults
    GridDistortion(),
    # conversion back to a tensor for the rest of the pipeline
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that are blended by mixup.
train_loader_1, train_loader_2 = (
    DataLoader(train_dataset, batch_size=64, shuffle=True) for _ in range(2))

In [9]:
# Define the device to be used
# Use the first GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it exposes ``reset_parameters``.

    Objects without that attribute are left untouched, which makes the function
    suitable for passing to ``Module.apply``.
    """
    try:
        reinitialise = layer.reset_parameters
    except AttributeError:
        return
    reinitialise()

In [11]:
# Default value of the notebook parameter `until_x`: Task5Model re-initialises the
# MobileNetV2 feature blocks whose index is <= until_x. Reassigned in the next cell.
until_x = None

In [12]:
# Parameters
# Overrides the `until_x = None` default assigned in the previous cell.
# NOTE(review): looks like a cell injected by a parameterisation tool (e.g. papermill) - confirm.
until_x = 15


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel inputs.

    Pipeline: a learned 1 -> 3 channel adapter (``bw2col``), the feature
    extractor of a pretrained MobileNetV2 (``mv2.features``), a global max over
    the two trailing axes, and a two-layer classification head (``final``).

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_until : int or None, optional
        Index of the last child of ``mv2.features`` whose parameters are
        re-initialised (children ``0..reset_until`` are reset through
        ``weight_reset``). When omitted, the module-level ``until_x`` is used
        if it exists, which keeps ``Task5Model(n)`` behaving as before. If the
        resolved value is ``None`` no layer is reset (previously this case
        raised ``TypeError`` on the ``i <= None`` comparison).
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # Single-channel input -> 3 channels expected by MobileNetV2, via 1x1 convolutions.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Backward compatibility: fall back to the notebook-level parameter.
        if reset_until is None:
            reset_until = globals().get('until_x')

        # Reset until ith layer of mv2
        if reset_until is not None:
            for i, block in enumerate(self.mv2.features.children()):
                if i > reset_until:
                    break
                block.apply(weight_reset)

    def forward(self, x):
        """Return the class logits for a batch of single-channel inputs."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling over the two trailing (spatial) axes.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 is passed as num_classes (the width of the final linear layer); the model is
# then moved to the device selected earlier.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# reduction='none' keeps the element-wise losses so the training loop can weight
# them with the label masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

optimizer = optim.Adam(
    model.parameters(),
    lr=1e-3,
    amsgrad=True,
)

# Driven by the validation loss passed to scheduler.step() once per epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=5,
    verbose=True,
)

In [16]:
epochs = 100
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

mixup_alpha = 1               # both parameters of the Beta distribution the mixup weights are drawn from
early_stopping_patience = 25  # stop after this many epochs without a new best validation loss
checkpoint_path = './model_system1_until_{}'.format(until_x)


def masked_bce_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over every set of annotations.

    ``outputs`` is ``(batch, classes)``; ``labels`` and ``masks`` are
    ``(batch, classes, annotation_sets)``. The element-wise ``criterion`` is
    evaluated against each annotation set, weighted by the matching mask slice,
    summed, and divided by the total mask weight. If ``masks`` sums to zero the
    result is NaN/inf, exactly as a plain division would give.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---------------- training pass ----------------
    model.train()
    this_epoch_train_loss = 0.0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One blending weight per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * batch_a[0]) + ((1 - lam) * batch_b[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * batch_a[1]) + ((1 - lam) * batch_b[1])
        masks = (lam * batch_a[2]) + ((1 - lam) * batch_b[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            # calculate loss for each set of annotations
            loss = masked_bce_loss(criterion, outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---------------- validation pass ----------------
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses (every batch counts equally, whatever its size).
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before any early exit so the last epoch is logged as well.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the weights with the best validation loss seen so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), checkpoint_path)
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6341804894241126 0.5660088402884347
Epoch:  1


0.3364431036485208 0.20921022977147782
Epoch:  2


0.1917281066243713 0.17191278082983835
Epoch:  3


0.1795745036891989 0.174101716705731
Epoch:  4


0.17509748686004328 0.2961834136928831
Epoch:  5


0.173682367479479 0.1661783286503383
Epoch:  6


0.1713370680809021 0.1682840564421245
Epoch:  7


0.17129086078824224 0.16150599718093872
Epoch:  8


0.17021427887517052 0.15495645148413523
Epoch:  9


0.16752749802293004 0.1582137303692954
Epoch:  10


0.16721222005985878 0.15554177335330419
Epoch:  11


0.16485139566498833 0.1418816979442324
Epoch:  12


0.16462070555300326 0.14964617363044194
Epoch:  13


0.16570451654292442 0.14343706305537904
Epoch:  14


0.1630320416108982 0.14353411006075995
Epoch:  15


0.16310772138672905 0.14113225042819977
Epoch:  16


0.1642520995558919 0.14376277582986013
Epoch:  17


0.16170354509675824 0.14663248083421163
Epoch:  18


0.16161923513219162 0.1433672628232411
Epoch:  19


0.16146996456223564 0.1384224934237344
Epoch:  20


0.16033995997261358 0.13916141539812088
Epoch:  21


0.1603207201571078 0.13764898159674235
Epoch:  22


0.15776623342488263 0.13487563069377626
Epoch:  23


0.1590495065257356 0.1383702318583216
Epoch:  24


0.1589012166132798 0.1344075852206775
Epoch:  25


0.15963811004484021 0.1422201884644372
Epoch:  26


0.15948701065939827 0.13368839557681764
Epoch:  27


0.1582494905671558 0.1329973361321858
Epoch:  28


0.15774403270837423 0.13511243143251964
Epoch:  29


0.15641634649521596 0.1393110230565071
Epoch:  30


0.15754328950031385 0.133053081376212
Epoch:  31


0.15726416457343745 0.13534193698848998
Epoch:  32


0.15673891475071777 0.1312250175646373
Epoch:  33


0.15562927642384092 0.1335924384849412
Epoch:  34


0.1554073953145259 0.13156645745038986
Epoch:  35


0.15501836345002457 0.1308067228112902
Epoch:  36


0.1542123616547198 0.13109352865389415
Epoch:  37


0.15554265677928925 0.13435996643134526
Epoch:  38


0.1556245402709858 0.13224886038473674
Epoch:  39


0.16277280610960884 0.1473158108336585
Epoch:  40


0.16173298455573418 0.14032358463321412
Epoch:  41


0.15895235699576302 0.1325169141803469
Epoch    41: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  42


0.15637224026628443 0.13107767275401525
Epoch:  43


0.15524075200428833 0.1299196258187294
Epoch:  44


0.1555093522812869 0.13118066851581847
Epoch:  45


0.15740293344935855 0.12960629165172577
Epoch:  46


0.15560305964302373 0.13158004411629268
Epoch:  47


0.15535806199988803 0.1291270585996764
Epoch:  48


0.15453862459272952 0.12916928316865647
Epoch:  49


0.15493589921577558 0.1291650597538267
Epoch:  50


0.15376217784108343 0.12863744582448686
Epoch:  51


0.15420909870315241 0.12864622260843003
Epoch:  52


0.15443278043656736 0.12844919945512498
Epoch:  53


0.1543494519349691 0.12830599290984018
Epoch:  54


0.15593259680915522 0.1283333514417921
Epoch:  55


0.15453205559704755 0.1278309268610818
Epoch:  56


0.1537879890686757 0.12835331154721125
Epoch:  57


0.15410587554042404 0.12785538818155015
Epoch:  58


0.15415189515899969 0.12826427285160338
Epoch:  59


0.15414280383973508 0.12744533802781785
Epoch:  60


0.1547394531804162 0.12749622868640081
Epoch:  61


0.15323100379995397 0.12819132102387293
Epoch:  62


0.1533598738747674 0.12775251801524842
Epoch:  63


0.1522014809621347 0.1277389686022486
Epoch:  64


0.15351428252619667 0.12779823584215982
Epoch:  65


0.15249663752478523 0.12794341892004013
Epoch    65: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  66


0.15379290846554008 0.12767944378512247
Epoch:  67


0.15206014868375417 0.12732011931283133
Epoch:  68


0.1535115403098029 0.12752280490739004
Epoch:  69


0.15250735105694951 0.12755558852638518
Epoch:  70


0.15259738306741458 0.12738959278379167
Epoch:  71


0.15340180936697367 0.12768734672239848
Epoch:  72


0.1535677414488148 0.12756529982600892
Epoch:  73


0.15334480718986407 0.1275104358792305
Epoch    73: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  74


0.1532747540119532 0.12722952238151006
Epoch:  75


0.15218293827933235 0.12765812022345407
Epoch:  76


0.1532136157557771 0.12723474098103388
Epoch:  77


0.1534015258421769 0.12742023915052414
Epoch:  78


0.15355857119366928 0.12751934358051845
Epoch:  79


0.15287293654841347 0.12745886189596994
Epoch:  80


0.15321061417863174 0.1275654435157776
Epoch    80: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  81


0.1535214173632699 0.12762939291340963
Epoch:  82


0.15244819103060542 0.12746254674025945
Epoch:  83


0.15215926959707932 0.12757041518177306
Epoch:  84


0.15298269205802195 0.12744182348251343
Epoch:  85


0.15264998940197197 0.12771752583129065
Epoch:  86


0.15098557399736867 0.12749866396188736
Epoch    86: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  87


0.15212905970779625 0.12716413395745413
Epoch:  88


0.15328405394747452 0.12771367920296534
Epoch:  89


0.15289717147479187 0.12740978917905263
Epoch:  90


0.15243481341246012 0.12742795369454793
Epoch:  91


0.1515619420521968 0.12725695222616196
Epoch:  92


0.15206006733146873 0.1272475357566561
Epoch:  93


0.15296283605936412 0.12731478682586125
Epoch:  94


0.15281126265590256 0.12756412582738058
Epoch:  95


0.15275810618658323 0.12749409249850682
Epoch:  96


0.15395193929607803 0.12728008308580943
Epoch:  97


0.1528659462928772 0.1280230720128332
Epoch:  98


0.15409893884852127 0.12740805319377355
Epoch:  99


0.15369342871614405 0.12730525327580317
