In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec',
                 n_frames=635,
                 n_annotations=3):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table. After ``reset_index()`` it must expose an
        ``audio_filename`` column; every row is one annotation of that clip
        and every other column is a label column. The frame passed in is
        not modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        it should mask. The unknown columns are removed from the targets.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per clip.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotations : int, optional
        Number of annotation rows per clip that are stacked along the last
        axis of ``y`` / ``y_mask``. Every clip needs at least this many rows,
        otherwise the per-annotation slices have different lengths and
        stacking fails.

    Returns
    -------
    X : numpy.ndarray
        Stacked arrays with a singleton axis inserted at position 1, i.e.
        ``(n_clips, 1, rows, cols)`` where ``rows <= n_frames``.
    y : numpy.ndarray
        ``(n_clips, n_known_labels, n_annotations)`` label values.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the matching unknown column of that
        annotation row is above 0.5, 1 everywhere else.

    Clips are ordered by ``audio_filename`` (the index is sorted).
    """
    df = df.reset_index()
    # Number the annotation rows of each clip 1, 2, 3, ... in order of appearance.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" columns off; only the remaining columns become targets.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that are covered
    # by an unknown column the annotator ticked.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that one annotation set can be
    # sliced out at a time, with clips sorted by filename inside each set.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in annotation_ids], axis=2)

    # Load the spectrograms in the same clip order as the first annotation set.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance built with its default arguments.
# AudioDataset.__getitem__ calls it on each sample when a transform is set.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (sample, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Samples, indexed along the first axis. Each ``X[idx]`` must be
        3-dimensional because the augmentation path slices it as
        ``sample[:, i:, :]``.
    y, weights : torch.Tensor
        Targets and per-label weights, indexed along the first axis like ``X``.
    transform : callable or None
        Called as ``transform(image=array)`` and expected to return a mapping
        whose ``'image'`` entry is a tensor. When falsy, samples are returned
        untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample
        when a transform was supplied."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has a zero range; dividing by it would turn
            # the whole sample into NaN. Use a unit range instead so the
            # scaling (and its inverse below) is a no-op for such samples.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on a numpy image)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the last two axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level `random_erasing` instance)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation with the same (guarded) range
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code - only open a trusted metadata file.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy = metadata['taxonomy_df']
is_other = taxonomy.fine_id == 'X'
other_rows = taxonomy.loc[is_other, ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# Pair every fine label whose fine_id is 'X' with the remaining fine labels
# that share its coarse class, then collect those partners per 'X' label.
label_pairs = (
    other_rows
    .merge(known_rows, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = {
    unknown: list(partners)
    for unknown, partners in label_pairs.groupby('fine_x')['fine_y']}
known_labels = taxonomy.loc[taxonomy.fine_id != 'X'].fine.tolist()

# Coarse and fine label frames are joined column-wise for each split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [4]:
# manual correction: any row whose label columns sum to 37 is zeroed in place
for labels_df in (train_df, valid_df):
    row_sums_to_37 = labels_df.sum(axis=1) == 37
    labels_df.loc[row_sums_to_37, :] = 0

In [5]:
# Build the inputs (X), targets (y) and loss masks (y_mask) for both splits
# using the same unknown -> known label mapping.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# One mean/std per position of the last axis (size 128), estimated on the
# training set only and then applied to both splits.
stats_shape = (1, 1, 1, -1)
channel_means = np.mean(train_X.reshape(-1, 128), axis=0).reshape(stats_shape)
channel_stds = np.std(train_X.reshape(-1, 128), axis=0).reshape(stats_shape)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# The pipeline is applied by AudioDataset.__getitem__ as transform(image=...),
# which then reads the 'image' entry of the result.
# NOTE(review): rotate_limit=0.5 is presumably in degrees (albumentations
# convention) - confirm this near-zero rotation is intended.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    # Last step: hand back a torch tensor (the dataset calls .permute on it).
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples pass through unchanged.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the same training set; the training
# loop zips them to obtain the pairs of batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook still runs end-to-end on a machine without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise `layer` through its own `reset_parameters` method.

    Objects that do not define `reset_parameters` are left untouched, which
    makes the function safe to pass to `Module.apply`.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Placeholder default for the notebook parameter `until_x`.
# Task5Model.__init__ compares layer indices against it (`i <= until_x`),
# so it has to be replaced by a number before the model is built.
until_x = None

In [12]:
# Parameters
# NOTE(review): looks like an injected parameters cell (e.g. papermill) that
# overrides the placeholder set in the previous cell - confirm.
# Children of mv2.features with index <= until_x are re-initialised by
# Task5Model; the value is also part of the checkpoint file name.
until_x = 16


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    Pipeline (see ``forward``):

    1. ``bw2col``  - BatchNorm2d on the single input channel followed by two
       1x1 convolutions (1 -> 10 -> 3 channels, ReLU after each) so that the
       input matches the three channels MobileNetV2 expects.
    2. ``mv2.features`` - the convolutional part of a pretrained MobileNetV2.
    3. Max over the last two axes of the feature map.
    4. ``final`` - Linear(1280, 512) -> ReLU -> BatchNorm1d -> Linear(512,
       num_classes); the outputs are raw scores (no sigmoid is applied here).

    Parameters
    ----------
    num_classes : int
        Number of output units.
    reset_until : int, optional
        Children of ``mv2.features`` with index ``<= reset_until`` have their
        pretrained weights re-initialised via ``weight_reset``; pass ``-1`` to
        keep every pretrained layer. When omitted, the module-level
        ``until_x`` is used, as before.

    Raises
    ------
    TypeError
        If neither ``reset_until`` nor the module-level ``until_x`` provides
        a value. Raised before the backbone is built.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # Resolve the reset depth up front: fall back to the notebook-level
        # parameter and fail early (same exception type the `<=` comparison
        # with None would raise later) instead of after loading the backbone.
        if reset_until is None:
            reset_until = until_x
        if reset_until is None:
            raise TypeError(
                'until_x is None: set until_x (or pass reset_until) to the '
                'index of the last mv2.features layer to re-initialise')

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2 (inclusive); later layers keep their
        # pretrained weights.
        for i, block in enumerate(self.mv2.features.children()):
            if i <= reset_until:
                block.apply(weight_reset)

    def forward(self, x):
        """Map a batch of single-channel inputs to ``num_classes`` scores."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis, then the new last axis.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# Builds the network with 31 output units and moves it to `device`.
# NOTE(review): 31 presumably equals the number of label columns in
# train_y / valid_y - confirm.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped once per epoch with the validation loss in the training loop; the
# logged run shows the learning rate dropping tenfold when it triggers.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps one loss value per element so the training loop can
# multiply by the label masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def _masked_bce(loss_fn, outputs, labels, masks):
    """Masked loss averaged over the total mask weight.

    `labels` and `masks` carry a trailing axis with one slice per annotation
    set. Every slice is scored against the same `outputs` with `loss_fn`
    (which must return unreduced, per-element losses), weighted element-wise
    by its mask and summed; the total is divided by `masks.sum()`.
    """
    total = sum(
        (loss_fn(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs allowed without a new best validation loss
alpha = 1  # both parameters of the Beta distribution the mixup weights come from
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass -------------------------------------------------
    model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently, so each step
    # sees two different random batches of equal size to mix together.
    for (x_a, y_a, m_a), (x_b, y_b, m_b) in zip(train_loader_1, train_loader_2):

        # mixup: one mixing weight per sample, shared by inputs, labels and masks
        mixup_vals = np.random.beta(alpha, alpha, x_a.shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.reshape(-1, 1, 1, 1)
        lam_targets = lam.reshape(-1, 1, 1)

        inputs = ((lam_inputs * x_a) + ((1 - lam_inputs) * x_b)).to(device)
        labels = ((lam_targets * y_a) + ((1 - lam_targets) * y_b)).to(device)
        masks = ((lam_targets * m_a) + ((1 - lam_targets) * m_b)).to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = _masked_bce(criterion, outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass -----------------------------------------------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_bce(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Printed before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the best validation loss so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6355061579394985 0.4752901494503021
Epoch:  1


0.32730974377812566 0.19315872022083827
Epoch:  2


0.193581974586925 0.17522531747817993
Epoch:  3


0.18214910054529035 0.176849793110575
Epoch:  4


0.17971649081320376 0.1698351353406906
Epoch:  5


0.17850394184524948 0.1648300290107727
Epoch:  6


0.17547500093240995 0.16663953449044908
Epoch:  7


0.17413424237354383 0.16518913209438324
Epoch:  8


0.17434884728612127 0.16319716828210012
Epoch:  9


0.17128459182945457 0.1664033489567893
Epoch:  10


0.1715200293708492 0.16414631690297807
Epoch:  11


0.17056452905809558 0.1667431082044329
Epoch:  12


0.1718951483836045 0.15681552461215428
Epoch:  13


0.16764159460325498 0.15446802335126059
Epoch:  14


0.1676802196212717 0.1522586452109473
Epoch:  15


0.16706968400929426 0.14695203517164504
Epoch:  16


0.16756644200634313 0.15543708205223083
Epoch:  17


0.16673001606722135 0.1513399907520839
Epoch:  18


0.1666479859803174 0.15255977426256453
Epoch:  19


0.16705443689952026 0.1475930522595133
Epoch:  20


0.16529568828441002 0.14643987055335725
Epoch:  21


0.16386345391337936 0.14535877321447646
Epoch:  22


0.16508290534083908 0.1473010310104915
Epoch:  23


0.1631072517987844 0.14570880149091994
Epoch:  24


0.1629929300901052 0.14313179041658128
Epoch:  25


0.1618786413927336 0.1395617574453354
Epoch:  26


0.16320834731733477 0.14899425847189768
Epoch:  27


0.1626651504555264 0.13983125665358134
Epoch:  28


0.16227943067615097 0.14470740407705307
Epoch:  29


0.16054518118097977 0.1397508414728301
Epoch:  30


0.15962229185813182 0.14217913150787354
Epoch:  31


0.16034138887315183 0.13816339522600174
Epoch:  32


0.16011745583366704 0.1641682865364211
Epoch:  33


0.1612902315887245 0.13832234484808786
Epoch:  34


0.158839421900543 0.1371431222983769
Epoch:  35


0.16038979106658213 0.13984252086707524
Epoch:  36


0.15897436439990997 0.13621113449335098
Epoch:  37


0.15883869053544225 0.140695045036929
Epoch:  38


0.15763800691913915 0.13677183432238443
Epoch:  39


0.15863396630093857 0.13833037550960267
Epoch:  40


0.15860445918263616 0.1364085695573262
Epoch:  41


0.1589068158252819 0.1360932428921972
Epoch:  42


0.15616506136752464 0.13545597344636917
Epoch:  43


0.15833398176206126 0.13546756654977798
Epoch:  44


0.15686880213183327 0.1356411650776863
Epoch:  45


0.15602209922429677 0.13928572407790593
Epoch:  46


0.1571181138625016 0.13488499075174332
Epoch:  47


0.15615001683299606 0.13601510439600265
Epoch:  48


0.15726875695022377 0.13197020441293716
Epoch:  49


0.1562648790108191 0.1343580526965005
Epoch:  50


0.1560526607004372 0.13443922145026072
Epoch:  51


0.1577800794227703 0.13766741859061377
Epoch:  52


0.15591111859759768 0.13414235519511358
Epoch:  53


0.15538224336263295 0.13159940817526408
Epoch:  54


0.15525476916416273 0.13358448445796967
Epoch:  55


0.15450088961704359 0.1307074385029929
Epoch:  56


0.15474066863188873 0.13346782539572036
Epoch:  57


0.15481594084082423 0.13262699012245452
Epoch:  58


0.15588280196125442 0.13315689670188086
Epoch:  59


0.1545368408834612 0.1322153838617461
Epoch:  60


0.15455761914317673 0.1379502011196954
Epoch:  61


0.15328613447176442 0.1298536562493869
Epoch:  62


0.15441557201179298 0.13266766922814505
Epoch:  63


0.15410493072625753 0.13739634305238724
Epoch:  64


0.15512549393885844 0.13313237045492446
Epoch:  65


0.1541759971831296 0.1331558557493346
Epoch:  66


0.15251568080605687 0.13208283803292684
Epoch:  67


0.15426097367260908 0.12965828606060573
Epoch:  68


0.15278893225901835 0.1288579904607364
Epoch:  69


0.1540899466018419 0.1338972406727927
Epoch:  70


0.15196704783955137 0.13218407652207784
Epoch:  71


0.1540432101165926 0.1297388928277152
Epoch:  72


0.15335976755296862 0.12986749517066137
Epoch:  73


0.15351361237667702 0.12953121853726252
Epoch:  74


0.15239559234799566 0.1325434297323227
Epoch    74: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  75


0.15120014508028287 0.1293343592967306
Epoch:  76


0.1509071503942077 0.12977329322269984
Epoch:  77


0.15105114998044195 0.12862227112054825
Epoch:  78


0.1498172943656509 0.127768232354096
Epoch:  79


0.1507221496588475 0.12836591047900064
Epoch:  80


0.15112375971433278 0.12747957876750401
Epoch:  81


0.15106250144339897 0.12851768944944655
Epoch:  82


0.15063049825462135 0.1284691563674382
Epoch:  83


0.1513774117102494 0.1290639606969697
Epoch:  84


0.14900378963431796 0.12701936279024398
Epoch:  85


0.1502595523724685 0.1275575182267598
Epoch:  86


0.14996991366953463 0.1283366403409413
Epoch:  87


0.14963094366563334 0.12861127299921854
Epoch:  88


0.15045823801208186 0.12754898624760763
Epoch:  89


0.15070620461090192 0.12775432424885885
Epoch:  90


0.1500457344022957 0.1272412644965308
Epoch    90: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  91


0.14798721791924657 0.12749405950307846
Epoch:  92


0.14977252362547694 0.1279034497482436
Epoch:  93


0.14708117939330437 0.12682811596563884
Epoch:  94


0.1494183161774197 0.12741157625402724
Epoch:  95


0.14919759856688009 0.12719852051564626
Epoch:  96


0.14917844010365977 0.12660456555230276
Epoch:  97


0.14864202770026955 0.1277856986437525
Epoch:  98


0.1494892964491973 0.12776174076965877
Epoch:  99


0.1489674658388705 0.12795067791427886
