In [1]:
# Parameters
# until_x: index of the last child of the MobileNetV2 `features` stack that keeps
# the weights it was loaded with; Task5Model re-initialises every child whose
# index is greater than this value. The value is also embedded in the name of
# the checkpoint file written by the training loop.
until_x = 1


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec', n_frames=635):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table indexed by ``audio_filename`` with one row per
        annotation and one column per label. The code below selects
        annotation numbers 1, 2 and 3 for every file, so each file is
        expected to appear exactly three times.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known label
        columns it makes unreliable. The unknown columns are removed from the
        targets; wherever an unknown column is > 0.5 the listed known columns
        are masked out (mask value 0) for that annotation.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per file.
        Defaults to the location the notebook has always used.
    n_frames : int, optional
        Number of leading rows kept from each transposed spectrogram
        (defaults to 635, the original hard-coded crop).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, frames_kept, n_bins)``; files are ordered by
        sorted ``audio_filename``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, 3)``; last axis is the annotation
        number (1, 2, 3).
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target should contribute to the
        loss, 0 where it was masked by an unknown label.
    """
    # Number the annotations of every file 1, 2, 3, ... in their original order.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the unknown columns off; only the known columns become targets.
    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from "everything counts" and zero the known labels that an
    # annotator could not tell apart (flagged through the unknown column).
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that each annotator's rows can be
    # selected as one block, with files in the same (sorted) order each time.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_numbers = (1, 2, 3)
    y = np.stack([df.loc[[k], :].values for k in annotation_numbers], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in annotation_numbers], axis=2)

    # Load the spectrograms in the same file order as the targets.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack(
        [np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :] for name in filenames],
        axis=0)
    X = np.expand_dims(X, axis=1)  # add the single image channel

    return X, y, y_mask


# Shared RandomErasing instance (constructed with its default arguments).
# AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Indexable collection of (sample, target, weight) triples.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When a
    ``transform`` is supplied, every sample is augmented on access; without
    one the stored sample is returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        # Converter used to hand a sample to the image-based augmentations.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for position ``idx``."""
        sample = self.X[idx, ...]
        target = self.y[idx, ...]
        weight = self.weights[idx, ...]

        # No augmentation requested: hand back the stored sample as is.
        if not self.transform:
            return sample, target, weight

        # Rescale to [0, 1]; the range is remembered so it can be restored.
        lowest = sample.min()
        highest = sample.max()
        value_range = highest - lowest
        sample = (sample - lowest) / value_range

        # Rotate the sample along axis 1 by a random offset (wraps around).
        offset = np.random.randint(sample.shape[1])
        sample = torch.roll(sample, shifts=-offset, dims=1)

        # Run the albumentations pipeline on an image version of the sample.
        as_image = np.array(self.pil(sample))
        augmented = self.transform(image=as_image)['image']
        sample = augmented[None, :, :].permute(0, 2, 1)

        # Random erasing works on a detached copy of the augmented tensor.
        sample = random_erasing(sample.clone().detach())

        # Undo the [0, 1] rescaling.
        sample = (sample * value_range) + lowest

        return sample, target, weight

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code while unpickling; only point
# this at a metadata file from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map every fine label whose fine_id is 'X' to the list of fine labels with a
# different fine_id that share the same 'coarse' value. The self-merge on
# 'coarse' yields the columns 'fine_x' (the 'X' label) and 'fine_y' (its
# siblings), which are then collected into {fine_x: [fine_y, ...]}.
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# Names of all fine labels whose fine_id is not 'X' (not referenced again in
# the cells visible here).
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Place the coarse and fine annotation tables side by side (column-wise),
# aligned on their index; the *_test tables serve as the validation split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point: any annotation row whose label values
# sum to exactly 37 is reset to all zeros (presumably a row with every label
# column set -- TODO confirm against the metadata).
for annotations in (train_df, valid_df):
    row_needs_reset = annotations.sum(axis=1) == 37
    annotations.loc[row_needs_reset, :] = 0

In [6]:
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization: per-bin mean/std are estimated on the training
# inputs only and then applied to both splits. Note that re-running this cell
# normalizes the already-normalized arrays again.
train_bins = train_X.reshape(-1, 128)  # one row per (file, frame), 128 bins each
channel_means = train_bins.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_bins.std(axis=0).reshape(1, 1, 1, -1)

train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations.
# The steps run in the order listed; the last one converts the augmented
# image back to a tensor, which AudioDataset reads from the 'image' entry.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders.
# Only the training split is augmented; validation samples are served as is.
train_tensors = (torch.Tensor(train_X), torch.Tensor(train_y), torch.Tensor(train_y_mask))
valid_tensors = (torch.Tensor(valid_X), torch.Tensor(valid_y), torch.Tensor(valid_y_mask))

train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled passes over the same training set: the training
# loop zips them so that each batch can be mixed with a second random batch.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used: the first GPU when CUDA is available,
# otherwise fall back to the CPU so the notebook still runs end to end.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` if it exposes a ``reset_parameters`` method.

    Objects without that attribute are left untouched, which makes the
    function suitable for ``Module.apply`` over a mixed module tree.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Single-channel input -> MobileNetV2 feature extractor -> linear head.

    ``bw2col`` expands the one-channel input to three channels with 1x1
    convolutions, ``mv2.features`` extracts feature maps, the maps are
    max-pooled over their last two axes and ``final`` produces
    ``num_classes`` outputs per sample.

    The constructor reads the module-level ``until_x``: every child of
    ``mv2.features`` whose index is greater than ``until_x`` has its
    parameters re-initialised via ``weight_reset``.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 channel -> 10 -> 3 channels, all with 1x1 kernels.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classification head applied to the 1280 pooled features.
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Keep the loaded weights up to and including child `until_x`,
        # re-initialise everything after it.
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return one row of ``num_classes`` outputs per input sample."""
        three_channel = self.bw2col(x)
        feature_maps = self.mv2.features(three_channel)
        # Max over the last axis, then over the (new) last axis.
        pooled = feature_maps.max(dim=-1)[0].max(dim=-1)[0]
        return self.final(pooled)

In [13]:
# Instantiate the model and move it to the selected device.
# 31 output logits: this has to equal the size of the label axis of the
# targets, because the loss compares the outputs with labels[:, :, k].
model = Task5Model(num_classes=31)
model = model.to(device)

In [14]:
# Define optimizer, scheduler and loss criteria.
# The criterion keeps per-element losses (reduction='none') because the
# training loop multiplies them by the label masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

optimizer = optim.Adam(model.parameters(), lr=1e-3, amsgrad=True)
# Stepped once per epoch with the validation loss.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)

In [15]:
# Training configuration and bookkeeping.
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
alpha = 1                     # mixup coefficients are drawn from Beta(alpha, alpha)
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def _masked_bce_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over all annotation sets.

    ``labels`` and ``masks`` carry one slice per annotation set along their
    last axis. For every slice the element-wise ``criterion`` between
    ``outputs`` and that slice is weighted by the matching mask slice and
    summed; the grand total is divided by the sum of all mask values.
    """
    per_annotation_sums = (
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return sum(per_annotation_sums) / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass -------------------------------------------------
    model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently, so every batch
    # is paired with a second, differently ordered batch for mixup.
    for (inputs_1, labels_1, masks_1), (inputs_2, labels_2, masks_2) in zip(train_loader_1, train_loader_2):

        # mixup: one mixing coefficient per sample, shared by the input,
        # the labels and the masks of that sample.
        n_samples = inputs_1.shape[0]
        mixup_vals = np.random.beta(alpha, alpha, n_samples)

        lam = torch.Tensor(mixup_vals.reshape(n_samples, 1, 1, 1))
        inputs = (lam * inputs_1) + ((1 - lam) * inputs_2)

        lam = torch.Tensor(mixup_vals.reshape(n_samples, 1, 1))
        labels = (lam * labels_1) + ((1 - lam) * labels_2)
        masks = (lam * masks_1) + ((1 - lam) * masks_2)

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = _masked_bce_loss(criterion, outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass -----------------------------------------------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_bce_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping; note that the losses of the stopping epoch are not printed.
    if epochs_without_new_lowest >= early_stopping_patience:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6066714597714914 0.4255269340106419
Epoch:  1


0.2787562258340217 0.19254303404263087
Epoch:  2


0.18578482600482735 0.18722906495843614
Epoch:  3


0.17667575583264633 0.2246853347335543
Epoch:  4


0.1727399439425082 0.1716136932373047
Epoch:  5


0.1705692354086283 0.16509163166795457
Epoch:  6


0.17013513800260183 0.15389912681920187
Epoch:  7


0.16660451566850817 0.15098580718040466
Epoch:  8


0.16778769525321754 0.15299572476318904
Epoch:  9


0.1646108627319336 0.1513912911925997
Epoch:  10


0.1637770900049725 0.15511811205318995
Epoch:  11


0.16347845663895477 0.14090277148144587
Epoch:  12


0.16252132847502426 0.14477072017533438
Epoch:  13


0.16255528459677826 0.14456985677991593
Epoch:  14


0.15961908770574107 0.14161663502454758
Epoch:  15


0.16105853423878952 0.1441464541213853
Epoch:  16


0.16108702203711947 0.14204198441335134
Epoch:  17


0.15991854144109263 0.13669887078659876
Epoch:  18


0.1592170265880791 0.13758384649242675
Epoch:  19


0.1572604980823156 0.1400678093944277
Epoch:  20


0.15817454779470289 0.13759332363094604
Epoch:  21


0.1571537663002272 0.13464408687182836
Epoch:  22


0.1578042889769013 0.13920889688389643
Epoch:  23


0.15723237032825882 0.1326622228537287
Epoch:  24


0.15613312092987267 0.13203205381120955
Epoch:  25


0.15660090623675166 0.14260110791240418
Epoch:  26


0.155808360592739 0.13253528305462428
Epoch:  27


0.15559895175534325 0.13128829853875296
Epoch:  28


0.15594678633921855 0.13195452945572989
Epoch:  29


0.15447896114877752 0.13588741634573256
Epoch:  30


0.15386262294408437 0.13025483701910293
Epoch:  31


0.15588743581965164 0.13445020892790385
Epoch:  32


0.15438801936201146 0.12899193167686462
Epoch:  33


0.15258299901678757 0.13278221232550486
Epoch:  34


0.15300058835261576 0.13345923381192343
Epoch:  35


0.15374805194300575 0.13231407957417624
Epoch:  36


0.15250231003439105 0.1300225406885147
Epoch:  37


0.15382508812723933 0.12996146508625575
Epoch:  38


0.15296049174424764 0.12878515358482087
Epoch:  39


0.15242041405793783 0.1326788333909852
Epoch:  40


0.15253069634373123 0.13053918204137258
Epoch:  41


0.15223421317500038 0.1284763781087739
Epoch:  42


0.1518970441979331 0.13064883862222945
Epoch:  43


0.15087267273181193 0.13102450221776962
Epoch:  44


0.1509721871163394 0.13517120480537415
Epoch:  45


0.15161309008662766 0.13295378642422812
Epoch:  46


0.1511126668066592 0.1326943380492074
Epoch:  47


0.15133486164582743 0.13284739000456675
Epoch    47: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  48


0.1504875655109818 0.12845296944890702
Epoch:  49


0.14948068357802727 0.12644609489611217
Epoch:  50


0.15006080712821032 0.12622643900769098
Epoch:  51


0.14972911412651474 0.12537875132901327
Epoch:  52


0.14863836241734996 0.12553659400769643
Epoch:  53


0.14897442548661619 0.1256463070000921
Epoch:  54


0.1484490085292507 0.12557166176182882
Epoch:  55


0.14763378976164637 0.12507283368280955
Epoch:  56


0.14896998091323957 0.12491711867707116
Epoch:  57


0.1491967061081448 0.1250495527471815
Epoch:  58


0.1478823544205846 0.12501675209828786
Epoch:  59


0.14754404490058487 0.12488984423024314
Epoch:  60


0.14771966193173383 0.124446482530662
Epoch:  61


0.14696155528764468 0.12459563889673778
Epoch:  62


0.14740720713460767 0.12489065634352821
Epoch:  63


0.14729621523135417 0.12426145161901202
Epoch:  64


0.14737033481533462 0.12498070831809725
Epoch:  65


0.147884026572511 0.12407309668404716
Epoch:  66


0.14789469701212807 0.12414078946624484
Epoch:  67


0.14691929519176483 0.12454128691128322
Epoch:  68


0.14806330526197278 0.12435753962823323
Epoch:  69


0.1476848765804961 0.12427608243056706
Epoch:  70


0.14836395672849706 0.1239559639777456
Epoch:  71


0.14600006309715477 0.12496393812554223
Epoch:  72


0.14676854376857346 0.12478581709521157
Epoch:  73


0.14648386151403994 0.12405938868011747
Epoch:  74


0.1465955642429558 0.12434327815260206
Epoch:  75


0.14724302734877612 0.12468413476433073
Epoch:  76


0.14603462009816556 0.12433044080223356
Epoch    76: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  77


0.14515949624615745 0.12422902243477958
Epoch:  78


0.14668904567087018 0.12428090934242521
Epoch:  79


0.14668614155537374 0.12390040074075971
Epoch:  80


0.14763206485155467 0.12404412244047437
Epoch:  81


0.14700255482583433 0.12421667469399315
Epoch:  82


0.14534187961269068 0.12403419294527598
Epoch:  83


0.14510935142233566 0.12415369387183871
Epoch:  84


0.14652828872203827 0.12390700834138053
Epoch:  85


0.14676374559466904 0.12423302446092878
Epoch    85: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  86


0.1460344102737066 0.12449540730033602
Epoch:  87


0.14572107147526098 0.12397664466074534
Epoch:  88


0.1469466702358143 0.12425355613231659
Epoch:  89


0.14587582083972725 0.12422674681459155
Epoch:  90


0.14612799238514257 0.12397730989115578
Epoch:  91


0.14711710528747454 0.12419672523226057
Epoch    91: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  92


0.14684590052913976 0.12429936549493245
Epoch:  93


0.14542243126276377 0.12389431787388665
Epoch:  94


0.14649513645752058 0.12426476180553436
Epoch:  95


0.14703048604565697 0.1241263034088271
Epoch:  96


0.1461880396346788 0.12422582826444081
Epoch:  97


0.14561442507279887 0.12400926862444196
Epoch    97: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  98


0.146578241844435 0.12419195366757256
Epoch:  99


0.1464157003808666 0.12402264454535075
