In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column; every other column is treated as a label.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be masked out when that unknown label is set (> 0.5).
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per clip.
    n_frames : int, optional
        Number of leading rows kept from each transposed spectrogram.
    n_annotators : int, optional
        Number of annotation rows per clip that are stacked (rows are numbered
        1..n_annotators in their order of appearance for each clip).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(clips, 1, n_frames, spectrogram_rows)``.
    y : numpy.ndarray
        Shape ``(clips, known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where a known label is hidden by a set
        "unknown" label of the same annotation, 1 elsewhere.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number each clip's annotations 1, 2, 3, ... so they can be aligned later.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns off from the real targets.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start with every target visible, then hide the known labels covered by
    # an "unknown" flag raised in the same annotation row.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename): each annotator slice is then sorted by
    # filename, so the slices line up row by row.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotators = range(1, n_annotators + 1)
    y = np.stack([df.loc[[k], :].values for k in annotators], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in annotators], axis=2)

    # Load the spectrograms in the same (sorted) clip order as the targets.
    filenames = df.loc[[1], :].reset_index(level=1)['audio_filename'].tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (constructed with its default arguments);
# AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (spectrogram, targets, loss weights) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Samples indexed along the first dimension; each item is sliced as
        ``X[idx, ...]`` and cycled along its dimension 1.
    y : torch.Tensor
        Targets, indexed the same way as ``X``.
    weights : torch.Tensor
        Per-target loss weights / masks, indexed the same way as ``X``.
    transform : callable, optional
        Augmentation called as ``transform(image=array)`` and expected to
        return a mapping with an ``'image'`` entry. When falsy, samples are
        returned untouched.
    erasing : callable, optional
        Random-erasing callable applied after ``transform``. Defaults to the
        module-level ``random_erasing`` instance.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first dimension of X)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample if a transform is set."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation to [0, 1]; a constant sample has zero
            # range, so use a unit range instead of dividing by zero (which
            # would fill the sample with NaNs).
            this_min = sample.min()
            this_max = sample.max()
            span = this_max - this_min
            if span == 0:
                span = torch.ones_like(span)
            sample = (sample - this_min) / span

            # randomly cycle the file: rotate along dimension 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on a numpy image,
            # hence the round trip through a PIL image)
            # NOTE(review): the PIL conversion presumably quantises the
            # normalised values to 8 bit — confirm this loss is acceptable.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add the leading channel dimension back and swap the last two axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            eraser = self.erasing if self.erasing is not None else random_erasing
            sample = eraser(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * span) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only open metadata files that
# come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
unknown_rows = taxonomy_df.loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy_df.loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']]

# Pair every fine label whose fine_id is 'X' with the other fine labels that
# share its coarse class, then collect those partners into one list per label.
unknown_known_pairs = pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
unknown_known_pairs = unknown_known_pairs.drop(columns='coarse')
unknown_to_known = (
    unknown_known_pairs
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = taxonomy_df.loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Put the coarse and fine annotation tables side by side for each split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [4]:
# manual correction for one data point
# Any annotation row whose label values add up to 37 is zeroed out in place.
# NOTE(review): 37 presumably means "every label flagged" — confirm.
for split_df in (train_df, valid_df):
    flagged_rows = split_df.sum(axis=1) == 37
    split_df.loc[flagged_rows, :] = 0

In [5]:
# Build the (X, y, y_mask) arrays for each split; both splits use the same
# unknown -> known label mapping.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics are computed per position of the last axis (128 entries) on the
# training split only, then applied unchanged to the validation split.
channel_means, channel_stds = (
    stat(train_X.reshape(-1, 128), axis=0).reshape(1, 1, 1, -1)
    for stat in (np.mean, np.std)
)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Applied in order: small random shift/scale/rotation, grid distortion, and
# finally conversion of the image to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the same training set: the training loop
# zips them to obtain the pairs of batches it mixes together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# `cuda` states the preference; when no CUDA device is available the notebook
# falls back to the CPU instead of failing later at the first `.to(device)`.
cuda = True
device = torch.device('cuda:0' if cuda and torch.cuda.is_available() else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` through its ``reset_parameters`` method, if it has one.

    Objects without a ``reset_parameters`` attribute are left untouched.
    Intended to be passed to ``Module.apply``.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        return
    reset()

In [11]:
# Default value of the layer-reset cutoff read by Task5Model at construction
# time; the "Parameters" cell that follows overrides it.
until_x = None

In [12]:
# Parameters
# until_x: index of the last child of mv2.features whose weights Task5Model
# re-initialises (the comparison is inclusive: children 0..until_x are reset).
# NOTE(review): this looks like an injected parameters cell (e.g. papermill) — confirm.
until_x = 11


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel inputs.

    Pipeline: a learned 1 -> 3 channel adapter (``bw2col``), the MobileNetV2
    feature extractor, a global max over the two trailing dimensions, and a
    two-layer classification head producing ``num_classes`` logits.

    The first children of ``mv2.features`` (indices ``0..until_x``, inclusive)
    are re-initialised after the pretrained weights are loaded. ``until_x`` is
    read from the notebook's global namespace at construction time; when it is
    ``None`` no layer is reset and all pretrained weights are kept.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    """

    def __init__(self, num_classes):

        super().__init__()

        # 1x1 convolutions mapping the single input channel to the 3 channels
        # consumed by the MobileNetV2 feature extractor.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2
        self._reset_leading_layers(self.mv2.features, until_x)

    @staticmethod
    def _reset_leading_layers(features, until):
        """Re-initialise children ``0..until`` (inclusive) of ``features``; no-op when ``until`` is None."""
        if until is None:
            return
        for i, layer in enumerate(features.children()):
            if i > until:
                break
            layer.apply(weight_reset)

    def forward(self, x):
        """Return the ``(batch, num_classes)`` logits for a batch of single-channel inputs."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # global max pooling: reduce the last dimension, then the new last one
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits; the model is moved to the selected device right away.
# NOTE(review): 31 presumably equals the number of label columns in train_y — confirm.
model = Task5Model(num_classes=31)
model = model.to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    amsgrad=True,
)
# The scheduler is stepped with the validation loss by the training loop and
# reports every learning-rate reduction (verbose=True).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=5,
    verbose=True,
)
# reduction='none' keeps the element-wise losses so the training loop can mask
# them before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
epochs = 100
MIXUP_ALPHA = 1           # both parameters of the Beta distribution the mixup weights are drawn from
EARLY_STOP_PATIENCE = 25  # epochs without a new best validation loss before training stops

train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss averaged over all annotation sets.

    ``outputs`` holds one logit per (sample, class); ``labels`` and ``masks``
    carry an extra trailing dimension with one slice per annotation set. The
    element-wise loss of every slice is weighted by its mask, and the total is
    divided by the sum of all mask values.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches with one random
        # weight per sample; inputs, labels and masks share the same weight
        mixup_vals = np.random.beta(MIXUP_ALPHA, MIXUP_ALPHA, batch_a[0].shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.reshape(-1, 1, 1, 1)
        lam_targets = lam.reshape(-1, 1, 1)

        inputs = (lam_inputs * batch_a[0]) + ((1 - lam_inputs) * batch_b[0])
        labels = (lam_targets * batch_a[1]) + ((1 - lam_targets) * batch_b[1])
        masks = (lam_targets * batch_a[2]) + ((1 - lam_targets) * batch_b[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no mixup, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    for inputs, labels, masks in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        with torch.set_grad_enabled(False):
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
        this_epoch_valid_loss += loss.item()

    # epoch losses are the plain average of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # checkpoint on every new best validation loss
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= EARLY_STOP_PATIENCE:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6329796716973588 0.4915466734341213
Epoch:  1


0.32646096719277873 0.20831815046923502
Epoch:  2


0.18892657474891558 0.16844522101538523
Epoch:  3


0.1773162145872374 0.19987677250589644
Epoch:  4


0.17525163655345505 0.16300700179168157
Epoch:  5


0.17198772204888835 0.17122681651796615
Epoch:  6


0.17259063350187764 0.1559282328401293
Epoch:  7


0.172905440668802 0.2240269035100937
Epoch:  8


0.169913920196327 1.4121757767030172
Epoch:  9


0.1691664924492707 0.15711910384041922
Epoch:  10


0.16908344223692612 0.18500972432749613
Epoch:  11


0.1674204875488539 0.15367271644728525
Epoch:  12


0.16547676840343992 0.16205063249383653
Epoch:  13


0.16447248088346944 0.1468332154410226
Epoch:  14


0.1648593307346911 0.14357404517275946
Epoch:  15


0.16405316341567683 0.14218717919928686
Epoch:  16


0.16172853353861216 0.14142255804368428
Epoch:  17


0.16089719533920288 0.13906818202563695
Epoch:  18


0.16234322214448774 0.13827823208911078
Epoch:  19


0.16175241929453774 0.1403775598321642
Epoch:  20


0.16020075654661334 0.13663678296974727
Epoch:  21


0.15864729398005717 0.13534955467496598
Epoch:  22


0.15944475217445478 0.13444848145757402
Epoch:  23


0.15881879990165298 0.13963766396045685
Epoch:  24


0.15831235776076447 0.13971143428768432
Epoch:  25


0.15705821884645 0.13298735554729188
Epoch:  26


0.15722634381539113 0.1327022218278476
Epoch:  27


0.15648178393776352 0.1325938105583191
Epoch:  28


0.15700348728411906 0.1315488645008632
Epoch:  29


0.15631865850976995 0.13164390410695756
Epoch:  30


0.1566477148114024 0.13679675864321844
Epoch:  31


0.15582825565660322 0.132496463400977
Epoch:  32


0.15590046467007818 0.13575834461620875
Epoch:  33


0.1557866781949997 0.13341455374445235
Epoch:  34


0.15577103721128926 0.13160748673336847
Epoch    34: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  35


0.15283633889378728 0.12787100885595595
Epoch:  36


0.15390887937030276 0.12656764686107635
Epoch:  37


0.15253016916481224 0.12635655062539236
Epoch:  38


0.15236268534853653 0.12598096579313278
Epoch:  39


0.1511286460870021 0.12626482439892633
Epoch:  40


0.15224037822839376 0.1254850732428687
Epoch:  41


0.15199751265951106 0.12601330344166076
Epoch:  42


0.15205849425212756 0.12561786068337305
Epoch:  43


0.15142766768867905 0.12550031393766403
Epoch:  44


0.15093534902946368 0.12544073270899908
Epoch:  45


0.15217274669054393 0.1257323377898761
Epoch:  46


0.1528940144422892 0.12575294928891317
Epoch:  47


0.15187308957447876 0.12594895384141377
Epoch:  48


0.15209643824680433 0.12527033473764146
Epoch:  49


0.15226830219900286 0.1255061456135341
Epoch:  50


0.15044092447371096 0.12468628266027995
Epoch:  51


0.15088959078531008 0.12572911381721497
Epoch:  52


0.15157620689353427 0.12539332253592356
Epoch:  53


0.15113620822494095 0.12497695748295103
Epoch:  54


0.15133682980730728 0.12436651438474655
Epoch:  55


0.1508097459335585 0.12416298368147441
Epoch:  56


0.15030077542807604 0.12456463809524264
Epoch:  57


0.15081261219205083 0.12442616266863686
Epoch:  58


0.15212627963439837 0.12441523586000715
Epoch:  59


0.151786105455579 0.12500461829560144
Epoch:  60


0.151047109752088 0.1247067962374006
Epoch:  61


0.15020793837469978 0.12493049246924263
Epoch    61: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  62


0.1500218232741227 0.12447928317955562
Epoch:  63


0.1512014781301086 0.12460984076772418
Epoch:  64


0.15032602323068156 0.12469905082668577
Epoch:  65


0.1507235926550788 0.12446660654885429
Epoch:  66


0.15038767014000867 0.12457829820258277
Epoch:  67


0.1506213086682397 0.12459025744880949
Epoch    67: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  68


0.15003873850848223 0.1243428151522364
Epoch:  69


0.1507417036069406 0.12434538240943636
Epoch:  70


0.1498559197864017 0.1241521994982447
Epoch:  71


0.1484406437422778 0.1243998184800148
Epoch:  72


0.1509413872216199 0.12451630724327904
Epoch:  73


0.14838705433381572 0.12467343040875026
Epoch    73: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  74


0.1497961854612505 0.12442195947681155
Epoch:  75


0.15043929621979996 0.12490468046494893
Epoch:  76


0.14899429316456253 0.12470704423529762
Epoch:  77


0.15099595285750725 0.12459790280887059
Epoch:  78


0.14975248599374616 0.12459275871515274
Epoch:  79


0.14978405108322967 0.12438682147434779
Epoch    79: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  80


0.14970506929062508 0.12482162245682307
Epoch:  81


0.15053301203895258 0.12455597519874573
Epoch:  82


0.1506921971166456 0.12428527964012963
Epoch:  83


0.15113612042891011 0.12415177268641335
Epoch:  84


0.15047905614247192 0.12444796626056943
Epoch:  85


0.14931836643734495 0.1241810428244727
Epoch:  86


0.15030370048574498 0.12458492389747075
Epoch:  87


0.1500975256030624 0.12432636001280375
Epoch:  88


0.15110599350284887 0.12447962271315711
Epoch:  89


0.15107311227837125 0.12453747540712357
Epoch:  90


0.14888010435813182 0.12429874390363693
Epoch:  91


0.14992398305519208 0.12442685876573835
Epoch:  92


0.15033537431343183 0.12443756631442479
Epoch:  93


0.15062020034403414 0.1245625029717173
Epoch:  94


0.15003152836013484 0.12437460997274943
Epoch:  95


0.15055046814519005 0.12429826919521604
Epoch:  96


0.15105270050667427 0.12447953330618995
Epoch:  97


0.14942149656849937 0.12482550314494542
Epoch:  98


0.1492838754847243 0.12454049182789666
Epoch:  99


0.1498080649085947 0.12422885745763779
