In [1]:
# Parameters
# until_x: position threshold inside mobilenet_v2's `features` children.
# Task5Model re-initialises every child whose position is strictly greater
# than this value, and the value is also embedded in the checkpoint file name.
until_x = 9


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec',
                 n_frames=635,
                 n_annotators=3):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column; rows of the same file are numbered
        1, 2, ... in their order of appearance (the ``slno`` level).
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be ignored whenever that unknown column is > 0.5.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotation rows (slno 1..n_annotators) stacked per file.
        Every file must have at least this many rows.

    Returns
    -------
    X : numpy.ndarray, shape (n_files, 1, frames, bins)
    y : numpy.ndarray, shape (n_files, n_known_labels, n_annotators)
    y_mask : numpy.ndarray, same shape as ``y``; 0 where the entry is masked
        out by an active unknown label, 1 elsewhere.
    """
    import os  # local import: the notebook's import cell does not provide `os`

    # Number the annotations of each file and index rows by (file, number).
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the unknown-label columns away from the known-label targets.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels covered by an
    # active unknown label.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so each annotator can be sliced out;
    # sorting keeps the file order identical across annotators.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotator_ids = range(1, n_annotators + 1)
    y = np.stack([df.loc[[k], :].values for k in annotator_ids], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in annotator_ids], axis=2)

    # Load the spectrograms in the same file order as the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load(os.path.join(logmelspec_dir, '{}.npy'.format(name))).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance built with its default arguments;
# AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (input, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X : tensor indexed along its first axis; ``X[idx]`` is one sample whose
        axis 1 is the axis that gets randomly cycled during augmentation.
    y, weights : tensors indexed along their first axis, returned unchanged.
    transform : callable or None
        Called as ``transform(image=array)`` and expected to return a mapping
        with an ``'image'`` entry. When falsy, samples are returned as stored.
    erasing : callable or None
        Erasing op applied after ``transform``. ``None`` (default) uses the
        notebook-level ``random_erasing`` instance.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of X)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample
        when a transform was supplied."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would fill the
            # sample with NaN, so fall back to a unit range in that case.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore a leading channel axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            erase = self.erasing if self.erasing is not None else random_erasing
            sample = erase(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code; only open a trusted file.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy = metadata['taxonomy_df']
unknown_fine = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# For every fine label with fine_id 'X', collect the other fine labels that
# share its coarse class: {fine_x: [fine_y, ...]}.
paired = (
    pd.merge(unknown_fine, known_fine, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = paired.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_fine.fine.tolist()

# Coarse and fine annotation tables side by side, one frame per split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Rows whose values sum to exactly 37 are zeroed out in both splits.
# NOTE(review): 37 presumably equals the number of label columns, i.e. a row
# with every label switched on -- confirm against the metadata.
for split_df in (train_df, valid_df):
    split_df.loc[split_df.sum(axis=1) == 37, :] = 0

In [6]:
# Build the inputs (X), per-annotator targets (y) and loss masks (y_mask)
# for the training and the validation split.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics come from the training split only (one mean/std per entry of the
# last axis, 128 of them) and are applied to both splits.
channel_means = np.mean(train_X.reshape(-1, 128), axis=0)[None, None, None, :]
channel_stds = np.std(train_X.reshape(-1, 128), axis=0)[None, None, None, :]
train_X, valid_X = (
    (train_X - channel_means) / channel_stds,
    (valid_X - channel_means) / channel_stds,
)

In [8]:
# Define the data augmentation transformations
# Random shift / scale / rotate followed by grid distortion. ToTensor comes
# last because AudioDataset.__getitem__ treats the resulting 'image' entry as
# a tensor (it indexes it and calls .permute on it).
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=96, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the two batches that are blended by mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is actually available and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place when it offers ``reset_parameters``.

    Objects without that method are left untouched, which makes the function
    suitable for ``Module.apply``. Always returns ``None``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel input.

    ``bw2col`` maps the 1-channel input to 3 channels with 1x1 convolutions,
    ``mv2.features`` extracts feature maps, the maps are max-pooled over their
    last two axes, and ``final`` produces ``num_classes`` logits.

    The constructor reads the notebook-level ``until_x``: every child of
    ``mv2.features`` whose position is strictly greater than ``until_x`` is
    re-initialised with ``weight_reset``; earlier children keep their
    pretrained weights.
    """

    def __init__(self, num_classes):

        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # classification head on top of the 1280 pooled features
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Reset after ith layer of mv2
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch ``x`` of single-channel inputs."""
        colourised = self.bw2col(x)
        feature_maps = self.mv2.features(colourised)
        # global max pooling: reduce the last axis twice
        pooled = feature_maps.max(dim=-1)[0].max(dim=-1)[0]
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 output logits. NOTE(review): presumably one per known label column kept
# by prepare_data -- confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# - The scheduler is stepped with the validation loss at the end of each epoch.
# - reduction='none' keeps the per-element losses so the training loop can
#   multiply them by the masks before averaging.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def _masked_loss(outputs, labels, masks):
    """Masked loss averaged over the total mask weight.

    ``outputs`` is scored against every slice ``labels[:, :, k]`` of the last
    axis with the notebook-level ``criterion``; each element-wise loss is
    multiplied by ``masks[:, :, k]`` and the grand total is divided by
    ``masks.sum()``.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


epochs = 100
early_stop_patience = 25  # epochs without a new best validation loss before stopping
alpha = 1                 # Beta(alpha, alpha) parameter of the mixup coefficients
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass -------------------------------------------------
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # one coefficient per sample, shared by inputs, labels and masks
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = _masked_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
            this_epoch_train_loss += loss.detach().cpu().numpy()

    # ---- validation pass -----------------------------------------------
    model.eval()
    this_epoch_valid_loss = 0
    for inputs, labels, masks in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        with torch.set_grad_enabled(False):
            outputs = model(inputs)
            loss = _masked_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.detach().cpu().numpy()

    # mean of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Log before the early-stopping check so the final epoch's losses are
    # reported as well.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # keep the checkpoint with the lowest validation loss seen so far
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stop_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6505200839042664 0.5809126615524292
Epoch:  1


0.42975414633750914 0.3099841773509979
Epoch:  2


0.2221325570344925 0.17162040770053863
Epoch:  3


0.17606409788131713 0.20211537182331085
Epoch:  4


0.16600116491317748 0.14221421778202056
Epoch:  5


0.1618267422914505 0.1411706954240799
Epoch:  6


0.15883941650390626 0.1341147840023041
Epoch:  7


0.15848109543323516 0.14266330003738403
Epoch:  8


0.1579832738637924 0.14670783579349517
Epoch:  9


0.1562094980478287 0.14029916524887084
Epoch:  10


0.1542511749267578 0.1329272657632828
Epoch:  11


0.1534079796075821 0.13629670143127443
Epoch:  12


0.1543656951189041 0.13075264543294907
Epoch:  13


0.15340241372585298 0.13124581426382065
Epoch:  14


0.15240014851093292 0.12731814831495286
Epoch:  15


0.15146683931350707 0.13193780481815337
Epoch:  16


0.15168094277381897 0.1285290613770485
Epoch:  17


0.15182076394557953 0.13025525510311126
Epoch:  18


0.15083889842033385 0.12965240180492402
Epoch:  19


0.14991218626499175 0.1302835986018181
Epoch:  20


0.1498202806711197 0.1267281338572502
Epoch:  21


0.14954109191894532 0.1270521342754364
Epoch:  22


0.1504926985502243 0.12762111872434617
Epoch:  23


0.1486998802423477 0.1279677763581276
Epoch:  24


0.14869571030139922 0.13348724246025084
Epoch:  25


0.14917372465133666 0.12705666571855545
Epoch:  26


0.1485451018810272 0.1312612384557724
Epoch    26: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  27


0.14673459589481352 0.12484818249940872
Epoch:  28


0.14431902289390564 0.12417324036359786
Epoch:  29


0.1463049989938736 0.12333793193101883
Epoch:  30


0.14597250521183014 0.12374860495328903
Epoch:  31


0.1453960806131363 0.12377162128686905
Epoch:  32


0.14595299065113068 0.12397455871105194
Epoch:  33


0.14601649045944215 0.12340094894170761
Epoch:  34


0.14428015172481537 0.12335619628429413
Epoch:  35


0.14382599472999572 0.12369272410869599
Epoch    35: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  36


0.14395386695861817 0.12346889525651931
Epoch:  37


0.1451367747783661 0.12344165742397309
Epoch:  38


0.14487432479858398 0.12323759943246841
Epoch:  39


0.14416140377521514 0.12330806404352188
Epoch:  40


0.144583500623703 0.12335644364356994
Epoch:  41


0.14529522240161896 0.12363545149564743
Epoch:  42


0.14411778271198272 0.12344184070825577
Epoch:  43


0.14536354303359986 0.12356074750423432
Epoch:  44


0.14576506972312928 0.1232086330652237
Epoch:  45


0.14391812562942505 0.12336688339710236
Epoch:  46


0.14419248521327974 0.12316632717847824
Epoch:  47


0.1446780037879944 0.12336990982294083
Epoch:  48


0.14368677496910096 0.12322673052549363
Epoch:  49


0.1438675343990326 0.12316444218158722
Epoch:  50


0.14489370346069336 0.12316505908966065
Epoch:  51


0.14550307512283325 0.12310530990362167
Epoch:  52


0.1451865142583847 0.12341836243867874
Epoch:  53


0.1432131153345108 0.12350807189941407
Epoch:  54


0.14450208842754364 0.12320983409881592
Epoch:  55


0.14426505088806152 0.12352429777383804
Epoch:  56


0.14396844685077667 0.12317685186862945
Epoch:  57


0.14413834035396575 0.12320479452610016
Epoch    57: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  58


0.14454099118709565 0.12328082472085952
Epoch:  59


0.14458289504051208 0.12328954041004181
Epoch:  60


0.1436353713274002 0.12334703356027603
Epoch:  61


0.14444840729236602 0.1232931450009346
Epoch:  62


0.14475683748722076 0.12321489006280899
Epoch:  63


0.14293864786624907 0.12321360260248185
Epoch    63: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  64


0.14364617347717284 0.12346346080303192
Epoch:  65


0.1447508680820465 0.12341277152299882
Epoch:  66


0.1439689987897873 0.12326857298612595
Epoch:  67


0.1441967988014221 0.12320331037044525
Epoch:  68


0.1434965807199478 0.12327134311199188
Epoch:  69


0.14460269808769227 0.12325597554445267
Epoch    69: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  70


0.14489632368087768 0.12341363430023193
Epoch:  71


0.14407246530056 0.12351764887571334
Epoch:  72


0.1444444799423218 0.12332580387592315
Epoch:  73


0.14438169896602632 0.1233380228281021
Epoch:  74


0.14498041391372682 0.12328956574201584
Epoch:  75


0.14362601816654205 0.12322011440992356
Epoch:  76
