In [1]:
# Parameters
# Index of the last block of `mobilenet_v2().features` that keeps its
# pretrained weights: Task5Model re-initialises every block whose index is
# greater than this value. It is also used as the checkpoint file-name suffix.
until_x = 6


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Turn an annotation table into spectrogram inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (recording, annotation). The index must be named
        ``audio_filename``; the columns are the label columns, including the
        "unknown" columns listed as keys of ``unknown_to_known``.
    unknown_to_known : dict
        Maps each "unknown" column name to the list of known label columns
        that must be ignored whenever that unknown column is > 0.5.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per
        recording. Defaults to the location used by this notebook.
    n_frames : int, optional
        Number of leading frames kept from every (transposed) spectrogram.
    n_annotations : int, optional
        Number of annotation rows per recording stacked on the last axis.
        Rows are numbered in their order of appearance per recording.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_recordings, 1, n_frames, n_bins)``, where ``n_bins`` is the
        first dimension of the stored arrays.
    y : numpy.ndarray
        Shape ``(n_recordings, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the label must not contribute to the
        loss, 1 elsewhere.

    Recordings are ordered by sorted ``audio_filename`` in all three outputs.
    """
    unknown_columns = list(unknown_to_known.keys())

    # Number the annotations of each recording 1, 2, 3, ... and index by
    # (audio_filename, slno).
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split off the "unknown" columns: they only drive the mask and are not
    # prediction targets themselves.
    df_unknown = df.loc[:, unknown_columns]
    df = df.drop(columns=unknown_columns)

    # Start with every entry enabled, then switch off the known labels that
    # an annotator covered with the corresponding "unknown" label.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that `.loc[[k], :]` selects the k-th
    # annotation of every recording, sorted by filename.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in annotation_ids], axis=2)

    # Stored arrays are transposed before cropping, so the crop acts on what
    # was their second axis.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)  # add the channel axis

    return X, y, y_mask


# Module-level RandomErasing instance (project-local class, constructed with
# its defaults). AudioDataset.__getitem__ calls it on every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, targets, weights)`` triples.

    Parameters
    ----------
    X : torch.Tensor
        Inputs, indexed along the first axis. The augmentation path indexes
        each sample as ``(channel, time, bins)``; assumes a single channel,
        as built by ``prepare_data`` — TODO confirm for other callers.
    y : torch.Tensor
        Targets, indexed along the first axis in step with ``X``.
    weights : torch.Tensor
        Per-entry loss weights / masks, indexed in step with ``X``.
    transform : callable or None
        Albumentations-style callable invoked as ``transform(image=array)``
        and returning a dict with an ``'image'`` tensor. When falsy, samples
        are returned without any augmentation.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the (optionally augmented) sample with its targets and weights."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            value_range = sample.max() - this_min
            if value_range == 0:
                # A constant sample would otherwise divide by zero and fill
                # the output with NaNs; a unit range maps it to all zeros and
                # restores the constant on the way back.
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 so that a random
            # position becomes the new start and the head wraps to the end
            shift = int(np.random.randint(sample.shape[1]))
            sample = torch.roll(sample, shifts=-shift, dims=1)

            # apply albumentations transforms (they operate on numpy images)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code; only open metadata
# files that come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

taxonomy_df = metadata['taxonomy_df']
unknown_rows = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# Pair every "unknown" fine label (fine_id == 'X') with the known fine labels
# that share its coarse class: {unknown fine label: [known fine labels]}.
unknown_known_pairs = (
    unknown_rows
    .merge(known_rows, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = {
    unknown_label: known_group.tolist()
    for unknown_label, known_group in unknown_known_pairs.groupby('fine_x')['fine_y']}
known_labels = known_rows.fine.tolist()

# Coarse and fine annotations side by side, aligned on the index.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any annotation row whose label values add up to 37 is blanked out
# (presumably a row with every label set — TODO confirm the column count).
for split_df in (train_df, valid_df):
    row_sums_to_37 = split_df.sum(axis=1) == 37
    split_df.loc[row_sums_to_37, :] = 0

In [6]:
# Build the spectrogram inputs (X), the per-annotation targets (y) and the
# loss masks (y_mask) for the training and validation splits.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# One mean/std per position of the last (size-128) axis, estimated on the
# training split only and reused unchanged for the validation split.
train_flat = train_X.reshape(-1, 128)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)

train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in order: small shift/scale/rotation, grid distortion, then
# conversion of the image to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that are blended by mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` when it exposes ``reset_parameters``.

    Objects without that attribute are left untouched, which makes the
    function suitable for ``module.apply(weight_reset)``.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        return
    reset()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    A small 1x1-convolution stem maps the 1-channel input to 3 channels, the
    MobileNetV2 ``features`` stack processes it, the result is max-pooled over
    its last two axes, and a two-layer head produces ``num_classes`` logits.

    Feature blocks whose index is greater than the notebook-level ``until_x``
    are re-initialised with ``weight_reset``; earlier blocks keep their
    pretrained weights.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions only
        stem_layers = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*stem_layers)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        head_layers = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head_layers)

        # Reset after ith layer of mv2
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits computed for the input batch ``x``."""
        three_channel = self.bw2col(x)
        feature_maps = self.mv2.features(three_channel)
        # max over the last axis, then over the (new) last axis
        pooled_last, _ = feature_maps.max(dim=-1)
        pooled, _ = pooled_last.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 output logits — presumably one per label column kept by prepare_data
# (TODO confirm against train_y.shape[1]).
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss at the end of every epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps one loss value per (sample, label) so that the
# training loop can multiply it by the mask before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss before stopping
mixup_alpha = 1               # both parameters of the Beta distribution used for mixup

train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked mean of the element-wise criterion over all annotation sets.

    ``outputs`` holds one logit per (sample, label). ``labels`` and ``masks``
    carry an extra trailing axis with one slice per set of annotations; the
    same logits are scored against every slice. Each loss value is multiplied
    by its mask entry and the total is divided by the sum of the masks.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches with one random
        # coefficient per sample, applied alike to inputs, labels and masks
        mixup_vals = torch.Tensor(
            np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0]))

        lam = mixup_vals.view(-1, 1, 1, 1)
        inputs = (lam * batch_a[0]) + ((1 - lam) * batch_b[0])

        lam = mixup_vals.view(-1, 1, 1)
        labels = (lam * batch_a[1]) + ((1 - lam) * batch_b[1])
        masks = (lam * batch_a[2]) + ((1 - lam) * batch_b[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        # explicit so training works even if gradients were disabled earlier
        with torch.set_grad_enabled(True):
            loss = masked_bce_loss(model(inputs), labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no mixup, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            loss = masked_bce_loss(model(inputs), labels, masks)
            this_epoch_valid_loss += loss.item()

    # epoch losses are the average of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # report before the early-stopping check so the last epoch is logged too
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # keep only the weights of the best epoch so far
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6035094317552205 0.42755850298064096
Epoch:  1


0.2755751295669659 0.1666697817189353
Epoch:  2


0.17979938557019104 0.16730625927448273
Epoch:  3


0.1706087943669912 0.16101102318082536
Epoch:  4


0.16560473997850675 0.16037843695708684
Epoch:  5


0.1633655473187163 0.1506212204694748
Epoch:  6


0.1629499595712971 0.1389241452727999
Epoch:  7


0.1605182185366347 0.14577639954430716
Epoch:  8


0.15928361665558172 0.1530174676861082
Epoch:  9


0.15955788017930211 0.13413324632814952
Epoch:  10


0.15658730872579524 0.13771704797233855
Epoch:  11


0.1563738222057755 0.1373196148446628
Epoch:  12


0.157068333915762 0.13761737516948155
Epoch:  13


0.15670569442413948 0.13096832377570017
Epoch:  14


0.1554607648301769 0.1322293717946325
Epoch:  15


0.15411669458891894 0.1332556445683752
Epoch:  16


0.15316139805961299 0.12955564899103983
Epoch:  17


0.15422672637411067 0.13017249745982035
Epoch:  18


0.15509835932705854 0.12796274040426528
Epoch:  19


0.15456027316080556 0.1297529467514583
Epoch:  20


0.15179417866307335 0.12907800184828894
Epoch:  21


0.15317982274132805 0.12776037092719758
Epoch:  22


0.1521447087461884 0.1292313731142453
Epoch:  23


0.1527885834912996 0.13375055257763183
Epoch:  24


0.1511839060364543 0.13065172199692046
Epoch:  25


0.1517267633934279 0.1301773956843785
Epoch:  26


0.15200715169713302 0.12680957785674504
Epoch:  27


0.15133212345677452 0.1297261118888855
Epoch:  28


0.15053748037364031 0.12953033724001475
Epoch:  29


0.1508424269991952 0.12726162799767085
Epoch:  30


0.15073695577479698 0.13074208689587458
Epoch:  31


0.15070815062200701 0.12772468051740102
Epoch:  32


0.1498867378041551 0.12682266426937921
Epoch    32: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  33


0.1485881148963361 0.12445792023624692
Epoch:  34


0.14805421877551722 0.12456803875310081
Epoch:  35


0.14631113752320007 0.12437027586357934
Epoch:  36


0.1466497255338205 0.12420225994927543
Epoch:  37


0.14654548506479007 0.12439381863389697
Epoch:  38


0.1468343883752823 0.12347237127167839
Epoch:  39


0.14554499049444455 0.12302138443504061
Epoch:  40


0.14683239766069361 0.12350815321717944
Epoch:  41


0.14545189669808825 0.12327137270144053
Epoch:  42


0.14664215131385908 0.12324184392179761
Epoch:  43


0.14633974996772972 0.12303598012242999
Epoch:  44


0.14576038718223572 0.12302388357264656
Epoch:  45


0.1453521835642892 0.12370995325701577
Epoch    45: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  46


0.14647066029342445 0.12254437165600914
Epoch:  47


0.14481228992745682 0.12308783829212189
Epoch:  48


0.1462216079235077 0.1230613961815834
Epoch:  49


0.14538440792947202 0.12279786488839559
Epoch:  50


0.1455926766266694 0.12291858664580754
Epoch:  51


0.14574956773100672 0.12285351966108594
Epoch:  52


0.14560765108546694 0.12303323830877032
Epoch    52: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  53


0.14609822228148178 0.1223822363785335
Epoch:  54


0.14619281082539945 0.12317324642624174
Epoch:  55


0.14522181048586563 0.12296474299260549
Epoch:  56


0.14615730860748807 0.12282403026308332
Epoch:  57


0.14478214568382985 0.12270256131887436
Epoch:  58


0.14427304106789665 0.12317166051694325
Epoch:  59


0.14685265437976733 0.12271949329546519
Epoch    59: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  60


0.1456361505630854 0.12333143076726369
Epoch:  61


0.1446755830500577 0.1228085532784462
Epoch:  62


0.1458526765172546 0.12303746065923146
Epoch:  63


0.14468745122084747 0.12321356683969498
Epoch:  64


0.14520539504450722 0.12262832692691258
Epoch:  65


0.14774098186879545 0.12289634879146304
Epoch    65: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  66


0.14570042209045306 0.12296075693198613
Epoch:  67


0.14496520000535087 0.12275887919323784
Epoch:  68


0.14487571772691366 0.12317489300455366
Epoch:  69


0.14694334204132492 0.12304278782435826
Epoch:  70


0.14456552186527769 0.12276795932224818
Epoch:  71


0.14631529031573115 0.12323826764311109
Epoch:  72


0.14454210207268997 0.12279283148901803
Epoch:  73


0.14420310548833898 0.1229515043752534
Epoch:  74


0.14535464950509974 0.12294227310589381
Epoch:  75


0.14510095038929502 0.12285374850034714
Epoch:  76


0.144966642196114 0.12320915503161294
Epoch:  77


0.144648524152266 0.12257183236735207
Epoch:  78
