In [1]:
# Parameters
# until_x: feature blocks of the MobileNetV2 backbone whose index is
# <= until_x are re-initialised when Task5Model is built; the value is also
# embedded in the file name of the checkpoint written during training.
until_x = 20


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` the frame must have
        an ``audio_filename`` column; every other column is treated as a
        label column. The caller's frame is not modified.
    unknown_to_known : dict
        Maps the name of each "unknown" label column to the list of known
        label columns that are masked out (mask value 0) for an annotation
        whose unknown label is set (value > 0.5). The unknown columns
        themselves are removed from the targets.
    data_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotation rows used per recording (serial numbers
        ``1..n_annotators`` in row order). Every recording is expected to
        have at least this many rows; additional rows are ignored.

    Returns
    -------
    X : numpy.ndarray
        Stacked arrays with a singleton axis inserted at position 1, i.e.
        ``(n_files, 1, rows, cols)`` where ``rows <= n_frames``.
    y : numpy.ndarray
        Targets of shape ``(n_files, n_known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target takes part in the loss and
        0 where it was masked through ``unknown_to_known``.
    """
    df = df.reset_index()
    # Serial number of each annotation within its recording: 1, 2, 3, ...
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that a set
    # "unknown" label makes unreliable.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the serial number first and sort, so that selecting one serial
    # number yields the recordings in the same (sorted) order every time.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    def stack_annotators(frame):
        # One (n_files, n_labels, 1) slice per annotator, joined on the last axis.
        return np.concatenate(
            [frame.loc[[slno], :].values[..., np.newaxis]
             for slno in range(1, n_annotators + 1)],
            axis=2)

    y = stack_annotators(df)
    y_mask = stack_annotators(y_mask)

    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :], axis=0)
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance built with its default arguments; it is
# called by AudioDataset.__getitem__ whenever the dataset has a transform set.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset returning ``(sample, target, weight)`` triples.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When
    ``transform`` is set, each fetched sample is augmented on the fly:
    min-max scaling, a random cyclic shift along axis 1, the supplied
    transform (called as ``transform(image=...)``), the module-level
    ``random_erasing``, and finally the inverse of the min-max scaling.
    Without a transform the stored sample is returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the (optionally augmented) sample, its target and its weight."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            value_range = sample.max() - this_min
            if value_range == 0:
                # A constant sample would otherwise be divided by zero and
                # turn into NaNs; a unit range keeps the scaling invertible.
                value_range = value_range + 1
            sample = (sample - this_min) / value_range

            # randomly cycle the file: position `shift` on axis 1 becomes position 0
            shift = np.random.randint(sample.shape[1])
            sample = torch.roll(sample, shifts=-shift, dims=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)['image']
            # restore a leading axis and swap the last two axes of the result
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Call ``layer.reset_parameters()`` when the object provides it.

    Objects without a ``reset_parameters`` attribute are skipped silently.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()


class Task5Model(nn.Module):
    """Single-channel classifier on top of MobileNetV2 features.

    A small 1x1-convolution stem turns the single input channel into three
    channels, the MobileNetV2 feature extractor processes them, the
    resulting maps are max-pooled over their last two axes, and a two-layer
    head produces ``num_classes`` outputs.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Stem: batch norm, then 1 -> 10 -> 3 channels via 1x1 convolutions.
        stem_layers = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*stem_layers)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Re-initialise every feature block whose index is <= until_x.
        for position, block in enumerate(self.mv2.features.children()):
            if position > until_x:
                break
            block.apply(weight_reset)

        # Head: 1280 pooled features -> 512 -> num_classes.
        head_layers = [
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head_layers)

    def forward(self, x):
        three_channel = self.bw2col(x)
        feature_maps = self.mv2.features(three_channel)
        # Max over the last axis, then over the (new) last axis.
        pooled = feature_maps.max(dim=-1).values.max(dim=-1).values
        return self.final(pooled)

In [4]:
# Load and prepare data
def _unknown_to_known_map(taxonomy_df):
    """Map each fine label with ``fine_id == 'X'`` to the list of fine labels
    with ``fine_id != 'X'`` that share its ``coarse`` value."""
    label_cols = ['fine', 'coarse']
    unknown_rows = taxonomy_df.loc[taxonomy_df.fine_id == 'X', label_cols]
    known_rows = taxonomy_df.loc[taxonomy_df.fine_id != 'X', label_cols]
    # Both sides carry a `fine` column, so the merge names them fine_x / fine_y.
    pairs = (unknown_rows
             .merge(known_rows, on='coarse', how='inner')
             .drop(columns='coarse'))
    return pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()


# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

unknown_to_known = _unknown_to_known_map(metadata['taxonomy_df'])
known_labels = (
    metadata['taxonomy_df'].loc[lambda t: t.fine_id != 'X', 'fine'].tolist())

# Coarse and fine annotation tables side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Every row whose values sum to 37 is overwritten with zeros, in both splits.
for _split_df in (train_df, valid_df):
    _row_sums_to_37 = _split_df.sum(axis=1) == 37
    _split_df.loc[_row_sums_to_37, :] = 0

In [6]:
# Turn each annotation table into (inputs, targets, loss masks); both splits
# share the same unknown-to-known label mapping.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean and standard deviation are computed per entry of the last axis, from
# the training split only, and the same statistics are applied to validation.
# NOTE: this cell overwrites train_X / valid_X, so re-running it without first
# re-running the prepare_data cell normalises already normalised data.
n_channels = train_X.shape[-1]  # size of the last axis instead of a hard-coded 128
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# The composed pipeline is handed to the training AudioDataset, which calls it
# as `transform(image=...)` and reads the result from the 'image' key.
# Steps run in order: shift/scale/rotate, grid distortion, tensor conversion.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset receives the augmentation pipeline.
train_dataset = AudioDataset(torch.Tensor(train_X),
                             torch.Tensor(train_y),
                             torch.Tensor(train_y_mask),
                             albumentations_transform)
# transform=None: validation samples are returned exactly as stored.
valid_dataset = AudioDataset(torch.Tensor(valid_X),
                             torch.Tensor(valid_y),
                             torch.Tensor(valid_y_mask),
                             None)

# All loaders use a batch size of 96.
val_loader = DataLoader(valid_dataset, 96, shuffle=False)
# Two independently shuffled loaders over the same training dataset; the
# training loop zips them to obtain the pairs of batches it mixes (mixup).
train_loader_1 = DataLoader(train_dataset, 96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, 96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook does not fail on machines without a usable GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 is the number of outputs of the final layer; presumably it equals the
# number of label columns left after prepare_data drops the unknown labels
# (TODO confirm). The model is moved to the selected device right away.
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch in the
# training loop; verbose=True makes it print a line whenever it lowers the
# learning rate.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them by the masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
# Training configuration
epochs = 100
mixup_alpha = 1               # mixup coefficients are drawn from Beta(alpha, alpha)
early_stopping_patience = 15  # epochs without a new best validation loss before stopping

train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def _masked_bce(outputs, labels, masks):
    """Masked BCE-with-logits loss averaged over the mask total.

    `labels` and `masks` carry one slice per annotation set on their last
    axis; each slice is scored against the same `outputs`, weighted
    element-wise by its mask, and the summed loss is divided by `masks.sum()`.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0.0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches with one
        # coefficient per sample, applied to inputs, labels and masks alike
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = _masked_bce(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no gradients, no parameter updates) ----
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            this_epoch_valid_loss += _masked_bce(outputs, labels, masks).item()

    # Average of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so that the losses of the final
    # epoch are shown as well.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss seen so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.652729423046112 0.6186943769454956
Epoch:  1


0.43506775617599486 0.32901317477226255
Epoch:  2


0.23023958683013915 0.18366555273532867
Epoch:  3


0.18786869585514068 0.18310054242610932
Epoch:  4


0.18116936028003694 0.17402316629886627
Epoch:  5


0.17807258546352386 0.1608409821987152
Epoch:  6


0.1746409112215042 0.17537788152694703
Epoch:  7


0.17587795376777648 0.17156463861465454
Epoch:  8


0.17248787939548493 0.15539284348487853
Epoch:  9


0.1728595584630966 0.15528996288776398
Epoch:  10


0.17147189378738403 0.16276011765003204
Epoch:  11


0.16955872178077697 0.15276997685432434
Epoch:  12


0.17048133313655853 0.1590659201145172
Epoch:  13


0.16732935428619386 0.1530845731496811
Epoch:  14


0.16777361452579498 0.1505075454711914
Epoch:  15


0.16728265583515167 0.14942848384380342
Epoch:  16


0.16607745945453645 0.16659216582775116
Epoch:  17


0.1654900574684143 0.14527694284915924
Epoch:  18


0.1637563419342041 0.1440887838602066
Epoch:  19


0.16401625990867616 0.14207125306129456
Epoch:  20


0.16314710855484008 0.14186557829380037
Epoch:  21


0.16201306462287904 0.13987959623336793
Epoch:  22


0.16241947948932647 0.1442185938358307
Epoch:  23


0.16044820427894593 0.1400041937828064
Epoch:  24


0.16081689178943634 0.13721344172954558
Epoch:  25


0.15878452956676484 0.13849293887615205
Epoch:  26


0.15808357238769533 0.13315010368824004
Epoch:  27


0.15952107250690462 0.13312846571207046
Epoch:  28


0.1587568861246109 0.1381450891494751
Epoch:  29


0.15803182125091553 0.13763150125741958
Epoch:  30


0.15780368208885193 0.13743472695350648
Epoch:  31


0.1578570795059204 0.1360652655363083
Epoch:  32


0.15679904460906982 0.13473487198352813
Epoch:  33


0.1577574747800827 0.1363224506378174
Epoch    33: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  34


0.15603339195251464 0.1306299388408661
Epoch:  35


0.15555070221424103 0.13018303960561753
Epoch:  36


0.1548407965898514 0.1301298901438713
Epoch:  37


0.15549458265304567 0.1298636719584465
Epoch:  38


0.15437068581581115 0.12915078699588775
Epoch:  39


0.15464693129062654 0.1300177738070488
Epoch:  40


0.15505526065826417 0.1297848790884018
Epoch:  41


0.1552920460700989 0.128690305352211
Epoch:  42


0.15405536115169524 0.12921164482831954
Epoch:  43


0.15429791867733 0.12958544194698335
Epoch:  44


0.15355090141296387 0.12889842242002486
Epoch:  45


0.15503796100616454 0.12856804877519606
Epoch:  46


0.15402627825737 0.12886681109666825
Epoch:  47


0.15314717292785646 0.12945015877485275
Epoch:  48


0.1538491278886795 0.12928377836942673
Epoch:  49


0.15460904002189635 0.12873610109090805
Epoch:  50


0.1538144451379776 0.12861351370811464
Epoch:  51


0.15394960582256317 0.12835785895586013
Epoch:  52


0.15426737666130066 0.1283322587609291
Epoch:  53


0.152054146528244 0.12845995128154755
Epoch:  54


0.152869131565094 0.12859708070755005
Epoch:  55


0.15498399317264558 0.12832912802696228
Epoch:  56


0.15231628835201264 0.12909689992666246
Epoch:  57


0.15459278285503386 0.12833368331193923
Epoch:  58


0.1529529458284378 0.12849097847938537
Epoch    58: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  59


0.15272847950458526 0.1278586968779564
Epoch:  60


0.1532367318868637 0.12769562900066375
Epoch:  61


0.152101309299469 0.12818762212991713
Epoch:  62


0.15318111300468445 0.12775572389364243
Epoch:  63


0.1533546668291092 0.12787362039089203
Epoch:  64


0.15407173812389374 0.12802648693323135
Epoch:  65


0.15389011144638062 0.1277984231710434
Epoch:  66


0.1524003839492798 0.1279390946030617
Epoch    66: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  67


0.15308966100215912 0.12809778302907943
Epoch:  68


0.15283734500408172 0.12825669944286347
Epoch:  69


0.15259030997753142 0.12803846895694732
Epoch:  70


0.15222761750221253 0.12821725010871887
Epoch:  71


0.15214621484279633 0.12799034118652344
Epoch:  72


0.15168376743793488 0.12802959084510804
Epoch    72: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  73


0.1525828045606613 0.1280543565750122
Epoch:  74


0.15234462201595306 0.12781853824853898
Epoch:  75
