In [1]:
# Parameters
# until_x: index into the MobileNetV2 feature blocks. Blocks with an index
# greater than this value are re-initialised in Task5Model, and the value is
# also embedded in the checkpoint filename written by the training loop.
until_x = 6


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 data_dir='../../data/logmelspec', n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. ``audio_filename`` must be available after
        ``reset_index()`` (i.e. it is the index or a column). All other
        columns are label columns.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be masked out whenever that unknown label is set (> 0.5).
        The unknown columns themselves are removed from the targets.
    data_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotations : int, optional
        Number of annotation slots (1..n_annotations) stacked per file.
        Every file needs at least this many rows in ``df``; extra rows
        are ignored.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target contributes to the loss,
        0 where it is masked out.
    """
    df = df.reset_index()
    # Number the annotations of every file 1, 2, 3, ... in row order.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start with every target enabled, then switch off the known labels that
    # are ambiguous because the matching unknown label was annotated.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that one `.loc` per slot returns the
    # rows of all files, sorted identically for every slot.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.concatenate(
        [df.loc[[k], :].values[..., np.newaxis] for k in slots], axis=2)
    y_mask = np.concatenate(
        [y_mask.loc[[k], :].values[..., np.newaxis] for k in slots], axis=2)

    # Load the arrays in the same file order as the rows of y / y_mask.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :], axis=0)
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared augmentation object (project-local RandomErasing with its default
# settings); AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (input, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs, indexed along the first axis. Each item is expected to be
        3-D (the augmentation path slices and concatenates along axis 1).
    y : torch.Tensor
        Targets, indexed along the first axis.
    weights : torch.Tensor
        Per-target weights / masks, indexed along the first axis.
    transform : callable, optional
        Called as ``transform(image=array)`` and must return a mapping with
        an ``'image'`` entry. When falsy, samples are returned unmodified.
    erasing : callable, optional
        Applied to the augmented tensor. Defaults to the module-level
        ``random_erasing`` object.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of X)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for ``idx``, augmented if enabled."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would otherwise be divided by zero and
                # come back as NaNs; a unit range keeps the round trip exact.
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            # NOTE(review): going through a PIL image presumably quantises
            # the values to 8 bits - confirm this loss of precision is fine.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the last two axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            eraser = self.erasing if self.erasing is not None else random_erasing
            sample = eraser(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load the pickled metadata bundle (taxonomy plus train/test label tables).
# NOTE: pickle.load can execute arbitrary code - only use trusted files.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# For every "unknown" fine label (fine_id == 'X'), collect the known fine
# labels that share its coarse class.
same_coarse_pairs = (
    unknown_rows
    .merge(known_rows, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = (
    same_coarse_pairs
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_rows.fine.tolist()

# Coarse and fine label tables side by side, one column per label.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point: any row whose label columns sum to 37
# (presumably every label set at once - confirm against the data) is zeroed.
for label_table in (train_df, valid_df):
    rows_to_clear = label_table.sum(axis=1) == 37
    label_table.loc[rows_to_clear, :] = 0

In [6]:
# Turn both label tables into arrays: inputs (X), targets (y) and the 0/1
# masks (y_mask) that exclude ambiguous targets from the loss.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed per position of the last axis, on the training set
# only, and then applied to both splits. The size of that axis is read from
# the data instead of being hard-coded.
# NOTE: this cell overwrites train_X / valid_X, so running it twice
# normalises the data twice - re-run the prepare_data cell first.
n_channels = train_X.shape[-1]
train_flat = train_X.reshape(-1, n_channels)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations, applied in this order:
# a small random shift / scale / rotation, a grid distortion, and finally
# the conversion to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=96, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to get the pairs of batches that are blended by mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it offers ``reset_parameters``.

    Objects without that attribute are left untouched, which makes the
    function safe to pass to ``Module.apply``.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        return
    reset()

In [12]:
class Task5Model(nn.Module):
    """Single-channel classifier built on a MobileNetV2 feature extractor.

    A small 1x1-convolution stem maps the single input channel to the three
    channels MobileNetV2 expects, the feature maps are max-pooled over both
    spatial axes, and a two-layer head produces ``num_classes`` logits.

    The MobileNetV2 feature blocks whose index is greater than the
    module-level ``until_x`` are re-initialised; earlier blocks keep their
    pretrained weights.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions only.
        stem_layers = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*stem_layers)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        head_layers = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head_layers)

        # Reset after ith layer of mv2
        for index, block in enumerate(self.mv2.features.children()):
            if index <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch of single-channel inputs."""
        features = self.mv2.features(self.bw2col(x))
        # Global max pooling: reduce the last axis, then the new last axis.
        pooled, _ = features.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 output logits - presumably one per column of the target arrays
# (TODO confirm against train_y.shape[1]); the model is moved to `device`.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(
    model.parameters(),
    lr=1e-3,
    amsgrad=True,
)
# The learning rate is lowered once the monitored value has stopped improving
# for more than `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=5,
    verbose=True,
)
# Element-wise losses (no reduction) so they can be masked before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1               # Beta(alpha, alpha) parameter of the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked binary cross-entropy averaged over all annotation slots.

    ``labels`` and ``masks`` carry one slice per annotation along their last
    axis. The element-wise loss of ``outputs`` against each slice is weighted
    by the matching mask slice, and the total is divided by the mask sum.
    """
    per_annotation = [
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2])
    ]
    return sum(per_annotation) / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training -------------------------------------------------------
    model.train()
    this_epoch_train_loss = 0.0
    for batch_1, batch_2 in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches with one random
        # weight per sample; inputs, labels and masks share the same weight.
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_1[0].shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.view(-1, 1, 1, 1)
        lam_targets = lam.view(-1, 1, 1)

        inputs = (lam_inputs * batch_1[0]) + ((1 - lam_inputs) * batch_2[0])
        labels = (lam_targets * batch_1[1]) + ((1 - lam_targets) * batch_2[1])
        masks = (lam_targets * batch_1[2]) + ((1 - lam_targets) * batch_2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation -----------------------------------------------------
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

Epoch:  0


0.6546969223022461 0.5429434895515441
Epoch:  1


0.4380479609966278 0.2692067205905914
Epoch:  2


0.2305450677871704 0.17772945761680603
Epoch:  3


0.1849898451566696 0.17393468618392943
Epoch:  4


0.1787290406227112 0.17194222807884216
Epoch:  5


0.1751071923971176 0.16200339496135713
Epoch:  6


0.17417440056800843 0.15835855007171631
Epoch:  7


0.17139106929302217 0.16592076122760774
Epoch:  8


0.1668330466747284 0.15481332838535308
Epoch:  9


0.1650484722852707 0.14528437852859497
Epoch:  10


0.16515258133411406 0.15725825130939483
Epoch:  11


0.16310327410697936 0.13788482248783113
Epoch:  12


0.16205766677856445 0.13940736651420593
Epoch:  13


0.16073071360588073 0.15510581731796264
Epoch:  14


0.158375905752182 0.13958703577518464
Epoch:  15


0.15878805935382842 0.13336258083581926
Epoch:  16


0.1572277081012726 0.13263095319271087
Epoch:  17


0.15609048902988434 0.13142000138759613
Epoch:  18


0.15643858134746552 0.7254285097122193
Epoch:  19


0.15597298622131348 0.13315895944833755
Epoch:  20


0.15523173689842223 0.1359232023358345
Epoch:  21


0.15275396287441254 0.13258257359266282
Epoch:  22


0.15380955815315248 0.13271262645721435
Epoch:  23


0.15441330194473266 0.1282898813486099
Epoch:  24


0.15453098475933075 0.1312625527381897
Epoch:  25


0.15234834730625152 0.13267262279987335
Epoch:  26


0.15222889244556426 0.12818315923213958
Epoch:  27


0.152465518116951 0.12861929684877396
Epoch:  28


0.15105248987674713 0.1276128262281418
Epoch:  29


0.15064342200756073 0.12887117117643357
Epoch:  30


0.15098638474941253 0.1328153669834137
Epoch:  31


0.1503644049167633 0.12671565860509873
Epoch:  32


0.1506226134300232 0.12994329929351806
Epoch:  33


0.15089769363403321 0.12792313545942308
Epoch:  34


0.1493677479028702 0.13000388741493224
Epoch:  35


0.14912025094032288 0.12749046683311463
Epoch:  36


0.14844168722629547 0.12964383512735367
Epoch:  37


0.1486545544862747 0.1270300954580307
Epoch    37: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  38


0.14881912112236023 0.12409998774528504
Epoch:  39


0.14831760346889497 0.12385695725679398
Epoch:  40


0.1469956785440445 0.12317483872175217
Epoch:  41


0.14684510171413423 0.12331106215715408
Epoch:  42


0.1474854475259781 0.1235330045223236
Epoch:  43


0.1469701772928238 0.12375390976667404
Epoch:  44


0.14678505599498748 0.12293843328952789
Epoch:  45


0.14765286207199096 0.12329855412244797
Epoch:  46


0.14609645247459413 0.12387829124927521
Epoch:  47


0.14656120419502258 0.12305444031953812
Epoch:  48


0.1458744901418686 0.12324201315641403
Epoch:  49


0.1452296245098114 0.12288409918546676
Epoch:  50


0.1463579398393631 0.12368960976600647
Epoch:  51


0.14624783277511597 0.1232190415263176
Epoch:  52


0.1467486000061035 0.12343939244747162
Epoch:  53


0.1445590353012085 0.1227129802107811
Epoch:  54


0.1458038020133972 0.12346587181091309
Epoch:  55


0.14678937673568726 0.12246824949979782
Epoch:  56


0.14553077578544615 0.12298447042703628
Epoch:  57


0.1441024672985077 0.12317532002925873
Epoch:  58


0.14503887355327605 0.12344416975975037
Epoch:  59


0.14562202572822572 0.12407019138336181
Epoch:  60


0.14464696526527404 0.12363508939743043
Epoch:  61


0.14477033495903016 0.12360049039125443
Epoch    61: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  62


0.14517346501350403 0.1235991284251213
Epoch:  63


0.14511717975139618 0.12347372770309448
Epoch:  64


0.14399152457714082 0.12344229817390442
Epoch:  65


0.1453671634197235 0.12353131622076034
Epoch:  66


0.1437605130672455 0.12323878407478332
Epoch:  67


0.1449260014295578 0.12323208302259445
Epoch    67: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  68


0.14547040283679963 0.12360966354608535
Epoch:  69


0.14524267971515656 0.12335657626390457
Epoch:  70


0.14432189762592315 0.12342500537633896
Epoch:  71


0.14534563601016998 0.12333279848098755
Epoch:  72


0.14490455090999604 0.12332227677106858
Epoch:  73


0.14424881100654602 0.12308039367198945
Epoch    73: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  74


0.1462669062614441 0.12297991812229156
Epoch:  75


0.14536213517189026 0.12340293526649475
Epoch:  76


0.1462078148126602 0.12334001660346985
Epoch:  77


0.14442908465862275 0.12339981645345688
Epoch:  78


0.14378913044929503 0.12340880483388901
Epoch:  79


0.14505087375640868 0.12359638512134552
Epoch    79: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  80
