In [1]:
# Parameters
# until_x: Task5Model re-initialises every child of the MobileNetV2 `features`
# module whose index is <= until_x; the value is also embedded in the
# checkpoint filename written by the training loop.
until_x = 4


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 num_annotations=3, max_frames=635):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. ``df.reset_index()`` must yield an
        ``audio_filename`` column; every other column is a label column.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known label
        columns that must be masked whenever that unknown column is > 0.5.
        The unknown columns themselves are removed from the targets.
    data_dir : str
        Directory holding one ``<audio_filename>.npy`` array per file.
    num_annotations : int
        Number of annotations taken per file (the first ones in row order).
    max_frames : int
        Upper bound on the length of the first axis of each transposed array.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, frames, bins)``: transposed, truncated arrays.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, num_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the target is masked, 1 elsewhere.

    Raises
    ------
    ValueError
        If any file has fewer than ``num_annotations`` annotation rows.
    """
    df = df.reset_index()
    # Number the annotations of every file 1, 2, ... in row order.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()

    # Fail early with a readable message instead of a shape mismatch when the
    # per-annotation slices are stacked further down.
    annotations_per_file = df.groupby('audio_filename')['slno'].max()
    incomplete = annotations_per_file[annotations_per_file < num_annotations]
    if len(incomplete) > 0:
        raise ValueError(
            'Expected at least {} annotations per file, but these files have '
            'fewer: {}'.format(num_annotations, incomplete.index.tolist()))

    df = df.set_index(['audio_filename', 'slno'])

    unknown_labels = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_labels].copy()
    df = df.drop(columns=unknown_labels)

    # Start from an all-ones mask and zero out the known labels that an
    # "unknown" vote makes unreliable for that particular annotation.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that .loc[[k]] selects annotation k
    # of every file, with the files in the same sorted order for each k.
    df = df.swaplevel(i=1, j=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0).sort_index()

    annotation_ids = range(1, num_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack(
        [mask_df.loc[[k], :].values for k in annotation_ids], axis=2)

    # One array per file, in the same file order as the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(data_dir, name)).T[:max_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (project-local class, constructed with its
# default arguments). AudioDataset.__getitem__ calls it on each sample when a
# transform is configured.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When a
    ``transform`` is supplied, every sample goes through an augmentation
    chain before being returned; without one the stored sample is returned
    untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.transform = transform
        self.X, self.y, self.weights = X, y, weights
        # Converter used to hand samples to the image-based transform.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        features = self.X[idx, ...]
        target = self.y[idx, ...]
        weight = self.weights[idx, ...]

        # No augmentation configured: return the stored sample as is.
        if not self.transform:
            return features, target, weight

        # Rescale to [0, 1]; the bounds are kept so the scaling can be undone.
        low = features.min()
        high = features.max()
        scaled = (features - low) / (high - low)

        # Rotate the sample along axis 1 around a random cut point.
        cut = np.random.randint(scaled.shape[1])
        tail, head = scaled[:, cut:, :], scaled[:, :cut, :]
        cycled = torch.cat([tail, head], dim=1)

        # Run the albumentations pipeline on an array built from a PIL image.
        as_image = np.array(self.pil(cycled))
        augmented = self.transform(image=as_image)['image']
        # Add a leading axis and swap the last two axes.
        augmented = augmented[None, :, :].permute(0, 2, 1)

        # Random erasing works on a detached copy.
        erased = random_erasing(augmented.clone().detach())

        # Undo the min-max scaling.
        restored = (erased * (high - low)) + low

        return restored, target, weight


def weight_reset(layer):
    """Re-initialise ``layer`` if it exposes ``reset_parameters``.

    Objects without that method are left untouched, which makes the function
    suitable as a callback for ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()


class Task5Model(nn.Module):
    """Single-channel front end, MobileNetV2 features and a linear head.

    * ``bw2col``: batch norm followed by two 1x1 convolutions that turn the
      1-channel input into 3 channels.
    * ``mv2``: torchvision MobileNetV2 loaded with pretrained weights; only
      its ``features`` part is used in ``forward``. Children of ``features``
      with index <= the module-level ``until_x`` are re-initialised.
    * ``final``: 1280 -> 512 -> ``num_classes`` head; no activation is applied
      to its output.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels with pointwise convolutions.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Discard the pretrained weights of the first feature blocks
        # (indices 0..until_x inclusive).
        for index, block in enumerate(self.mv2.features.children()):
            if index <= until_x:
                block.apply(weight_reset)

        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        three_channel = self.bw2col(x)
        feature_maps = self.mv2.features(three_channel)
        # Global max pooling: reduce the last axis, then the new last axis.
        reduced_last, _ = feature_maps.max(dim=-1)
        pooled, _ = reduced_last.max(dim=-1)
        return self.final(pooled)

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open a metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map every "unknown" fine label (taxonomy rows with fine_id == 'X') to the
# list of known fine labels (fine_id != 'X') sharing the same coarse value.
# After the merge the two `fine` columns carry pandas' default suffixes:
# fine_x comes from the unknown rows, fine_y from the known rows.
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# Fine labels of every taxonomy row whose fine_id is not 'X'.
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Put the coarse and fine annotation columns side by side for each split.
# NOTE(review): the "test" tables are used as the validation split here.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point:
# any row whose columns add up to exactly 37 is overwritten with zeros
train_df.loc[train_df.sum(axis=1).eq(37), :] = 0
valid_df.loc[valid_df.sum(axis=1).eq(37), :] = 0

In [6]:
# Build inputs (X), targets (y) and loss masks (y_mask) for both splits.
# prepare_data loads one .npy file per recording from disk.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean and standard deviation are computed per entry of the last axis, on the
# training data only, and the same statistics are applied to the validation
# data. The channel count is read from the array instead of being hard-coded,
# so the statistics always line up with the last axis.
channel_means = train_X.reshape(-1, train_X.shape[-1]).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, train_X.shape[-1]).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# The pipeline is passed to the training AudioDataset, whose __getitem__
# calls it as transform(image=...) and reads the 'image' entry of the result.
# Order: small random shift/scale/rotate, then grid distortion, then
# conversion of the result with ToTensor.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are served raw.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=96, shuffle=False)
# Two independently shuffled views of the same training set; the training
# loop zips them to obtain the batch pairs that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without a usable GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 is the size of the model's output layer.
# NOTE(review): presumably this equals the number of label columns kept by
# prepare_data (the second axis of train_y) - confirm.
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The training loop steps the scheduler once per epoch with the validation loss.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
# Stop once the validation loss has not improved for this many epochs.
early_stopping_patience = 15
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def _masked_annotation_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over all annotation sets.

    ``labels`` and ``masks`` are sliced as ``[:, :, k]`` for annotation set
    ``k``; ``criterion`` must return the unreduced, element-wise loss. The
    summed masked loss is divided by the sum of the mask.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ----
    # Validation switches the model to eval mode, so re-enable train mode
    # once per epoch rather than once per batch.
    model.train()
    this_epoch_train_loss = 0
    for (x1, y1, m1), (x2, y2, m2) in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One Beta(alpha, alpha) coefficient per sample blends the two
        # independently shuffled batches; labels and masks are blended with
        # the same coefficients.
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, x1.shape[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam * x1) + ((1 - lam) * x2)

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1))
        labels = (lam * y1) + ((1 - lam) * y2)
        masks = (lam * m1) + ((1 - lam) * m2)
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = _masked_annotation_loss(criterion, outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_annotation_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Keep the weights of the best epoch seen so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Report before the early-stopping check so the final epoch's losses are
    # printed as well.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

Epoch:  0


0.6561709189414978 0.5652978777885437
Epoch:  1


0.4425605607032776 0.32147350907325745
Epoch:  2


0.23016785562038422 0.19681670367717743
Epoch:  3


0.18265182077884673 0.18525252640247344
Epoch:  4


0.17245682179927826 0.16895240545272827
Epoch:  5


0.16952572882175446 0.1680770307779312
Epoch:  6


0.16626861989498137 0.14809494018554686
Epoch:  7


0.16606248080730437 0.14132491648197174
Epoch:  8


0.16256963789463044 0.13880274593830108
Epoch:  9


0.16190409839153289 0.14838855266571044
Epoch:  10


0.16017556071281433 0.13849081397056578
Epoch:  11


0.16106611788272857 0.14322638809680938
Epoch:  12


0.1602140599489212 0.13705094158649445
Epoch:  13


0.15820330560207366 0.13878402411937713
Epoch:  14


0.15793487608432769 0.13231619745492934
Epoch:  15


0.15659377336502076 0.13723287284374236
Epoch:  16


0.15712804794311525 0.13738525807857513
Epoch:  17


0.15773131132125853 0.13376250863075256
Epoch:  18


0.15786137580871581 0.13543182909488677
Epoch:  19


0.15581760585308074 0.1306471198797226
Epoch:  20


0.1554490351676941 0.14001134037971497
Epoch:  21


0.15491968929767608 0.14155074954032898
Epoch:  22


0.1538233357667923 0.133688722550869
Epoch:  23


0.1531471937894821 0.13531767576932907
Epoch:  24


0.15417197942733765 0.1310421645641327
Epoch:  25


0.15454228341579437 0.12996965199708937
Epoch:  26


0.1516736215353012 0.1314457267522812
Epoch:  27


0.15409523606300354 0.12757623493671416
Epoch:  28


0.15297279477119446 0.1350978285074234
Epoch:  29


0.15251529037952424 0.1312161535024643
Epoch:  30


0.15230294108390807 0.13069413602352142
Epoch:  31


0.15159531831741332 0.12791766226291656
Epoch:  32


0.15208669483661652 0.1298660844564438
Epoch:  33


0.15203719556331635 0.1301568865776062
Epoch    33: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  34


0.15099605917930603 0.12503244429826738
Epoch:  35


0.1494014483690262 0.1247756078839302
Epoch:  36


0.14984629273414612 0.12417417019605637
Epoch:  37


0.14903632164001465 0.12422904074192047
Epoch:  38


0.14933284103870392 0.12352010905742646
Epoch:  39


0.14932482779026032 0.12372656464576721
Epoch:  40


0.14740747094154358 0.12292950749397277
Epoch:  41


0.14998725712299346 0.12262318581342697
Epoch:  42


0.14812156975269317 0.12279855161905288
Epoch:  43


0.14713667392730712 0.12366430461406708
Epoch:  44


0.1477124148607254 0.12283830344676971
Epoch:  45


0.14781255006790162 0.12250621765851974
Epoch:  46


0.14741992115974425 0.12304098010063172
Epoch:  47


0.14651417672634126 0.12264935523271561
Epoch:  48


0.14643796443939208 0.12251679599285126
Epoch:  49


0.1469496977329254 0.1221523031592369
Epoch:  50


0.14716118395328523 0.12243641018867493
Epoch:  51


0.1465534108877182 0.12171696573495865
Epoch:  52


0.14599897861480712 0.12205960005521774
Epoch:  53


0.14620178163051606 0.12191026210784912
Epoch:  54


0.1464245516061783 0.12197278290987015
Epoch:  55


0.1468757200241089 0.1224783942103386
Epoch:  56


0.14679713249206544 0.1216326355934143
Epoch:  57


0.1456258249282837 0.12191665470600128
Epoch:  58


0.1467207545042038 0.12192274034023284
Epoch:  59


0.14755456745624543 0.12191238105297089
Epoch:  60


0.14650823116302492 0.12173914015293122
Epoch:  61


0.14646265685558318 0.1216474324464798
Epoch:  62


0.14607618629932403 0.12185602933168412
Epoch    62: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  63


0.14521075129508973 0.12165096253156663
Epoch:  64


0.14655503273010254 0.1215953916311264
Epoch:  65


0.1456150805950165 0.12142010927200317
Epoch:  66


0.1465175223350525 0.12182604968547821
Epoch:  67


0.14552021503448487 0.12180588692426682
Epoch:  68


0.14535520792007448 0.12155928760766983
Epoch:  69


0.1449657189846039 0.12150925993919373
Epoch:  70


0.14551995277404786 0.12168506234884262
Epoch:  71


0.1455095511674881 0.12168993651866913
Epoch    71: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  72


0.1453443855047226 0.12161224782466888
Epoch:  73


0.14471224665641785 0.12156912088394164
Epoch:  74


0.14642785549163817 0.1215702548623085
Epoch:  75


0.14490714132785798 0.12147655189037324
Epoch:  76


0.14496877133846284 0.12157750427722931
Epoch:  77


0.14446451306343078 0.12157191783189773
Epoch    77: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  78


0.14524702847003937 0.12162799388170242
Epoch:  79


0.14593541979789734 0.12175955027341842
Epoch:  80
