In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. ``df.reset_index()`` must yield an
        ``audio_filename`` column, and every file must appear at least
        ``n_annotations`` times (only the first ``n_annotations`` rows of each
        file are used). All other columns are treated as label columns.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of label
        columns that must be masked out whenever that unknown column is
        above 0.5. The unknown columns themselves are dropped from the targets.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotations : int, optional
        Number of annotation sets per file stacked along the last axis.

    Returns
    -------
    X : numpy.ndarray, shape (n_files, 1, n_frames, n_features)
    y : numpy.ndarray, shape (n_files, n_labels, n_annotations)
    y_mask : numpy.ndarray, same shape as ``y``
        1 where the target should contribute to the loss, 0 where it is masked.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number each file's annotations 1..k so they can be selected by position.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" columns off; they only drive the mask.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so `.loc[[k]]` selects one annotation set
    # for all files, in sorted filename order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in annotation_ids], axis=2)

    # Files are loaded in the same order as the rows of annotation set 1.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (project-local class, built with its
# default arguments). AudioDataset.__getitem__ applies it as the last
# augmentation step when a transform is configured.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Indexable collection of (sample, target, weight) triples.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When a
    ``transform`` is given, each fetched sample is augmented on the fly (see
    ``_augment``); otherwise the stored sample is returned as is.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converter used to hand samples to the image-based transform.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def _augment(self, spec):
        """Return an augmented copy of ``spec`` on its original value scale."""
        # Min-max scale first; the offset and span are kept so the scaling
        # can be undone once all augmentations have run.
        lo = spec.min()
        hi = spec.max()
        span = hi - lo
        spec = (spec - lo) / span

        # Cyclically rotate along dim 1 by a random offset.
        offset = np.random.randint(spec.shape[1])
        spec = torch.roll(spec, shifts=-offset, dims=1)

        # The albumentations pipeline works on an image array and returns a
        # dict; its 'image' entry gets a leading axis and has its last two
        # axes swapped.
        image = np.array(self.pil(spec))
        augmented = self.transform(image=image)['image']
        spec = augmented[None, :, :].permute(0, 2, 1)

        # Random erasing (module-level instance) runs on a detached copy.
        spec = random_erasing(spec.clone().detach())

        # Undo the min-max scaling.
        return (spec * span) + lo

    def __getitem__(self, idx):
        sample = self.X[idx, ...]
        if self.transform:
            sample = self._augment(sample)
        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: unpickling can execute arbitrary code; only load a trusted metadata file.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
is_unknown_fine = taxonomy_df.fine_id == 'X'
is_known_fine = taxonomy_df.fine_id != 'X'

# Pair every fine label whose fine_id is 'X' with the other fine labels that
# share its coarse class, then collect those partners per 'X' label.
unknown_known_pairs = pd.merge(
    taxonomy_df.loc[is_unknown_fine, ['fine', 'coarse']],
    taxonomy_df.loc[is_known_fine, ['fine', 'coarse']],
    on='coarse', how='inner').drop(columns='coarse')
unknown_to_known = (
    unknown_known_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict())
known_labels = taxonomy_df.loc[is_known_fine].fine.tolist()

# Coarse and fine annotations side by side, one frame per split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Rows whose label columns sum to 37 are zeroed out in both splits.
# NOTE(review): 37 is presumably the total number of label columns (i.e. every
# label was ticked) - confirm against the annotation schema.
for annotations in (train_df, valid_df):
    label_sum_is_37 = annotations.sum(axis=1) == 37
    annotations.loc[label_sum_is_37, :] = 0

In [5]:
# Build inputs, targets and loss masks for each split with prepare_data:
# X is (n_files, 1, frames, features); y and y_mask are
# (n_files, n_labels, 3), one slice per annotation set.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Mean and standard deviation are computed per position of the last axis
# (128 channels) on the training split only, shaped to broadcast over the
# 4-D arrays, and then applied to both splits.
channel_means = np.mean(train_X.reshape(-1, 128), axis=0).reshape(1, 1, 1, -1)
channel_stds = np.std(train_X.reshape(-1, 128), axis=0).reshape(1, 1, 1, -1)

train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Applied in order: small random shift/scale/rotation, grid distortion, then
# conversion of the augmented image to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are served as stored.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain pairs of batches for mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is actually available and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
# Set `cuda = False` to force CPU execution.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` if it exposes ``reset_parameters``.

    Intended for use with ``Module.apply``: objects without a
    ``reset_parameters`` attribute are left untouched.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default value for the index of the last `mv2.features` child that
# Task5Model re-initialises; the "Parameters" cell below assigns the value
# that is actually used.
until_x = None

In [12]:
# Parameters
# Task5Model re-initialises the children of `mv2.features` with index
# 0..until_x (inclusive); the value is also part of the checkpoint file name.
until_x = 5


In [13]:
class Task5Model(nn.Module):
    """Single-channel input -> MobileNetV2 features -> linear classifier head.

    The input is first mapped from 1 to 3 channels with 1x1 convolutions, then
    passed through the feature extractor of a pretrained MobileNetV2, pooled
    with a global max over the last two axes, and classified by a small
    fully connected head that returns ``num_classes`` raw logits.

    The children of ``mv2.features`` with index ``<= until_x`` (module-level
    variable) are re-initialised after loading the pretrained weights.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions.
        channel_lift = [
            nn.BatchNorm2d(num_features=1),
            nn.Conv2d(in_channels=1, out_channels=10, kernel_size=1, padding=0),
            nn.ReLU(),
            nn.Conv2d(in_channels=10, out_channels=3, kernel_size=1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*channel_lift)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Head on top of the 1280 pooled feature channels.
        head = [
            nn.Linear(in_features=1280, out_features=512),
            nn.ReLU(),
            nn.BatchNorm1d(num_features=512),
            nn.Linear(in_features=512, out_features=num_classes),
        ]
        self.final = nn.Sequential(*head)

        # Reset until ith layer of mv2
        for block_idx, block in enumerate(self.mv2.features.children()):
            if block_idx <= until_x:
                block.apply(weight_reset)

    def forward(self, x):
        three_channel = self.bw2col(x)
        feature_maps = self.mv2.features(three_channel)
        # Global max pooling: reduce the last axis, then the new last axis.
        pooled, _ = feature_maps.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [14]:
# Instantiate the model
# 31 output logits. NOTE(review): presumably the number of label columns left
# after prepare_data drops the "unknown" ones - confirm against train_y.shape[1].
model = Task5Model(num_classes=31)
model = model.to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# The loss is kept per element (reduction='none') so the training loop can
# apply its masks before reducing.
criterion = nn.BCEWithLogitsLoss(reduction='none')
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)

In [16]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss before stopping
mixup_alpha = 1               # both parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked loss averaged over all annotation sets.

    ``labels`` and ``masks`` hold one slice per annotation set along their
    last axis. ``outputs`` is scored against every slice with the module-level
    ``criterion`` (element-wise loss), each element is weighted by its mask,
    and the total is divided by the sum of all mask values.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup: one mixing coefficient per sample, shared by the inputs,
        # the labels and the masks of that sample.
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam * batch_a[0]) + ((1 - lam) * batch_b[0])

        lam = lam.reshape(-1, 1, 1)
        labels = (lam * batch_a[1]) + ((1 - lam) * batch_b[1])
        masks = (lam * batch_a[2]) + ((1 - lam) * batch_b[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.enable_grad():
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation (no gradients, no optimizer interaction) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Per-epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the final epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6345631303013982 0.5315125840050834
Epoch:  1


0.32733406650053487 0.2124876699277333
Epoch:  2


0.18621274587270376 0.4840893064226423
Epoch:  3


0.17417371796595083 0.1599210947751999
Epoch:  4


0.17151461863839948 0.15787623822689056
Epoch:  5


0.16846631870076462 0.17147275592599595
Epoch:  6


0.16621673993162206 0.15155618212052754
Epoch:  7


0.16516435307425423 0.14096047197069442
Epoch:  8


0.16346814745181315 0.14002491853066854
Epoch:  9


0.16231034293368057 0.14342153923852102
Epoch:  10


0.16220776132635167 0.14649306876318796
Epoch:  11


0.161982471072996 0.14100436972720282
Epoch:  12


0.16028273468082016 0.14283383744103567
Epoch:  13


0.15969991039585424 0.13568092989070074
Epoch:  14


0.15947646466461388 0.14105620022330964
Epoch:  15


0.15880204898280068 0.13554054924419948
Epoch:  16


0.1576028478306693 0.13747504353523254
Epoch:  17


0.1569845036074922 0.13205956667661667
Epoch:  18


0.15496273459614934 0.1393644937447139
Epoch:  19


0.15594649798161275 0.13306126211370742
Epoch:  20


0.15476320442315694 0.13075467305524008
Epoch:  21


0.15490228863986763 0.1334552445581981
Epoch:  22


0.15433199341232712 0.13355084189346858
Epoch:  23


0.15443858866756027 0.1347269607441766
Epoch:  24


0.15519261400441867 0.13714748195239476
Epoch:  25


0.15364599469545726 0.13149333319493703
Epoch:  26


0.15321670271254875 0.1301740227001054
Epoch:  27


0.15282768212460182 0.13839212485722133
Epoch:  28


0.15320474151018504 0.13191183443580354
Epoch:  29


0.1529834274504636 0.1371610930987767
Epoch:  30


0.15169869041120684 0.12941382080316544
Epoch:  31


0.15297153793476723 0.1331565667475973
Epoch:  32


0.1516246384865529 0.13235472674880708
Epoch:  33


0.15207288998204307 0.13401012761252268
Epoch:  34


0.1516553376977508 0.12963067420891353
Epoch:  35


0.15145439110897682 0.13068825432232448
Epoch:  36


0.15102320990046939 0.12686843957219804
Epoch:  37


0.15035977959632874 0.12931122524397715
Epoch:  38


0.15024400965587512 0.12970855087041855
Epoch:  39


0.1516926973252683 0.12947635459048407
Epoch:  40


0.1505119804594968 0.1256193933742387
Epoch:  41


0.14883877578619364 0.12707893231085368
Epoch:  42


0.14913769146880587 0.12758255217756545
Epoch:  43


0.14769927029674118 0.12713554607970373
Epoch:  44


0.15024478693266172 0.130179192338671
Epoch:  45


0.14782695512513858 0.12639129694019044
Epoch:  46


0.14849608935214378 0.12777221841471537
Epoch    46: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  47


0.1474649737010131 0.12265141414744514
Epoch:  48


0.14622929047893835 0.12269890095506396
Epoch:  49


0.14574500717021324 0.12233380334717887
Epoch:  50


0.1451484021302816 0.12259733251162938
Epoch:  51


0.14610337526411624 0.12280837020703725
Epoch:  52


0.1446223262999509 0.1224774420261383
Epoch:  53


0.1453234674157323 0.12240820910249438
Epoch:  54


0.14568235866121343 0.12283739128283092
Epoch:  55


0.14539761196922613 0.12299864206995283
Epoch    55: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  56


0.1441621832750939 0.12250576274735588
Epoch:  57


0.14484874461148237 0.12273131204502923
Epoch:  58


0.1445946371233141 0.1230208660875048
Epoch:  59


0.1440852477743819 0.12247460974114281
Epoch:  60


0.14526856308047836 0.12252699690205711
Epoch:  61


0.14413979085716042 0.12260829657316208
Epoch    61: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  62


0.1448345764263256 0.12256809643336705
Epoch:  63


0.14406362256488284 0.12266024202108383
Epoch:  64


0.1449650671031024 0.12284397866044726
Epoch:  65


0.14362941400424853 0.12264732590743474
Epoch:  66


0.14451767061207746 0.12251724834953036
Epoch:  67


0.14362139758226034 0.12261206550257546
Epoch    67: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  68


0.14377270719489535 0.12259571786437716
Epoch:  69


0.14358492356699867 0.12259795836039952
Epoch:  70


0.1459782973334596 0.12260797300509044
Epoch:  71


0.14412062313105609 0.12263019595827375
Epoch:  72


0.14415759814752116 0.12289888518197196
Epoch:  73


0.1444703445241258 0.1228802353143692
Epoch    73: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  74
