In [1]:
# Parameters
# until_x: MobileNetV2 feature blocks whose index is <= until_x get their
# weights re-initialised inside Task5Model; with -1 no block is reset.
# The value is also embedded in the checkpoint filename written during training.
# NOTE(review): this cell looks like an injected parameters cell (e.g. papermill) - confirm.
until_x = -1


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Turn an annotation table into spectrogram, label and mask arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column (i.e. the index is named that way); all
        other columns are label columns.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known
        label columns it shadows. The unknown columns are removed from the
        labels and only used to build the mask.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per file.
    n_frames : int, optional
        Number of leading frames kept from every (transposed) spectrogram.
    n_annotations : int, optional
        Number of annotations per file that are stacked on the last axis.
        Every file needs at least this many rows, otherwise stacking fails
        with a shape mismatch; additional rows are ignored.

    Returns
    -------
    X : numpy.ndarray
        Spectrograms with a singleton channel axis, shape
        ``(n_files, 1, n_frames, n_bins)`` (assumes every stored spectrogram
        has at least ``n_frames`` frames).
    y : numpy.ndarray
        Labels, shape ``(n_files, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 everywhere except for the known labels listed
        in ``unknown_to_known`` on annotations whose unknown column is > 0.5,
        which are 0.
    """
    df = df.reset_index()
    # Number the annotations of every file 1, 2, 3, ... in row order.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" columns off; they only drive the mask.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that each annotation slice lists the
    # files in the same (sorted) order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in annotation_ids], axis=2)

    # File order is taken from the first annotation slice, matching y / y_mask.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (constructed with its defaults) that
# AudioDataset.__getitem__ applies to every augmented training sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrograms with labels and per-label weights.

    Parameters
    ----------
    X : torch.Tensor
        Samples, indexed along the first axis. The augmentation path assumes
        each ``X[idx]`` is a ``(1, time, freq)`` tensor, as produced by
        ``prepare_data``.
    y : torch.Tensor
        Labels, indexed along the first axis like ``X``.
    weights : torch.Tensor
        Per-label weights / masks, indexed along the first axis like ``X``.
    transform : callable, optional
        Albumentations-style callable invoked as ``transform(image=array)``
        and returning a dict with an ``'image'`` entry. When falsy, samples
        are returned untouched.
    erasing : callable, optional
        Callable applied to the augmented tensor after ``transform``. When
        ``None`` the module-level ``random_erasing`` instance is used.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        # NOTE(review): ToPILImage presumably quantises the [0, 1] floats to
        # 8-bit before the albumentations transforms run - confirm.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample
        when a transform was given."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation to [0, 1]
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would otherwise become 0 / 0 = NaN; with a
                # unit range it maps to zeros and is restored exactly below.
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file along the time axis (dim 1)
            shift = int(np.random.randint(sample.shape[1]))
            sample = torch.roll(sample, shifts=-shift, dims=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)['image']
            # restore the leading channel axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            eraser = self.erasing if self.erasing is not None else random_erasing
            sample = eraser(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Re-initialise ``layer`` in place when it offers ``reset_parameters``.

    Meant to be passed to ``nn.Module.apply``; objects without a
    ``reset_parameters`` attribute are left untouched.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()


class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    The network consists of
    * ``bw2col``: batch norm plus two 1x1 convolutions mapping 1 input
      channel to 3 channels,
    * ``mv2``: a torchvision MobileNetV2 created with ``pretrained=True``
      (only its ``features`` part is used in ``forward``),
    * ``final``: a 1280 -> 512 -> ``num_classes`` classifier head.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_until : int, optional
        Feature blocks of ``mv2`` whose index is <= ``reset_until`` get their
        parameters re-initialised via ``weight_reset``; a negative value
        resets nothing. When ``None`` the notebook-level ``until_x`` is used.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        if reset_until is None:
            # Backward-compatible default: the notebook's global parameter.
            reset_until = until_x

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        # NOTE(review): newer torchvision releases may prefer a ``weights=``
        # argument over ``pretrained=True`` - confirm for the installed version.
        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Reset the first feature blocks (index <= reset_until) of mv2
        for i, x in enumerate(self.mv2.features.children()):
            if i <= reset_until:
                x.apply(weight_reset)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

    def forward(self, x):
        """Return one logit per class for a batch of single-channel inputs."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # global max over the last two axes of the feature map
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [4]:
# Load and prepare data
# NOTE: unpickling can execute arbitrary code - only load metadata files from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the "unknown" fine labels (fine_id == 'X') and the known ones.
taxonomy = metadata['taxonomy_df']
unknown_fine = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# Pair every unknown label with the known labels sharing its coarse class;
# after the merge 'fine_x' is the unknown label and 'fine_y' a known one.
fine_pairs = pd.merge(unknown_fine, known_fine, on='coarse', how='inner').drop(columns='coarse')
unknown_to_known = fine_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_fine.fine.tolist()

# Coarse and fine annotations side by side, one frame per split.
train_parts = [metadata['coarse_train'], metadata['fine_train']]
valid_parts = [metadata['coarse_test'], metadata['fine_test']]
train_df = pd.concat(train_parts, axis=1, sort=True)
valid_df = pd.concat(valid_parts, axis=1, sort=True)

In [5]:
# manual correction for one data point
# Rows whose labels sum to 37 are zeroed out in both splits.
# NOTE(review): 37 is presumably the total number of label columns, i.e. a row
# with every label set - confirm.
for labels_df in (train_df, valid_df):
    all_set = labels_df.sum(axis=1) == 37
    labels_df.loc[all_set, :] = 0

In [6]:
# Build the model inputs for both splits. prepare_data returns the spectrograms
# with a singleton channel axis (X), the labels (y) and the loss masks (y_mask);
# y and y_mask carry the annotation index on their last axis.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean and standard deviation are computed per entry of the last axis over the
# training set only and then applied to both splits. The size of that axis is
# read from the data instead of being hard-coded.
n_bins = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_bins).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_bins).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# The pipeline is called as transform(image=array) in AudioDataset.__getitem__,
# which reads the result from the returned dict's 'image' entry.
albumentations_transform = Compose([
    # random shift / scale / rotation with small limits
    # NOTE(review): rotate_limit is presumably in degrees - confirm.
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library defaults
    GridDistortion(),
    # converts the augmented array to a tensor (the dataset calls .permute on it)
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
BATCH_SIZE = 96

# Only the training set is augmented; validation samples are served as stored.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain the pairs of batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 is the number of output logits (num_classes). The loss compares the model
# output with labels[:, :, k], so it has to equal the size of the labels' class
# axis - TODO confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch and lowers
# the learning rate when that loss stops improving (patience of 5 epochs; the
# logged run shows a tenfold reduction each time).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them with the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
EARLY_STOPPING_PATIENCE = 15  # epochs without a new best validation loss before stopping
MIXUP_ALPHA = 1               # both parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss averaged over the mask weight.

    ``outputs`` holds the logits with shape (batch, classes); ``labels`` and
    ``masks`` have shape (batch, classes, annotations). The element-wise loss
    of the logits against every annotation slice is weighted by the matching
    mask slice, summed, and divided by the sum of all mask entries.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches with one
        # coefficient per sample, applied alike to inputs, labels and masks
        mixup_vals = np.random.beta(MIXUP_ALPHA, MIXUP_ALPHA, batch_a[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * batch_a[0]) + ((1 - lam) * batch_b[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * batch_a[1]) + ((1 - lam) * batch_b[1])
        masks = (lam * batch_a[2]) + ((1 - lam) * batch_b[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            this_epoch_valid_loss += masked_bce_loss(outputs, labels, masks).item()

    # Epoch losses are plain averages of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if this_epoch_valid_loss < lowest_val_loss:
        # new best model: checkpoint it and restart the patience counter
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= EARLY_STOPPING_PATIENCE:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6505141520500183 0.5780719637870788
Epoch:  1


0.4317058289051056 0.25661020874977114
Epoch:  2


0.21851473569869995 0.19029346406459807
Epoch:  3


0.16901785075664522 0.14535107612609863
Epoch:  4


0.16123679876327515 0.13180022835731506
Epoch:  5


0.15713222980499267 0.14141184091567993
Epoch:  6


0.1547602868080139 0.1312655419111252
Epoch:  7


0.15406233429908753 0.1286025494337082
Epoch:  8


0.1545343267917633 0.13266034722328185
Epoch:  9


0.15187518298625946 0.12873151451349257
Epoch:  10


0.15306717693805694 0.12887340039014816
Epoch:  11


0.15179608285427093 0.12793720066547393
Epoch:  12


0.150515074133873 0.1258033901453018
Epoch:  13


0.14961596190929413 0.13056480288505554
Epoch:  14


0.15019604206085205 0.12734697312116622
Epoch:  15


0.15000910997390748 0.12962153404951096
Epoch:  16


0.14795434653759001 0.13056647777557373
Epoch:  17


0.1488344168663025 0.12692145109176636
Epoch:  18


0.14755199372768402 0.13833667188882828
Epoch    18: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  19


0.14671285033226014 0.12189095914363861
Epoch:  20


0.1455587190389633 0.12524933218955994
Epoch:  21


0.1445262736082077 0.12094395011663436
Epoch:  22


0.14410954415798188 0.12129385620355607
Epoch:  23


0.1452680903673172 0.12108628004789353
Epoch:  24


0.14476528644561767 0.12097795456647872
Epoch:  25


0.14340607941150665 0.12072633653879165
Epoch:  26


0.14393279135227202 0.12071176618337631
Epoch:  27


0.1429845440387726 0.12056261748075485
Epoch:  28


0.1429726505279541 0.12060788422822952
Epoch:  29


0.1442202365398407 0.12072889655828475
Epoch:  30


0.1422567892074585 0.12040254026651383
Epoch:  31


0.14361864686012268 0.12071815133094788
Epoch:  32


0.1427466481924057 0.12044679075479507
Epoch:  33


0.14134477972984313 0.12068436592817307
Epoch:  34


0.1435166960954666 0.12034250050783157
Epoch:  35


0.14315471172332764 0.12059348076581955
Epoch:  36


0.14290420174598695 0.12049671709537506
Epoch:  37


0.143383971452713 0.12077939361333848
Epoch:  38


0.14352553308010102 0.12128640115261077
Epoch:  39


0.14115701019763946 0.12180222570896149
Epoch:  40


0.1423477828502655 0.12050850838422775
Epoch    40: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  41


0.14253517150878905 0.12058030068874359
Epoch:  42


0.14251744151115417 0.12030737251043319
Epoch:  43


0.1418326413631439 0.1202689528465271
Epoch:  44


0.1408343768119812 0.12038237005472183
Epoch:  45


0.14273565530776977 0.12033914476633072
Epoch:  46


0.14119022965431213 0.12054902762174606
Epoch:  47


0.14107503592967988 0.12034578770399093
Epoch:  48


0.14131325840950013 0.12040034234523773
Epoch:  49


0.14117975711822509 0.120277638733387
Epoch    49: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  50


0.14087941408157348 0.12037410587072372
Epoch:  51


0.14149108827114104 0.12048338055610656
Epoch:  52


0.14167615056037902 0.12019693255424499
Epoch:  53


0.14103687822818756 0.12025974541902543
Epoch:  54


0.14249306976795195 0.12042811810970307
Epoch:  55


0.14104827463626862 0.12057583183050155
Epoch:  56


0.1413905429840088 0.12043429315090179
Epoch:  57


0.1411130464076996 0.1202351450920105
Epoch:  58


0.14051798343658448 0.12033349573612213
Epoch    58: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  59


0.14345487236976623 0.12034292221069336
Epoch:  60


0.1410598486661911 0.12034777253866195
Epoch:  61


0.14080902218818664 0.12048892378807068
Epoch:  62


0.14132136404514312 0.1204051822423935
Epoch:  63


0.1404259669780731 0.12045430541038513
Epoch:  64


0.1411165028810501 0.12053553313016892
Epoch    64: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  65


0.14047537326812745 0.12056059837341308
Epoch:  66


0.14015221774578093 0.12041523307561874
Epoch:  67
