In [1]:
# Parameters
# until_x: index into the children of the MobileNetV2 `features` stack.
# Task5Model keeps the pretrained weights of children 0..until_x and
# re-initialises every child with a larger index. The value is also embedded
# in the checkpoint file name written by the training loop.
until_x = 9


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (clip, annotation). After ``reset_index()`` it must expose
        an ``audio_filename`` column (i.e. the index is expected to be named
        ``audio_filename``). The remaining columns are label columns and must
        include every key of ``unknown_to_known``. The caller's frame is not
        modified.
    unknown_to_known : dict
        Maps an "unknown" label column to the list of known label columns that
        must be masked out whenever that unknown column is > 0.5.
    data_dir : str
        Directory holding one ``<audio_filename>.npy`` spectrogram per clip.
    n_frames : int
        Number of leading rows kept from each transposed spectrogram.
    n_annotations : int
        Number of annotations per clip stacked on the last axis; annotations
        beyond this count are ignored.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(clips, 1, n_frames, bins)`` where ``bins`` is the first
        dimension of the stored arrays (assumed to be 2-D — TODO confirm).
    y : numpy.ndarray
        Shape ``(clips, labels, n_annotations)``; known-label values.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label contributes to the loss and 0
        where the matching unknown label was set.

    Raises
    ------
    ValueError
        If any clip has fewer than ``n_annotations`` annotations.
    """
    df = df.reset_index()
    # running annotation number (1, 2, 3, ...) within each clip, in row order
    df['slno'] = df.groupby('audio_filename').cumcount() + 1

    # Fail early with a readable message; otherwise the stacking below dies
    # with an opaque shape-mismatch error.
    annotations_per_file = df.groupby('audio_filename')['slno'].max()
    too_few = annotations_per_file[annotations_per_file < n_annotations]
    if len(too_few) > 0:
        raise ValueError(
            'expected at least {} annotations per file, but these have fewer: {}'
            .format(n_annotations, too_few.index.tolist()))

    df = df.set_index(['audio_filename', 'slno'])

    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Mask starts as all ones (same dtype as the labels) and is zeroed for the
    # known labels covered by an unknown label that the annotator set.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so each annotation slot can be selected
    # as one block whose rows are sorted by file name.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    # Files in the same (sorted) order as the rows of y / y_mask.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance built with its default arguments.
# AudioDataset.__getitem__ looks this name up as a global and applies it to
# every augmented sample, so it must be defined before the dataset is indexed.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, labels, weights)`` triples.

    Parameters
    ----------
    X : torch.Tensor
        Inputs; indexed along dim 0. In this notebook each item is a
        ``(1, frames, bins)`` spectrogram.
    y : torch.Tensor
        Targets, indexed along dim 0 in step with ``X``.
    weights : torch.Tensor
        Per-target loss weights / masks, indexed along dim 0 in step with ``X``.
    transform : callable or None
        Albumentations-style callable invoked as ``transform(image=array)`` and
        expected to return a mapping with an ``'image'`` tensor. When falsy,
        samples are returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # tensor -> PIL converter used to hand samples to the image transforms
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of dim 0 of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample
        when a transform is configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation to [0, 1]
            this_min = sample.min()
            value_range = sample.max() - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole sample into NaN. Divide by 1 instead (the scaled sample is
            # then all zeros) and keep the true range for the inverse mapping.
            denominator = value_range if value_range > 0 else 1.0
            sample = (sample - this_min) / denominator

            # randomly cycle the file: rotate along dim 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on a numpy image)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add the leading channel dim back and swap the last two dims
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing on a detached copy
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only point this at a trusted file.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
unknown_rows = taxonomy_df.fine_id == 'X'
known_rows = taxonomy_df.fine_id != 'X'

# Pair every "X" fine label with the other fine labels of the same coarse
# class; the merge suffixes the two `fine` columns as fine_x / fine_y.
fine_pairs = pd.merge(
    taxonomy_df.loc[unknown_rows, ['fine', 'coarse']],
    taxonomy_df.loc[known_rows, ['fine', 'coarse']],
    on='coarse',
    how='inner',
).drop(columns='coarse')

# {unknown fine label: [known fine labels sharing its coarse class]}
unknown_to_known = {
    unknown_label: list(known_group)
    for unknown_label, known_group in fine_pairs.groupby('fine_x')['fine_y']
}
known_labels = taxonomy_df.loc[known_rows].fine.tolist()

# Coarse and fine annotation columns side by side, for each split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Rows whose label columns sum to exactly 37 are zeroed out in place.
# (37 is presumably the total number of label columns, i.e. "everything
# ticked" — TODO confirm.)
for frame in (train_df, valid_df):
    row_totals = frame.sum(axis=1)
    frame.loc[row_totals == 37, :] = 0

In [6]:
# Turn both annotation tables into arrays. For each split:
#   *_X      spectrograms with a singleton channel axis, cropped to 635 frames
#   *_y      label values, one slice per annotation on the last axis (3 slices)
#   *_y_mask same shape as *_y; 0 where an "unknown" label hides a known label
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics come from the training split only and are applied to both
# splits. The last axis (128 entries) is treated as the channel axis.
# NOTE: this cell overwrites train_X / valid_X, so run it once per kernel.
train_flat = train_X.reshape(-1, 128)
broadcast_shape = (1, 1, 1, -1)
channel_means = train_flat.mean(axis=0).reshape(broadcast_shape)
channel_stds = train_flat.std(axis=0).reshape(broadcast_shape)
train_X, valid_X = [
    (split - channel_means) / channel_stds
    for split in (train_X, valid_X)
]

In [8]:
# Define the data augmentation transformations
# Applied by AudioDataset to the training samples only (the validation dataset
# is built with transform=None). The pipeline runs in order:
#   1. a small random shift / scale / rotation (limits 0.1, 0.1 and 0.5),
#   2. a random grid distortion with the library defaults,
#   3. conversion of the result to a torch tensor.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training data: the training
# loop zips them to obtain the pairs of batches that mixup blends together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook still runs on a machine without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` if it knows how to.

    Calls ``layer.reset_parameters()`` when the attribute exists and does
    nothing otherwise, which makes it suitable for ``nn.Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Single-channel classifier built on a MobileNetV2 feature extractor.

    Pipeline: ``bw2col`` (1 -> 10 -> 3 channels with 1x1 convolutions) ->
    ``mv2.features`` -> max over the last two dims -> ``final`` (two linear
    layers producing ``num_classes`` logits).

    The MobileNetV2 is loaded with pretrained weights; every child of
    ``mv2.features`` whose index exceeds the module-level ``until_x`` is then
    re-initialised through ``weight_reset``.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Maps the 1-channel input to the 3 channels the backbone consumes.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classification head on the pooled backbone features.
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Keep pretrained weights up to and including child `until_x`;
        # reset everything after it.
        for position, feature_block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            feature_block.apply(weight_reset)

    def forward(self, x):
        """Return the ``num_classes`` logits for a batch of 1-channel inputs."""
        feature_maps = self.mv2.features(self.bw2col(x))
        # global max pooling: reduce the last dim, then the new last dim
        reduced_last, _ = feature_maps.max(dim=-1)
        pooled, _ = reduced_last.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 output logits — presumably one per label column left after prepare_data
# drops the "unknown" columns (TODO confirm against train_y.shape[1]).
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over every model parameter.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Lowers the learning rate once the value passed to scheduler.step() (the
# validation loss in the training loop) has not improved for `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps one loss value per element so the training loop can
# multiply by the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
# Train with mixup, validate after every epoch, checkpoint the best model and
# stop early once the validation loss has stopped improving.
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1               # Beta(alpha, alpha) parameter of the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss averaged over the unmasked entries.

    `labels` and `masks` carry one slice per set of annotations on their last
    axis. The same `outputs` are scored against every slice, each element-wise
    loss is multiplied by its mask, and the total is divided by `masks.sum()`.
    Uses the module-level `criterion` (built with reduction='none').
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches with one weight per sample
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            this_epoch_valid_loss += masked_bce_loss(outputs, labels, masks).item()

    # mean of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Log before the early-stopping check so the final epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if this_epoch_valid_loss < lowest_val_loss:
        # new best: checkpoint the weights and restart the patience counter
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6098821259833671 0.5864378469330924
Epoch:  1


0.2865745848095095 0.5476272191320147
Epoch:  2


0.19451897732309392 0.6956147466387067
Epoch:  3


0.18963136221911456 0.19725590092795237
Epoch:  4


0.18792380956379143 0.18828458658286504
Epoch:  5


0.18878167787113706 0.19395544060638972
Epoch:  6


0.18730768440542994 0.22787788936070033
Epoch:  7


0.18781043589115143 0.5332779799188886
Epoch:  8


0.18671800196170807 4.188116277967181
Epoch:  9


0.18906323410369255 0.21983865116323745
Epoch:  10


0.18670716599838152 0.18155422593866075
Epoch:  11


0.18802563966931524 0.917450317314693
Epoch:  12


0.18776173607723132 0.22437039017677307
Epoch:  13


0.18791427845890457 0.3209562727383205
Epoch:  14


0.1875117932622497 0.439052871295384
Epoch:  15


0.18722479448125168 0.6728280101503644
Epoch:  16


0.18728448611658974 0.25697274080344606
Epoch    16: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  17


0.18717963510268443 0.230856254696846
Epoch:  18


0.1872986208748173 0.22283131522791727
Epoch:  19


0.18699364404420596 0.17562971157687052
Epoch:  20


0.1880741574474283 0.19019028970173427
Epoch:  21


0.18786454522931897 0.36905751483780996
Epoch:  22


0.18735062471918157 0.3380379463945116
Epoch:  23


0.18621454206672874 0.21220300240176065
Epoch:  24


0.18737223543025353 0.18367053142615727
Epoch:  25


0.18670580838177656 0.20999616597379958
Epoch    25: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  26


0.1872528179271801 0.2027991520506995
Epoch:  27


0.1873599774128682 0.17855413683823176
Epoch:  28


0.18701757812822187 0.18593851583344595
Epoch:  29


0.18796557671314962 0.17474875066961562
Epoch:  30


0.18705393937793938 0.17967385053634644
Epoch:  31


0.18750041562157707 0.18374543956347875
Epoch:  32


0.1874797481137353 0.18894086565290177
Epoch:  33


0.18653121187880234 0.17604010019983565
Epoch:  34


0.18750535273874128 0.2067102449280875
Epoch:  35


0.18759132116227537 0.17472397429602488
Epoch:  36


0.18652352650423307 0.1753623911312648
Epoch:  37


0.18736633255675034 0.17620236320155008
Epoch:  38


0.18654138212268417 0.1733697865690504
Epoch:  39


0.18532665073871613 0.1801504875932421
Epoch:  40


0.18825080145049738 0.1764869966677257
Epoch:  41


0.18761768735743858 0.28158140182495117
Epoch:  42


0.1863875272306236 0.17905832614217485
Epoch:  43


0.18756619699903437 0.2353243295635496
Epoch:  44


0.1866329662703179 0.17381390290600912
Epoch    44: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  45


0.1861103257617435 0.17634169331618718
Epoch:  46


0.18692912402990702 0.17766126990318298
Epoch:  47


0.18552953165930672 0.17573284251349314
Epoch:  48


0.18624476242709803 0.17483642910208022
Epoch:  49


0.187500765194764 0.18745497720582144
Epoch:  50


0.18704691207086718 0.17651784420013428
Epoch    50: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  51


0.18690445415071538 0.18654261529445648
Epoch:  52


0.18679824149286425 0.1774240050997053
Epoch:  53


0.1869171217486665 0.17640291154384613
Epoch:  54


0.1862792670726776 0.1738900329385485
Epoch:  55


0.18718693224159447 0.1773028586591993
Epoch:  56


0.18835054337978363 0.17565585247107915
Epoch    56: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  57


0.1871019352126766 0.1777974729027067
Epoch:  58


0.18725338618497592 0.21414242471967423
Epoch:  59


0.1874915417787191 0.18007887048380716
Epoch:  60


0.18599236333692395 0.1790465520960944
Epoch:  61


0.18796477486958374 0.17511855917317526
Epoch:  62


0.1880980303158631 0.1886308022907802
Epoch:  63
