In [1]:
# Parameters
# until_x: index into the children of the MobileNetV2 `features` module.
# Feature blocks whose index is greater than until_x have their weights
# re-initialised when Task5Model is built; the value is also embedded in the
# checkpoint file name written by the training loop.
until_x = 15


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec', n_frames=635):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table. After ``reset_index()`` it must expose an
        ``audio_filename`` column; all remaining columns are treated as label
        columns. Rows of the same file are numbered 1, 2, 3, ... in order of
        appearance and only rows numbered 1, 2 and 3 are used.
        Assumes every file has exactly three annotation rows -- TODO confirm
        against the caller (unequal counts make the stacking below fail).
    unknown_to_known : dict
        Maps an "unknown" label column to the list of label columns that are
        masked out (mask value 0) on every row where that unknown column is
        greater than 0.5. The unknown columns themselves are removed from the
        targets.
    spec_dir : str, optional
        Directory containing one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array. Files with
        fewer rows are not padded, so mixed lengths make the stacking fail.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``.
    y : numpy.ndarray
        Shape ``(n_files, n_labels, 3)``; last axis indexes the annotation row.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target contributes, 0 where masked.
    """
    df = df.reset_index()
    # Number the annotation rows of each file 1, 2, 3, ... in order of appearance.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns off from the real targets.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the entries made ambiguous by a
    # positive "unknown" annotation on the same row.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so each annotation set can be selected
    # as a contiguous block whose rows are sorted by file name.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = (1, 2, 3)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in annotation_ids], axis=2)

    # Load the arrays in the same file order as the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack(
        [np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :] for name in filenames],
        axis=0)
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance, built with its default constructor arguments.
# AudioDataset.__getitem__ applies it to every sample that goes through the
# augmentation branch.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, y[idx], weights[idx])`` triples.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis; ``__getitem__`` treats each
        ``X[idx]`` as a 3-D tensor and cycles it along axis 1.
    y, weights : indexable
        Targets and per-target weights, indexed along the first axis.
    transform : callable or None
        Pipeline called as ``transform(image=array)`` and expected to return a
        mapping with an ``'image'`` entry. When falsy, samples are returned
        unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converter used to hand samples to the transform pipeline.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would fill the
            # sample with NaN. Scaling by 1 keeps it finite and makes the
            # inverse transformation below return the original level.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (fed a NumPy array built from
            # the PIL image)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # Restore a leading axis and swap the last two axes -- presumably
            # undoing the axis order produced by the pipeline's ToTensor step;
            # TODO confirm against the albumentations version in use.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level `random_erasing` instance)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Call ``layer.reset_parameters()`` when the layer provides it.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function suitable for ``Module.apply``.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        return
    reset()


class Task5Model(nn.Module):
    """Single-channel input -> MobileNetV2 features -> per-class logits.

    The network has three stages:

    * ``bw2col``: batch norm followed by two 1x1 convolutions (1 -> 10 -> 3
      channels), so a one-channel input can be fed to ``mv2.features``;
    * ``mv2``: a torchvision MobileNetV2 created with ``pretrained=True``;
      feature blocks whose index exceeds the module-level ``until_x`` are
      re-initialised through ``weight_reset``;
    * ``final``: a two-layer head taking 1280 features and returning
      ``num_classes`` logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        to_three_channels = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*to_three_channels)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Discard the pretrained weights of every feature block past `until_x`.
        blocks_to_reset = [
            block
            for position, block in enumerate(self.mv2.features.children())
            if position > until_x
        ]
        for block in blocks_to_reset:
            block.apply(weight_reset)

        head = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head)

    def forward(self, x):
        """Return the logits for a batch ``x``."""
        three_channel = self.bw2col(x)
        feature_map = self.mv2.features(three_channel)
        # Global max pooling: reduce the last axis, then the new last axis.
        pooled = feature_map.max(dim=-1)[0]
        pooled = pooled.max(dim=-1)[0]
        return self.final(pooled)

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only open a metadata file
# that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy = metadata['taxonomy_df']
is_unknown = taxonomy.fine_id == 'X'
is_known = taxonomy.fine_id != 'X'

# Pair every fine label whose fine_id is 'X' with the other fine labels that
# share its coarse label, giving {unknown fine label: [known fine labels]}.
unknown_rows = taxonomy.loc[is_unknown, ['fine', 'coarse']]
known_rows = taxonomy.loc[is_known, ['fine', 'coarse']]
fine_pairs = pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
fine_pairs = fine_pairs.drop(columns='coarse')
unknown_to_known = (
    fine_pairs
    .groupby('fine_x')['fine_y']
    .apply(lambda group: list(group))
    .to_dict())
known_labels = taxonomy.loc[is_known].fine.tolist()

# Coarse and fine annotations side by side, one table per split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose columns sum to exactly 37 is zeroed out in both splits
# (presumably a row with every label flagged -- TODO confirm).
for split_df in (train_df, valid_df):
    rows_to_clear = split_df.sum(axis=1) == 37
    split_df.loc[rows_to_clear, :] = 0

In [6]:
# Convert each annotation table into arrays: X (inputs loaded from the
# logmelspec .npy files), y (targets) and y_mask (per-target loss masks).
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean and standard deviation are computed for each position along the last
# axis of train_X, pooling over all other axes, and the training statistics are
# reused for the validation set. The channel count is read from the data
# instead of being hard-coded.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in order: a small random shift/scale/rotation, a grid distortion,
# then conversion of the result to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
BATCH_SIZE = 96

# Only the training set goes through the augmentation pipeline.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them together to obtain the sample pairs it mixes.
train_loader_1 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook does not fail later when tensors are moved to a device that
# does not exist on this machine.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 is passed as num_classes, i.e. the number of logits the head produces;
# presumably it matches the number of label columns in train_y -- TODO confirm.
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch and reports
# every learning-rate reduction it makes.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# weight them by the masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
early_stopping_patience = 15  # epochs without a new best validation loss
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked binary cross-entropy averaged over the total mask weight.

    `outputs` holds one logit per (sample, class). `labels` and `masks` carry
    an extra trailing axis with one slice per set of annotations. The
    element-wise loss of every slice is multiplied by its mask, summed over
    all slices, and divided by ``masks.sum()``.

    Not guarded against ``masks.sum() == 0`` (the result is then NaN).
    """
    weighted_sum = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return weighted_sum / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One Beta(alpha, alpha) coefficient per sample; the same coefficient
        # blends the inputs, the labels and the masks of the paired batches.
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])
        lam = torch.Tensor(mixup_vals)

        lam_inputs = lam.reshape(-1, 1, 1, 1)
        inputs = (lam_inputs * i1[0]) + ((1 - lam_inputs) * i2[0])

        lam_targets = lam.reshape(-1, 1, 1)
        labels = (lam_targets * i1[1]) + ((1 - lam_targets) * i2[1])
        masks = (lam_targets * i1[2]) + ((1 - lam_targets) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Per-epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss seen so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6558286070823669 0.6041295409202576
Epoch:  1


0.4378411817550659 0.279117751121521
Epoch:  2


0.22291420340538026 0.17847048938274385
Epoch:  3


0.17012710094451905 0.13996394872665405
Epoch:  4


0.16111611664295197 0.13754890561103822
Epoch:  5


0.1573171842098236 0.13306745886802673
Epoch:  6


0.15613823294639587 0.13337977975606918
Epoch:  7


0.15390414178371428 0.13177396059036256
Epoch:  8


0.15336863756179808 0.12853412032127381
Epoch:  9


0.15236435890197753 0.13644433468580247
Epoch:  10


0.15243918418884278 0.12897116243839263
Epoch:  11


0.1511770021915436 0.12664179354906083
Epoch:  12


0.1511134225130081 0.12971723526716233
Epoch:  13


0.14986475706100463 0.13212053775787352
Epoch:  14


0.15057145476341247 0.1264076605439186
Epoch:  15


0.14960069417953492 0.12706807553768157
Epoch:  16


0.14815356314182282 0.13128537386655809
Epoch:  17


0.14922467291355132 0.1280314326286316
Epoch:  18


0.1476053500175476 0.13042109161615373
Epoch:  19


0.14865652680397035 0.1283707082271576
Epoch:  20


0.14863554775714874 0.12870710343122482
Epoch    20: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  21


0.1452604228258133 0.12257264256477356
Epoch:  22


0.14424489617347716 0.12222441434860229
Epoch:  23


0.14532823741436005 0.12234169095754624
Epoch:  24


0.144840167760849 0.1219998762011528
Epoch:  25


0.14440508663654328 0.12220933437347412
Epoch:  26


0.14477381825447083 0.12198468297719955
Epoch:  27


0.14296665608882905 0.12156916707754135
Epoch:  28


0.14473866283893586 0.12169207185506821
Epoch:  29


0.14387628436088562 0.12196297645568847
Epoch:  30


0.14274288952350617 0.12135526537895203
Epoch:  31


0.14327065527439117 0.12167190164327621
Epoch:  32


0.14287753343582155 0.12160895764827728
Epoch:  33


0.1414281564950943 0.1214714527130127
Epoch:  34


0.14236432015895845 0.12131323814392089
Epoch:  35


0.1422540760040283 0.12085882276296615
Epoch:  36


0.14282013535499571 0.12154077887535095
Epoch:  37


0.14135785520076752 0.12095870971679687
Epoch:  38


0.14215571343898772 0.12184807360172271
Epoch:  39


0.14134204268455505 0.12142050862312317
Epoch:  40


0.14228792667388915 0.12120340913534164
Epoch:  41


0.1417287266254425 0.12097011506557465
Epoch    41: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  42


0.14144346952438355 0.12058656364679336
Epoch:  43


0.1420279335975647 0.12059361338615418
Epoch:  44


0.14107722163200379 0.12059740424156189
Epoch:  45


0.14174972891807555 0.12056143879890442
Epoch:  46


0.14201941788196565 0.1206124559044838
Epoch:  47


0.14152320504188537 0.12070035934448242
Epoch:  48


0.14198038399219512 0.12072952985763549
Epoch:  49


0.1419292914867401 0.12082891762256623
Epoch:  50


0.14111467182636261 0.12061141580343246
Epoch:  51


0.14122100293636322 0.12049508243799209
Epoch:  52


0.14094145178794862 0.1205904096364975
Epoch:  53


0.14111479222774506 0.12065959721803665
Epoch:  54


0.14130203783512116 0.12069782316684723
Epoch:  55


0.14076005816459655 0.12065025717020035
Epoch:  56


0.14167742550373078 0.12084663659334183
Epoch:  57


0.14117844462394713 0.12088849246501923
Epoch    57: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  58


0.1406201410293579 0.1206549733877182
Epoch:  59


0.1409350538253784 0.12074591964483261
Epoch:  60


0.14066695809364319 0.12061985731124877
Epoch:  61


0.13992563903331756 0.12062209099531174
Epoch:  62


0.14138501584529878 0.12074507176876068
Epoch:  63


0.1400167626142502 0.12067541182041168
Epoch    63: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  64


0.14056223452091218 0.12073871493339539
Epoch:  65


0.1406247526407242 0.120768603682518
Epoch:  66
