In [1]:
# Parameters
# until_x: Task5Model re-initialises every child of the pretrained
# MobileNetV2 `features` stack whose position is <= until_x. The value is
# also embedded in the checkpoint filename written by the training loop.
until_x = 10


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. The index must be named ``audio_filename``
        and every clip needs at least ``n_annotations`` rows; rows beyond the
        first ``n_annotations`` of a clip are ignored. The caller's frame is
        not modified.
    unknown_to_known : dict
        Maps each "unknown" column of ``df`` to the list of known-label
        columns that are masked out (mask = 0) on rows where the unknown
        column exceeds 0.5. The unknown columns themselves are dropped from
        the targets.
    data_dir : str, optional
        Directory containing one ``<audio_filename>.npy`` array per clip.
    n_frames : int, optional
        Number of leading rows kept from each transposed feature array.
    n_annotations : int, optional
        Number of annotation rows per clip stacked along the last axis.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_clips, 1, frames, bins)``.
    y : numpy.ndarray
        Shape ``(n_clips, n_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label contributes to the loss.

    Raises
    ------
    ValueError
        If any clip has fewer than ``n_annotations`` annotation rows.
    """
    df = df.reset_index()

    # Fail early with a readable message instead of a shape mismatch later.
    annotations_per_clip = df.groupby('audio_filename').size()
    too_few = annotations_per_clip[annotations_per_clip < n_annotations]
    if len(too_few) > 0:
        raise ValueError(
            '{} clip(s) have fewer than {} annotations, e.g. {}'.format(
                len(too_few), n_annotations, list(too_few.index[:5])))

    # Number the annotation rows of each clip 1, 2, 3, ... in table order.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from "everything counts" and switch off the known labels of any
    # annotation that flagged the matching unknown column.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that selecting one number yields
    # one row per clip, sorted by filename (identical order for every number).
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_numbers = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_numbers], axis=2)
    y_mask = np.stack(
        [y_mask.loc[[k], :].values for k in annotation_numbers], axis=2)

    # Load the features in the same clip order as the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (constructed with its default
# arguments) that AudioDataset.__getitem__ applies to each augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Indexable dataset of (sample, labels, weights) triples.

    If a ``transform`` is given, every fetched sample is augmented on the fly
    (random circular shift, the transform itself, random erasing); otherwise
    the stored sample is returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        """Keep references to the tensors and the optional transform.

        ``transform`` is called as ``transform(image=array)`` and must return
        a mapping with an ``'image'`` entry.
        """
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples, i.e. the length of the first axis of ``X``."""
        return self.X.shape[0]

    def _augment(self, sample):
        """Return an augmented copy of ``sample`` on its original value scale."""
        # Rescale to [0, 1] for the image-based steps; remember how to undo it.
        lowest, highest = sample.min(), sample.max()
        value_range = highest - lowest
        sample = (sample - lowest) / value_range

        # Rotate the sample along axis 1 by a random offset (circular shift).
        offset = np.random.randint(sample.shape[1])
        head, tail = sample[:, :offset, :], sample[:, offset:, :]
        sample = torch.cat((tail, head), dim=1)

        # Run the transform on an image array, then restore the axis layout.
        image = np.array(self.pil(sample))
        sample = self.transform(image=image)['image']
        sample = sample[None, :, :].permute(0, 2, 1)

        # Random erasing works on a detached copy.
        sample = random_erasing(sample.clone().detach())

        # Undo the min-max scaling.
        return (sample * value_range) + lowest

    def __getitem__(self, idx):
        """Return ``(sample, labels, weights)`` for position ``idx``."""
        sample = self.X[idx, ...]
        if self.transform:
            sample = self._augment(sample)
        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Call ``layer.reset_parameters()`` when the layer provides it.

    Layers without that method are left untouched, which makes the function
    suitable as the callback of ``Module.apply``.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        return
    reset()


class Task5Model(nn.Module):
    """Single-channel input classifier built on a MobileNetV2 feature stack.

    A 1x1-convolution stem maps the single input channel to three channels,
    the MobileNetV2 ``features`` stack processes them, the result is
    max-pooled over its two trailing axes and a small fully connected head
    produces ``num_classes`` logits.
    """

    def __init__(self, num_classes):
        """Create the stem, the (partially re-initialised) backbone and the head.

        The module-level ``until_x`` decides how many leading children of the
        backbone's ``features`` have their weights reset.
        """
        super().__init__()

        # 1 -> 10 -> 3 channels with 1x1 convolutions.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Re-initialise the children at positions 0..until_x (inclusive).
        for position, block in enumerate(self.mv2.features.children()):
            if position > until_x:
                break
            block.apply(weight_reset)

        # Head over the 1280 pooled features.
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Return the logits for a batch ``x``."""
        features = self.mv2.features(self.bw2col(x))
        # Global max pooling: reduce the last axis twice.
        pooled, _ = features.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# open a metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
unknown_fine = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# Pair every fine label whose id is 'X' with the other fine labels of the same
# coarse class. The merge renames the clashing 'fine' columns to
# fine_x (id 'X') and fine_y (all other ids).
fine_pairs = (
    unknown_fine
    .merge(known_fine, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = fine_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_fine.fine.tolist()


def _coarse_and_fine(coarse, fine):
    """Place the coarse and fine label tables side by side (index sorted)."""
    return pd.concat([coarse, fine], axis=1, sort=True)


train_df = _coarse_and_fine(metadata['coarse_train'], metadata['fine_train'])
valid_df = _coarse_and_fine(metadata['coarse_test'], metadata['fine_test'])

In [5]:
# manual correction for one data point
# Any row whose columns add up to 37 has all of its labels set to 0.
for labels_df in (train_df, valid_df):
    rows_to_blank = labels_df.sum(axis=1) == 37
    labels_df.loc[rows_to_blank, :] = 0

In [6]:
# Build the feature, label and mask arrays of both splits with the same
# unknown-to-known label mapping.
train_X, train_y, train_y_mask = prepare_data(
    df=train_df, unknown_to_known=unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(
    df=valid_df, unknown_to_known=unknown_to_known)

In [7]:
# Channel wise normalization
# The statistics are computed per position of the last axis, on the training
# set only, and reused unchanged for the validation set.
# NOTE: this cell overwrites train_X / valid_X, so running it twice normalises
# twice - re-run the prepare_data cell first.
n_channels = train_X.shape[-1]  # taken from the data instead of a literal 128
train_flat = train_X.reshape(-1, n_channels)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
# A constant channel has zero spread; divide it by 1 so it becomes 0 instead
# of inf/NaN.
channel_stds[channel_stds == 0] = 1
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# This pipeline is passed to the training AudioDataset only; the validation
# dataset is created with transform=None and is therefore never augmented.
albumentations_transform = Compose([
    # Random shift / scale / rotation, each bounded by the limit given here
    # (units as defined by albumentations; rotate_limit is presumably in
    # degrees - confirm against the installed version).
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # Grid distortion with the library's default settings.
    GridDistortion(),
    # Final conversion step; AudioDataset treats the result as a torch tensor.
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
BATCH_SIZE = 96


def _as_dataset(X, y, y_mask, transform):
    """Wrap the numpy arrays as torch tensors inside an AudioDataset."""
    return AudioDataset(
        torch.Tensor(X), torch.Tensor(y), torch.Tensor(y_mask), transform)


# Only the training split is augmented.
train_dataset = _as_dataset(
    train_X, train_y, train_y_mask, albumentations_transform)
valid_dataset = _as_dataset(valid_X, valid_y, valid_y_mask, None)

val_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is usable and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# One logit per label column of the targets (31 for this dataset). The loss
# compares the model output with train_y[:, :, k], so both must have the same
# number of columns; deriving it from the data keeps them in sync.
num_classes = train_y.shape[1]
model = Task5Model(num_classes).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad variant over every parameter of the model.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped once per epoch with the validation loss by the training loop;
# verbose=True makes it print a message whenever it lowers the learning rate.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise loss so the training loop can
# multiply it by the label masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
early_stopping_patience = 15  # epochs allowed without a new best validation loss
mixup_alpha = 1  # both parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked loss of one batch, averaged over the total mask weight.

    ``outputs`` holds one row of logits per sample; ``labels`` and ``masks``
    carry one slice per annotator along their last axis. The same logits are
    scored against every annotator's labels, each element-wise loss is
    weighted by its mask, and the sum is divided by the sum of all masks.
    """
    total = 0
    for annotator in range(labels.shape[2]):
        per_element = criterion(outputs, labels[:, :, annotator])
        total = total + (per_element * masks[:, :, annotator]).sum()
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup: blend every sample of batch_a with the sample at the same
        # position of batch_b, using one random coefficient per sample
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.reshape(-1, 1, 1, 1)
        lam_targets = lam.reshape(-1, 1, 1)

        inputs = (lam_inputs * batch_a[0]) + ((1 - lam_inputs) * batch_b[0])
        labels = (lam_targets * batch_a[1]) + ((1 - lam_targets) * batch_b[1])
        masks = (lam_targets * batch_a[2]) + ((1 - lam_targets) * batch_b[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        loss = masked_bce_loss(model(inputs), labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation (no augmentation, no mixup, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            loss = masked_bce_loss(model(inputs), labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint only when the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6589180588722229 0.5736743927001953
Epoch:  1


0.4412520730495453 0.31476283073425293
Epoch:  2


0.2306590646505356 0.20216489732265472
Epoch:  3


0.18318552196025847 0.20834562182426453
Epoch:  4


0.17730918526649475 0.1669945538043976
Epoch:  5


0.1744802314043045 0.16916404366493226
Epoch:  6


0.17347149014472962 0.16261107921600343
Epoch:  7


0.17060954213142396 0.1529286324977875
Epoch:  8


0.1701929438114166 0.1620824635028839
Epoch:  9


0.16887365996837617 0.1740470349788666
Epoch:  10


0.16780782759189605 0.15451464354991912
Epoch:  11


0.16675152122974396 0.1665515422821045
Epoch:  12


0.16521933794021607 0.17780246138572692
Epoch:  13


0.16535223305225372 0.1884527623653412
Epoch    13: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  14


0.1640714991092682 0.13979846835136414
Epoch:  15


0.16330355644226074 0.1406148910522461
Epoch:  16


0.16144890308380128 0.13828632831573487
Epoch:  17


0.15979752898216248 0.13763652443885804
Epoch:  18


0.16054909765720368 0.13836864233016968
Epoch:  19


0.1597773915529251 0.14178299605846406
Epoch:  20


0.1617793071269989 0.13924598097801208
Epoch:  21


0.1590350878238678 0.13733173608779908
Epoch:  22


0.15960117638111115 0.13532596081495285
Epoch:  23


0.160091290473938 0.13604622483253478
Epoch:  24


0.15954838335514068 0.1375298649072647
Epoch:  25


0.15757707953453065 0.1363651528954506
Epoch:  26


0.1584625083208084 0.1384948119521141
Epoch:  27


0.15908969938755035 0.13523911386728288
Epoch:  28


0.1592682993412018 0.1352741613984108
Epoch:  29


0.15898777306079864 0.1350048691034317
Epoch:  30


0.15875827968120576 0.1350379541516304
Epoch:  31


0.15811408936977386 0.133744278550148
Epoch:  32


0.1574134820699692 0.1347195714712143
Epoch:  33


0.15809051513671876 0.13389290869235992
Epoch:  34


0.15851114571094513 0.1361136257648468
Epoch:  35


0.15578479707241058 0.13305321782827378
Epoch:  36


0.15641193211078644 0.132169708609581
Epoch:  37


0.15673922896385192 0.13303226977586746
Epoch:  38


0.15704481184482574 0.1359554946422577
Epoch:  39


0.15683163583278656 0.13375398367643357
Epoch:  40


0.15591924369335175 0.13316280990839005
Epoch:  41


0.15636624336242677 0.13561457246541977
Epoch:  42


0.15561094224452973 0.13355146199464799
Epoch    42: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  43


0.1571801173686981 0.13152989894151687
Epoch:  44


0.15553281664848329 0.13186253160238265
Epoch:  45


0.15623825013637543 0.1316559672355652
Epoch:  46


0.15622244536876678 0.13165024816989898
Epoch:  47


0.1563190656900406 0.13101820051670074
Epoch:  48


0.15622780084609986 0.1313517615199089
Epoch:  49


0.15419765055179596 0.13158179223537445
Epoch:  50


0.15617962718009948 0.13151407539844512
Epoch:  51


0.15564875721931457 0.13153758347034455
Epoch:  52


0.1550493735074997 0.13150501102209092
Epoch:  53


0.1552851939201355 0.13143047094345092
Epoch    53: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  54


0.15564036548137664 0.1312236249446869
Epoch:  55


0.15594500660896302 0.1310297355055809
Epoch:  56


0.15522923111915587 0.13138290345668793
Epoch:  57


0.15564308762550355 0.13126472532749175
Epoch:  58


0.15495776534080505 0.13108249008655548
Epoch:  59


0.15526606321334838 0.13082368224859237
Epoch:  60


0.15555504083633423 0.13119254261255264
Epoch:  61


0.15525801181793214 0.13103603720664977
Epoch:  62


0.15624494850635529 0.13119665831327437
Epoch:  63


0.1559714764356613 0.1312747687101364
Epoch:  64


0.15650847554206848 0.1310807377099991
Epoch:  65


0.15627851843833923 0.13109218329191208
Epoch    65: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  66


0.1555172884464264 0.13126784861087798
Epoch:  67


0.15477550864219666 0.13156377673149108
Epoch:  68


0.156104633808136 0.13117221593856812
Epoch:  69


0.15390753924846648 0.1310462549328804
Epoch:  70


0.15639683306217195 0.13113510310649873
Epoch:  71


0.15549355745315552 0.1309131622314453
Epoch    71: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  72


0.15532758355140686 0.13107293397188186
Epoch:  73


0.1559099704027176 0.13120950013399124
Epoch:  74
