In [1]:
# Parameters
# `until_x` is read by Task5Model.__init__: every child of `mv2.features` whose
# position is <= until_x is re-initialised via `weight_reset`. It is also
# embedded in the checkpoint filename written by the training loop.
until_x = 14


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Rows of ``df`` are numbered 1, 2, ... per ``audio_filename`` in order of
    appearance; row number ``k`` of every file forms annotation slot ``k``.

    Args:
        df: DataFrame with an index (level) named ``audio_filename`` and one
            row per annotation of a file. Every file must have at least
            ``n_annotations`` rows, otherwise the per-slot arrays differ in
            length and stacking raises. Rows beyond ``n_annotations`` are
            ignored.
        unknown_to_known: dict mapping a column name of ``df`` to the list of
            column names it masks. The key columns are removed from the
            targets; wherever a key column exceeds 0.5, the mask of the
            mapped columns is set to 0 for that row.
        data_dir: directory holding one ``<audio_filename>.npy`` array per
            file. Each array is transposed after loading.
        n_frames: the transposed array is cropped to its first ``n_frames``
            rows.
        n_annotations: number of annotation slots stacked on the last axis.

    Returns:
        Tuple ``(X, y, y_mask)``:
        X has shape (n_files, 1, frames, bins); y and y_mask both have shape
        (n_files, n_target_columns, n_annotations). Files are ordered by
        ``audio_filename``.
    """
    # Number the annotations of each file 1..k and index by (file, slot).
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns away from the target columns.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Mask starts as all ones; zero the columns covered by a set unknown flag.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Re-index by (slot, file) so one slot can be selected for all files.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    # One spectrogram per file, in the same order as the rows of slot 1.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance built with its default constructor arguments;
# AudioDataset.__getitem__ looks this name up at module level when augmenting.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (spectrogram, targets, weights) triples with optional augmentation.

    Args:
        X: tensor indexed along its first axis; each item is treated as a
            (1, rows, cols) sample by the augmentation path.
        y: targets, indexed along the first axis in step with ``X``.
        weights: per-target weights/masks, indexed like ``y``.
        transform: optional callable invoked as ``transform(image=array)`` and
            expected to return a mapping with an ``'image'`` tensor. When it is
            falsy, samples are returned unmodified.
        erasing: optional callable applied to the augmented tensor. When
            ``None`` the module-level ``random_erasing`` instance is used.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting when a transform is set."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation; a constant sample has zero range, which
            # would otherwise turn the whole sample into NaN (0 / 0)
            this_min = sample.min()
            this_max = sample.max()
            span = this_max - this_min
            if span == 0:
                span = torch.ones_like(span)
            sample = (sample - this_min) / span

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms on an image-style array
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)['image']
            # add a leading axis and swap the last two axes of the result
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (injected callable, else the module global)
            erase = self.erasing if self.erasing is not None else random_erasing
            sample = erase(sample.clone().detach())

            # revert min-max transformation with the same range used above
            sample = (sample * span) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Re-initialise ``layer`` in place when it offers ``reset_parameters``.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes this safe to pass to ``Module.apply`` on a mixed module tree.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()


class Task5Model(nn.Module):
    """Single-channel input -> 3-channel adapter -> MobileNetV2 features -> MLP head.

    The constructor loads MobileNetV2 with ``pretrained=True`` and then
    re-initialises every child of ``mv2.features`` whose position is at most the
    module-level ``until_x``. ``forward`` returns raw scores of size
    ``num_classes`` (no activation is applied after the last Linear layer).
    """

    def __init__(self, num_classes):
        super().__init__()

        # Maps the single input channel to three channels with 1x1 convolutions.
        adapter_layers = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*adapter_layers)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Discard the pretrained weights of the first feature blocks.
        for position, block in enumerate(self.mv2.features.children()):
            if position > until_x:
                continue
            block.apply(weight_reset)

        # Classifier on the 1280 pooled feature channels.
        head_layers = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run the adapter, the MobileNetV2 features, a global max pool and the head."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis twice.
        x, _ = x.max(dim=-1)
        x, _ = x.max(dim=-1)
        return self.final(x)

In [4]:
# Load and prepare data
# NOTE: pickle.load executes arbitrary code from the file; only use a trusted metadata.pkl.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the fine_id == 'X' rows and all remaining rows.
taxonomy_df = metadata['taxonomy_df']
unknown_fine = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# Pair every 'X' fine label with the other fine labels sharing its coarse class,
# then collect them as {X fine label: [fine labels of the same coarse class]}.
fine_pairs = pd.merge(unknown_fine, known_fine, on='coarse', how='inner')
fine_pairs = fine_pairs.drop(columns='coarse')
unknown_to_known = fine_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_fine.fine.tolist()

# Coarse and fine label tables side by side, aligned on their index.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [5]:
# manual correction for one data point
# Every row whose columns sum to exactly 37 has all of its columns set to 0.
# NOTE(review): 37 presumably equals the number of label columns, i.e. a row
# with every label set - confirm against the metadata.
# Both statements modify train_df / valid_df in place.
train_df.loc[(train_df.sum(axis=1) == 37).copy(), :] = 0
valid_df.loc[(valid_df.sum(axis=1) == 37).copy(), :] = 0

In [6]:
# prepare_data returns, per split: X with shape (files, 1, frames, bins), and
# y / y_mask with shape (files, target columns, 3 annotation slots).
# It reads one .npy spectrogram per audio file from disk.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics come from the training split only and are applied to both splits.
# The last axis is assumed to hold 128 channels.
# NOTE: this cell overwrites train_X / valid_X, so running it twice normalises twice.
train_flat = train_X.reshape(-1, 128)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)

train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in list order by AudioDataset to training samples only.
augmentation_steps = [
    # small random shift / zoom / rotation
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library defaults
    GridDistortion(),
    # convert the augmented array to a torch tensor
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
batch_size = 96

# Only the training dataset is augmented; validation samples pass through as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain the pairs of batches it blends with mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [10]:
# Define the device to be used
# Use the first CUDA device when one is available and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 is passed as num_classes, i.e. the width of the final Linear layer.
# The constructor loads pretrained MobileNetV2 weights and reads the
# notebook-level `until_x`, so the parameters cell must have been run first.
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped with the validation loss once per epoch by the training loop;
# verbose=True prints a message whenever the learning rate is reduced.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps one loss value per element so the training loop can
# multiply by the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
early_stopping_patience = 15  # stop after this many epochs without a new best validation loss
mixup_alpha = 1               # both parameters of the Beta distribution used for mixup

train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss averaged over all annotation slots.

    ``labels`` and ``masks`` carry the annotation slots on their last axis. The
    same ``outputs`` are scored against every slot with the notebook-level
    ``criterion``, each element is weighted by its mask, and the total is
    divided by the sum of all mask values.
    """
    total = 0
    for slot in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, slot]) * masks[:, :, slot]).sum()
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0.0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches sample by sample
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam * batch_a[0]) + ((1 - lam) * batch_b[0])

        # same coefficients, reshaped to broadcast over the 3-D labels and masks
        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1))
        labels = (lam * batch_a[1]) + ((1 - lam) * batch_b[1])
        masks = (lam * batch_a[2]) + ((1 - lam) * batch_b[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        # .item() yields a Python float, so the epoch total accumulates in float64
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # mean of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # report before the early-stopping check so the final epoch is also logged
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if this_epoch_valid_loss < lowest_val_loss:
        # new best: checkpoint the weights and restart the patience counter
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6382174229621888 330.6920104980469
Epoch:  1


0.39430982947349547 3643.5033203125
Epoch:  2


0.22310764491558074 13355.6748046875
Epoch:  3


0.19532677173614502 27760.333984375
Epoch:  4


0.1899498325586319 39820.771875
Epoch:  5


0.18869991660118102 20565.3609375
Epoch:  6


0.1876363945007324 1968.18671875
Epoch     6: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  7


0.18832433700561524 1849.0787109375
Epoch:  8


0.1894451153278351 1014.687255859375
Epoch:  9


0.18888781428337098 1124.333544921875
Epoch:  10


0.18837710440158845 1132.7779541015625
Epoch:  11


0.1878433483839035 957.6716430664062
Epoch:  12


0.18907658517360687 1470.162060546875
Epoch    12: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  13


0.18747865557670593 223.2425109863281
Epoch:  14


0.18827896118164061 130.62305908203126
Epoch:  15


0.18863410234451294 142.70843200683595
Epoch:  16


0.1875753802061081 174.83385009765624
Epoch:  17


0.18864770293235777 68.83989562988282
Epoch:  18


0.18824588716030122 81.74385528564453
Epoch:  19


0.18950388848781585 51.74097595214844
Epoch:  20


0.1873872983455658 93.0583740234375
Epoch:  21


0.18828851580619813 172.20292053222656
Epoch:  22


0.18870497822761537 203.35496215820314
Epoch:  23


0.18758785963058472 251.99341735839843
Epoch:  24


0.18819220006465912 220.86837768554688
Epoch:  25


0.18839818775653838 132.4572021484375
Epoch    25: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  26


0.18794105291366578 25.42003059387207
Epoch:  27


0.1874361002445221 23.348952102661134
Epoch:  28


0.1890154778957367 10.131926918029786
Epoch:  29


0.188745796084404 14.7399320602417
Epoch:  30


0.18853672683238984 18.024016189575196
Epoch:  31


0.18782610774040223 13.749925231933593
Epoch:  32


0.187181214094162 19.190814208984374
Epoch:  33


0.1885742211341858 7.91450777053833
Epoch:  34


0.1889437311887741 9.794005393981934
Epoch:  35


0.18753934264183045 8.862879753112793
Epoch:  36


0.18849521458148957 15.610615730285645
Epoch:  37


0.18863051295280456 6.67014741897583
Epoch:  38


0.18800827443599702 10.512407112121583
Epoch:  39


0.18799781143665314 13.146387672424316
Epoch:  40


0.18727461636066436 15.602924728393555
Epoch:  41


0.18848133444786072 16.963955688476563
Epoch:  42


0.18813326358795165 13.508978462219238
Epoch:  43


0.18745202481746673 14.45422077178955
Epoch    43: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  44


0.18820249676704406 1.1042439699172975
Epoch:  45


0.18847153782844545 0.5948909997940064
Epoch:  46


0.1879124766588211 0.3408242642879486
Epoch:  47


0.18803917706012727 0.4228711545467377
Epoch:  48


0.1876607370376587 0.7980527281761169
Epoch:  49


0.18872778832912446 0.222532919049263
Epoch:  50


0.18801428973674775 0.2367408812046051
Epoch:  51


0.18845849096775055 0.2112203747034073
Epoch:  52


0.1882425481081009 0.23423457741737366
Epoch:  53


0.1891224926710129 0.25182011127471926
Epoch:  54


0.18922796487808227 0.2661343216896057
Epoch:  55


0.18863876760005951 0.24497030973434447
Epoch:  56


0.18765915870666505 0.2177056699991226
Epoch:  57


0.18757398009300233 0.22700205445289612
Epoch    57: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  58


0.18705829799175264 0.18437129259109497
Epoch:  59


0.1876346606016159 0.18419225811958312
Epoch:  60


0.18857317209243774 0.18411268889904023
Epoch:  61


0.18777912259101867 0.18381952345371247
Epoch:  62


0.1884469521045685 0.18373636305332183
Epoch:  63


0.1881443417072296 0.18356267511844634
Epoch:  64


0.18846507132053375 0.18360721170902253
Epoch:  65


0.18769276916980743 0.18356980979442597
Epoch:  66


0.18911274135112763 0.1834878295660019
Epoch:  67


0.18744805514812468 0.18343254029750825
Epoch:  68


0.18803605794906617 0.18332183361053467
Epoch:  69


0.1890181976556778 0.18351583778858185
Epoch:  70


0.18962482810020448 0.1834477126598358
Epoch:  71


0.18720038175582887 0.1835194557905197
Epoch:  72


0.18884281933307648 0.1835298478603363
Epoch:  73


0.1889664912223816 0.18338676989078523
Epoch:  74


0.18801811695098877 0.18339771926403045
Epoch:  75


0.1869291341304779 0.1833309054374695
Epoch:  76


0.1876319569349289 0.18341110050678253
Epoch:  77


0.18823807418346405 0.18336108028888704
Epoch:  78


0.18860543608665467 0.18343348503112794
Epoch:  79


0.18849362134933473 0.1834334909915924
Epoch:  80


0.187822505235672 0.1834334760904312
Epoch:  81


0.1873185592889786 0.18337040543556213
Epoch:  82


0.1881094378232956 0.18333798050880432
Epoch:  83
