In [1]:
# Parameters
# until_x: index threshold into the MobileNetV2 feature stack. Task5Model
# re-initialises every child of `mv2.features` whose index is strictly greater
# than this value; the value is also embedded in the saved checkpoint filename.
until_x = 16


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, per-annotation labels and loss masks from a label frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table whose index resets to an ``audio_filename`` column.
        Assumes every recording contributes (at least) ``n_annotations`` rows,
        one per annotation, and the same number for every recording --
        TODO confirm against the metadata.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of "known" label columns
        that must be masked out whenever that unknown column is > 0.5.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotations : int, optional
        Number of annotation rows (slno 1..n_annotations) stacked per recording.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(recordings, 1, n_frames, features)``.
    y : numpy.ndarray
        Shape ``(recordings, known_columns, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label contributes to the loss, 0 where
        the matching "unknown" column was set for that annotation.
    """
    # Number the rows of each recording 1, 2, 3, ... so that every annotation
    # can be addressed by (audio_filename, slno).
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split off the "unknown" columns; only the known columns become targets.
    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start with every label active, then switch off the known labels covered
    # by an "unknown" annotation.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put slno first so that one annotation round can be sliced at a time, with
    # recordings in the same (sorted) order inside every slice.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slno], :].values for slno in annotation_ids], axis=2)
    y_mask = np.stack(
        [y_mask.loc[[slno], :].values for slno in annotation_ids], axis=2)

    # Load the features in the same recording order as the label slices.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level augmentation object shared by every AudioDataset instance; it is
# applied to each augmented sample inside AudioDataset.__getitem__.
# NOTE(review): built with RandomErasing's default arguments -- the class comes
# from a project-local module, so its settings are not visible in this notebook.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (input, labels, weights) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis; with augmentation enabled each
        item is expected to be ``(1, frames, features)``.
    y : torch.Tensor
        Targets, indexed along the first axis like ``X``.
    weights : torch.Tensor
        Per-target weights/masks, indexed along the first axis like ``X``.
    transform : callable, optional
        Augmentation pipeline called as ``transform(image=array)`` and
        returning a dict with an ``'image'`` entry. When falsy, items are
        returned untouched.

    Notes
    -----
    The augmentation path relies on the module-level ``random_erasing`` object.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Tensor -> PIL image converter used to feed the augmentation pipeline.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of items (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample if enabled."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has a zero range; dividing by it would turn the
            # whole sample into NaN. Scale by 1 instead -- the revert step below
            # multiplies by the true (zero) range, so the sample stays constant.
            safe_range = value_range if value_range > 0 else 1.0
            sample = (sample - this_min) / safe_range

            # randomly cycle the file: rotate it along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on a numpy image)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add the leading channel axis back and swap the last two axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing on a detached copy
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code -- only open a metadata file
# that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
taxonomy_columns = ['fine', 'coarse']
is_unknown = taxonomy_df.fine_id == 'X'
is_known = taxonomy_df.fine_id != 'X'

# Pair every "unknown" fine label (fine_id == 'X') with the known fine labels
# that share its coarse class: {unknown fine label: [known fine labels]}.
unknown_known_pairs = pd.merge(
    taxonomy_df.loc[is_unknown, taxonomy_columns],
    taxonomy_df.loc[is_known, taxonomy_columns],
    on='coarse', how='inner',
).drop(columns='coarse')
unknown_to_known = (
    unknown_known_pairs
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = taxonomy_df.loc[is_known].fine.tolist()

# Coarse and fine label tables side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose label values sum to exactly 37 is zeroed out, in both splits.
# NOTE(review): presumably 37 means "every label column set" -- confirm
# against the metadata.
for split_df in (train_df, valid_df):
    flagged_rows = split_df.sum(axis=1) == 37
    split_df.loc[flagged_rows, :] = 0

In [6]:
# Build the inputs (X), the per-annotation labels (y) and the loss masks
# (y_mask) for both splits. prepare_data loads one .npy file per recording.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed on the training split only and reused for validation.
# Caution: this cell overwrites train_X / valid_X, so running it twice
# normalises already-normalised data.
train_flat = train_X.reshape(-1, 128)  # one row per frame, one column per channel
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
del train_flat  # do not keep the un-normalised array alive

train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in order to every training sample by AudioDataset.__getitem__:
# a small random shift/scale/rotate, a grid distortion, then conversion to a
# torch tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
batch_size = 96

train_tensors = [torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask)]

# Only the training split is augmented.
train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that mixup blends together.
train_loader_1 = DataLoader(train_dataset, batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the notebook still runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` if it exposes ``reset_parameters``; otherwise do nothing.

    Intended to be passed to ``nn.Module.apply`` so that every sub-module that
    knows how to reset itself gets fresh parameters.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Single-channel input -> MobileNetV2 features -> global max pool -> classifier.

    The network is assembled from three parts:

    * ``bw2col``: batch norm plus two 1x1 convolutions mapping the 1-channel
      input to the 3 channels fed to MobileNetV2.
    * ``mv2``: a torchvision MobileNetV2 created with ``pretrained=True``; only
      its ``features`` stack is used in ``forward``.
    * ``final``: a two-layer head producing ``num_classes`` logits from the
      1280 pooled features.

    Children of ``mv2.features`` whose index is greater than the module-level
    ``until_x`` are re-initialised with the module-level ``weight_reset``.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Construction order is kept (bw2col, mv2, final) so that random
        # initialisation consumes the RNG in the same sequence.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Keep the pretrained weights up to and including block `until_x`;
        # every later block starts from freshly initialised parameters.
        for index, block in enumerate(self.mv2.features.children()):
            if index <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch of 1-channel inputs."""
        three_channel = self.bw2col(x)
        feature_map = self.mv2.features(three_channel)
        # Global max pooling: reduce the last axis, then the new last axis.
        pooled, _ = feature_map.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 is `num_classes`, i.e. the number of logits produced by the final layer.
# NOTE(review): presumably equal to the number of label columns kept by
# prepare_data -- confirm.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad variant enabled, over every parameter of the model.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Reduces the learning rate when the value passed to scheduler.step() (the
# epoch validation loss, see the training loop) stops improving, with
# patience=5; verbose=True logs every reduction.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them by the label masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def masked_bce_loss(outputs, labels, masks):
    """Masked binary cross-entropy averaged over the unmasked entries.

    ``outputs`` holds one logit per (item, class); ``labels`` and ``masks``
    carry an extra trailing axis with one slice per set of annotations. The
    element-wise loss of every slice is multiplied by its mask, summed, and the
    total is divided by the sum of all mask values.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
alpha = 1  # Beta(alpha, alpha) parameter of the mixup coefficients
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0.0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One coefficient per item; the i-th item of batch 1 is blended with
        # the i-th item of batch 2 (inputs, labels and masks alike).
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Log before the early-stopping check so the final epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint the weights whenever the validation loss reaches a new low.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6545269060134887 0.5851712822914124
Epoch:  1


0.4400589144229889 0.27848377227783205
Epoch:  2


0.22556212425231933 0.1653562605381012
Epoch:  3


0.17500656962394714 0.14898415207862853
Epoch:  4


0.1632868903875351 0.14290272295475007
Epoch:  5


0.16035100758075715 0.1385274350643158
Epoch:  6


0.15973847568035127 0.1401718705892563
Epoch:  7


0.15597851872444152 0.13651155382394792
Epoch:  8


0.1556597137451172 0.13180752247571945
Epoch:  9


0.1540030914545059 0.13504136204719544
Epoch:  10


0.15457018256187438 0.13353359401226045
Epoch:  11


0.15156517922878265 0.12905189990997315
Epoch:  12


0.15284669160842895 0.1304423674941063
Epoch:  13


0.15241383969783784 0.1282862901687622
Epoch:  14


0.15158056557178498 0.12798693925142288
Epoch:  15


0.14973460376262665 0.12629009634256363
Epoch:  16


0.1504368621110916 0.12754528522491454
Epoch:  17


0.15091031849384307 0.1277866005897522
Epoch:  18


0.1534587985277176 0.12941251695156097
Epoch:  19


0.15058879017829896 0.13405864238739013
Epoch:  20


0.14827007532119751 0.12614037543535234
Epoch:  21


0.1490306943655014 0.128251251578331
Epoch:  22


0.14982271552085877 0.12923093438148497
Epoch:  23


0.14788673996925353 0.12911307960748672
Epoch:  24


0.14836664795875548 0.12853735387325288
Epoch:  25


0.14758933186531067 0.12908360064029695
Epoch:  26


0.14739710390567778 0.12663637101650238
Epoch    26: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  27


0.14736103177070617 0.12320675402879715
Epoch:  28


0.14554224789142609 0.12320365756750107
Epoch:  29


0.14538753807544708 0.12283097803592682
Epoch:  30


0.14425098299980163 0.12254233360290527
Epoch:  31


0.1444164425134659 0.12274373024702072
Epoch:  32


0.1437448400259018 0.12236283123493194
Epoch:  33


0.14506788194179535 0.12270762324333191
Epoch:  34


0.14334927916526793 0.12175154685974121
Epoch:  35


0.14457813739776612 0.12279255092144012
Epoch:  36


0.14307576537132263 0.12287599742412567
Epoch:  37


0.14325389981269837 0.12193401753902436
Epoch:  38


0.1431467789411545 0.12232189029455184
Epoch:  39


0.1422031056880951 0.12179148346185684
Epoch:  40


0.14330228090286254 0.12155696004629135
Epoch:  41


0.14328051686286927 0.12176433205604553
Epoch:  42


0.14311105489730835 0.12125775814056397
Epoch:  43


0.14123493790626526 0.12157925963401794
Epoch:  44


0.14187004625797273 0.12095510810613633
Epoch:  45


0.14378998279571534 0.12215539515018463
Epoch:  46


0.14277239739894867 0.12266248911619186
Epoch:  47


0.1422296917438507 0.12223814874887466
Epoch:  48


0.14217992424964904 0.12239362597465515
Epoch:  49


0.1413131070137024 0.12191512584686279
Epoch:  50


0.14178017973899842 0.12201227694749832
Epoch    50: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  51


0.1424027293920517 0.1217030093073845
Epoch:  52


0.14174768924713135 0.12202421724796295
Epoch:  53


0.1413494473695755 0.1220688447356224
Epoch:  54


0.14078367173671721 0.12189284414052963
Epoch:  55


0.14076528310775757 0.12198401540517807
Epoch:  56


0.1412131029367447 0.12192536294460296
Epoch    56: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  57


0.1414941930770874 0.12200787216424942
Epoch:  58


0.14053038954734803 0.12208198457956314
Epoch:  59


0.14123001873493193 0.12192733734846115
Epoch:  60


0.1421432065963745 0.12202358841896058
Epoch:  61


0.1408602398633957 0.12221313416957855
Epoch:  62


0.14050115823745726 0.12192574590444565
Epoch    62: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  63


0.14134932160377503 0.1221013069152832
Epoch:  64


0.14150253295898438 0.12215630561113358
Epoch:  65


0.14185467958450318 0.12184474468231202
Epoch:  66


0.1419853276014328 0.12220243960618973
Epoch:  67


0.14126621842384338 0.12212043404579162
Epoch:  68


0.1419730478525162 0.12203460931777954
Epoch    68: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  69
