In [1]:
# Parameters
# until_x is an index into the children of `mv2.features` inside Task5Model:
# every child whose index is greater than until_x has its parameters reset,
# so -1 re-initialises all of them.
until_x = -1


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, n_annotations=3, n_frames=635,
                 spec_dir='../../data/logmelspec'):
    """Build the input, target and loss-mask arrays for one data split.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column (i.e. the index is expected to carry that
        name); rows sharing a filename are numbered 1, 2, ... in their
        existing order. The caller's frame is not modified.
    unknown_to_known : dict
        Maps each "unknown" column name to the list of known column names
        that must be masked out whenever that unknown column is > 0.5.
        The unknown columns themselves are removed from the targets.
    n_annotations : int, default 3
        Number of annotation slots (numbered 1..n_annotations) stacked along
        the last axis of ``y`` and ``y_mask``. Every file must have a row for
        each slot, otherwise the stacking fails with a shape error.
    n_frames : int, default 635
        Maximum number of leading rows kept from each transposed spectrogram.
    spec_dir : str, default '../../data/logmelspec'
        Directory holding one ``<audio_filename>.npy`` array per file.

    Returns
    -------
    X : numpy.ndarray
        Shape (n_files, 1, frames, bins); each stored array is transposed and
        cropped to ``n_frames`` rows (assumes the stored arrays are 2-D).
    y : numpy.ndarray
        Shape (n_files, n_known_columns, n_annotations).
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label contributes to the loss and 0
        where the matching unknown column was set for that annotation.
    """
    # Number the annotations of each file 1, 2, 3, ... and index by (file, slot)
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns away from the target columns
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels wherever the
    # corresponding unknown label was selected
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation slot first so one slot can be sliced at a time;
    # sorting keeps the file order identical across slots
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    # Load the spectrograms in the same (sorted) file order as the targets
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames], axis=0)
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (built with that class's default arguments);
# AudioDataset.__getitem__ calls it on every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding (sample, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X, y, weights
        Indexable containers sharing their first dimension; item ``idx`` of
        each is returned together. The augmentation path uses tensor methods
        on the sample, so X is presumably a torch tensor whose items are
        (1, frames, bins) - TODO confirm against the caller.
    transform : callable or None
        Called as ``transform(image=ndarray)`` and expected to return a
        mapping with an ``'image'`` entry. When falsy, samples are returned
        untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        # Converter used to hand the sample to the image-based transforms
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def _augment(self, spec):
        """Run the random augmentation chain on one sample and return it."""
        # Rescale to [0, 1]; the range is remembered so it can be restored
        lowest, highest = spec.min(), spec.max()
        span = highest - lowest
        spec = (spec - lowest) / span

        # Rotate the sample along dim 1 by a random offset (wrap-around)
        offset = np.random.randint(spec.shape[1])
        head, tail = spec[:, offset:, :], spec[:, :offset, :]
        spec = torch.cat([head, tail], dim=1)

        # Image-space transforms operate on an ndarray built from a PIL image
        as_image = np.array(self.pil(spec))
        spec = self.transform(image=as_image)['image']
        spec = spec[None, :, :].permute(0, 2, 1)

        # Random erasing works on a detached copy of the transformed tensor
        spec = random_erasing(spec.clone().detach())

        # Undo the min-max scaling
        return (spec * span) + lowest

    def __getitem__(self, idx):
        sample = self.X[idx, ...]

        if not self.transform:
            return sample, self.y[idx, ...], self.weights[idx, ...]

        sample = self._augment(sample)
        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Call ``layer.reset_parameters()`` when the layer provides it; otherwise do nothing."""
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()


class Task5Model(nn.Module):
    """Single-channel input -> 3-channel adapter -> MobileNetV2 features -> linear head.

    Parameters
    ----------
    num_classes : int
        Size of the final linear layer's output (one logit per class).

    Notes
    -----
    Reads the module-level ``until_x``: feature blocks of the pretrained
    MobileNetV2 with index greater than ``until_x`` are re-initialised via
    ``weight_reset``.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Batch-norm followed by two 1x1 convolutions mapping 1 -> 10 -> 3 channels
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Keep the pretrained weights up to and including block `until_x`;
        # every later block is reset
        for index, block in enumerate(self.mv2.features.children()):
            if index <= until_x:
                continue
            block.apply(weight_reset)

        # Head: 1280 -> 512 -> num_classes
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        feature_maps = self.mv2.features(self.bw2col(x))
        # Max over the last dimension, then over the (new) last dimension
        pooled, _ = feature_maps.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code - only open a metadata file you trust.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Rows whose fine_id is 'X' are the "unknown" labels; all others are known labels
taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# Pair each unknown label with every known label sharing its coarse value
# (the merge suffixes the two `fine` columns as fine_x / fine_y)
paired = unknown_rows.merge(known_rows, on='coarse', how='inner').drop(columns='coarse')
unknown_to_known = paired.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_rows.fine.tolist()

# Coarse and fine annotation tables side by side, one frame per split
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose values add up to 37 is blanked out (set to 0) in both splits
for split_df in (train_df, valid_df):
    rows_to_clear = split_df.sum(axis=1) == 37
    split_df.loc[rows_to_clear, :] = 0

In [6]:
# Build the inputs (X), stacked annotation targets (y) and loss masks (y_mask)
# for the training and validation splits
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are taken per position along the last axis (128 wide) of the
# training data only, then applied to both splits
train_bins = train_X.reshape(-1, 128)
channel_means = train_bins.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_bins.std(axis=0).reshape(1, 1, 1, -1)
train_X, valid_X = (
    (split - channel_means) / channel_stds for split in (train_X, valid_X))

In [8]:
# Define the data augmentation transformations
# Passed to the training AudioDataset only (the validation dataset gets None).
# Steps run in order: a random shift/scale/rotate with limits 0.1 / 0.1 / 0.5,
# a grid distortion with its default settings, then conversion to a tensor.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served as-is
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=96, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that are mixed together
train_loader_1 = DataLoader(train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is actually available and fall back to the CPU
# otherwise, so the notebook does not fail on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 is the number of output logits of the final linear layer.
# NOTE(review): presumably this equals the number of label columns produced by
# prepare_data - confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)
# Element-wise loss (no reduction) so that it can be multiplied by the masks
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
early_stop_patience = 15  # stop after this many epochs without a new best validation loss
alpha = 1  # both shape parameters of the Beta distribution the mixup weights are drawn from
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked loss of `outputs` against every annotation set in `labels`.

    `labels` and `masks` carry one annotation set per index of their last
    axis; `outputs` is compared against each set with the module-level
    `criterion`, weighted element-wise by the matching mask. The per-set sums
    are added in order and divided by the sum of all mask values.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup: blend the two batches with one random weight per sample
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.view(-1, 1, 1, 1)
        lam_targets = lam.view(-1, 1, 1)

        inputs = (lam_inputs * i1[0]) + ((1 - lam_inputs) * i2[0])
        labels = (lam_targets * i1[1]) + ((1 - lam_targets) * i2[1])
        masks = (lam_targets * i1[2]) + ((1 - lam_targets) * i2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no gradients, no augmentation, no mixup) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Average of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before any early exit so the last epoch's losses are always shown
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stop_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6510431504249573 0.5241865515708923
Epoch:  1


0.4346335005760193 0.23441301882267
Epoch:  2


0.23143528282642364 0.26785407662391664
Epoch:  3


0.18788521707057954 0.17180346250534057
Epoch:  4


0.18061338663101195 0.18080154359340667
Epoch:  5


0.1776644641160965 0.1620098829269409
Epoch:  6


0.17589045345783233 0.18284887373447417
Epoch:  7


0.17511368691921234 0.16166588962078093
Epoch:  8


0.17395678222179412 0.16225016117095947
Epoch:  9


0.17323507130146026 0.1651966542005539
Epoch:  10


0.1732040202617645 13.714415740966796
Epoch:  11


0.17296156227588655 0.1646780103445053
Epoch:  12


0.17109540820121766 0.16035624742507934
Epoch:  13


0.17014532744884492 0.16665844321250917
Epoch:  14


0.16976000428199767 0.15575976073741912
Epoch:  15


0.1692025762796402 0.162337002158165
Epoch:  16


0.1695810341835022 0.1550165593624115
Epoch:  17


0.16904102325439452 0.15690107345581056
Epoch:  18


0.16620673298835753 0.15703480839729309
Epoch:  19


0.16521382987499236 0.14602656960487365
Epoch:  20


0.16649375796318056 0.14433417916297914
Epoch:  21


0.1659581768512726 0.16222269237041473
Epoch:  22


0.1630322027206421 0.1448728084564209
Epoch:  23


0.16363688468933105 0.14507425725460052
Epoch:  24


0.16296228528022766 0.1448879212141037
Epoch:  25


0.16206479489803313 0.1424469083547592
Epoch:  26


0.1623876816034317 0.14632518887519835
Epoch:  27


0.16098429918289184 0.1536359816789627
Epoch:  28


0.1621651190519333 0.14067323505878448
Epoch:  29


0.1592932367324829 0.1351315051317215
Epoch:  30


0.16033467054367065 0.13971295058727265
Epoch:  31


0.16036229252815246 0.13778279423713685
Epoch:  32


0.15935175657272338 0.13903093039989473
Epoch:  33


0.16064504146575928 0.1334162712097168
Epoch:  34


0.15787849843502044 0.13480388522148132
Epoch:  35


0.15838545441627502 0.1395157605409622
Epoch:  36


0.15787481307983398 0.1354101300239563
Epoch:  37


0.1585089695453644 0.13578988015651702
Epoch:  38


0.15887877106666565 0.13614703714847565
Epoch:  39


0.15736017882823944 0.13863543570041656
Epoch    39: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  40


0.1562841033935547 0.13139372318983078
Epoch:  41


0.15618089497089385 0.13086930960416793
Epoch:  42


0.1552931034564972 0.13040044903755188
Epoch:  43


0.15609005749225616 0.12982375770807267
Epoch:  44


0.15499694347381593 0.13068328499794007
Epoch:  45


0.15486956834793092 0.13009984642267228
Epoch:  46


0.15398866474628448 0.1298610523343086
Epoch:  47


0.15484306693077088 0.12991830706596375
Epoch:  48


0.1543469625711441 0.12994323819875717
Epoch:  49


0.15450400114059448 0.13008533120155336
Epoch    49: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  50


0.15450726568698883 0.13023998737335205
Epoch:  51


0.15468094170093535 0.13008929938077926
Epoch:  52


0.1543792688846588 0.13021473437547684
Epoch:  53


0.15400513529777526 0.12979865968227386
Epoch:  54


0.1531508356332779 0.13008664399385453
Epoch:  55


0.1534618067741394 0.12994095385074617
Epoch:  56


0.15553822934627534 0.1299735963344574
Epoch:  57


0.15475835978984834 0.12973471879959106
Epoch:  58


0.1540474098920822 0.12934472858905793
Epoch:  59


0.1552778321504593 0.12959156334400176
Epoch:  60


0.15400401890277862 0.12926141768693925
Epoch:  61


0.15346657872200012 0.12936194241046906
Epoch:  62


0.1552920573949814 0.12949206233024596
Epoch:  63


0.15528885245323182 0.12937778681516648
Epoch:  64


0.15415948927402495 0.12937302738428116
Epoch:  65


0.15513826310634612 0.12961938381195068
Epoch:  66


0.1546683591604233 0.12937046885490416
Epoch    66: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  67


0.15310205936431884 0.12931761890649796
Epoch:  68


0.15471760272979737 0.12935640513896943
Epoch:  69


0.1539503163099289 0.1292928323149681
Epoch:  70


0.15436031341552733 0.12926802784204483
Epoch:  71


0.15374603390693664 0.12931568175554276
Epoch:  72


0.15364044964313506 0.12943792343139648
Epoch    72: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  73


0.15365922272205354 0.12936654090881347
Epoch:  74


0.15499806880950928 0.1294487252831459
Epoch:  75
