In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec', n_frames=635):
    """Build model inputs, per-annotation targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table whose index is named ``audio_filename``. The rows of each
        file are numbered 1, 2, 3, ... in order of appearance and the rows
        numbered 1, 2 and 3 are used as the three annotations of that file.
    unknown_to_known : dict
        Maps an "unknown" label column of ``df`` to the list of known label
        columns whose value becomes uncertain when the unknown label is set.
        The unknown columns are removed from the targets.
    logmelspec_dir : str, optional
        Directory containing one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from every transposed array. Every file
        must provide at least this many rows, otherwise the arrays cannot be
        stacked.

    Returns
    -------
    X : numpy.ndarray
        Stacked arrays with a singleton axis inserted at position 1, i.e.
        ``(n_files, 1, n_frames, n_bins)`` for 2-D stored arrays.
    y : numpy.ndarray
        Targets of shape ``(n_files, n_known_labels, 3)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the matching unknown label of that
        annotation is > 0.5, 1 elsewhere.
    """
    df = df.reset_index()
    # Number the annotations of every file 1, 2, 3, ... in order of appearance.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known)
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero out the known labels made
    # uncertain by an active "unknown" label in the same annotation row.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that one annotation can be selected
    # for all files at once; sorting keeps files in the same order everywhere.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    def _stack_annotations(frame):
        # (n_files, n_labels) per annotation -> (n_files, n_labels, 3)
        return np.stack(
            [frame.loc[[slno], :].values for slno in (1, 2, 3)], axis=2)

    y = _stack_annotations(df)
    y_mask = _stack_annotations(y_mask)

    # File order is taken from the first annotation so X lines up with y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (constructed with its default arguments);
# AudioDataset.__getitem__ applies it to augmented samples.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (sample, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs, indexed along the first axis.
        NOTE(review): the augmentation path assumes each ``X[idx]`` is a
        ``(1, rows, cols)`` tensor - confirm against the caller.
    y : torch.Tensor
        Targets, indexed along the first axis.
    weights : torch.Tensor
        Per-target weights/masks, indexed along the first axis.
    transform : callable, optional
        Called as ``transform(image=array)`` and expected to return a mapping
        with an ``'image'`` entry. When falsy, samples are returned unchanged.
    erasing : callable, optional
        Callable applied to the augmented tensor. When ``None`` the
        module-level ``random_erasing`` instance is used.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for ``idx``, augmenting the sample if enabled."""
        sample = self.X[idx, ...]

        if self.transform:
            # Min-max scale to [0, 1]. A constant sample has zero range, which
            # would divide by zero and fill the sample with NaNs, so fall back
            # to a unit range in that case.
            this_min = sample.min()
            span = sample.max() - this_min
            if span == 0:
                span = torch.ones_like(span)
            sample = (sample - this_min) / span

            # Randomly cycle the sample along axis 1: position `shift` becomes
            # the new start and the leading part wraps around to the end.
            shift = int(np.random.randint(sample.shape[1]))
            sample = torch.roll(sample, shifts=-shift, dims=1)

            # Apply the albumentations transforms on an image array.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)['image']
            # Add a leading axis and swap the last two axes of the result.
            sample = sample[None, :, :].permute(0, 2, 1)

            # Apply random erasing on a detached copy.
            erase = self.erasing if self.erasing is not None else random_erasing
            sample = erase(sample.clone().detach())

            # Revert the min-max scaling.
            sample = (sample * span) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# For every "unknown" fine label (fine_id == 'X'), collect the known fine
# labels that belong to the same coarse class.
unknown_to_known = (
    metadata['taxonomy_df']
    .loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']]
    .merge(
        metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
        on='coarse',
        how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', 'fine'].tolist()

# Coarse and fine label tables side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1,
    sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1,
    sort=True)

In [4]:
# manual correction for one data point
# Any row whose labels add up to 37 has all of its labels set to 0.
# NOTE(review): presumably 37 means every label column is switched on - confirm.
train_df.loc[train_df.sum(axis=1).eq(37), :] = 0
valid_df.loc[valid_df.sum(axis=1).eq(37), :] = 0

In [5]:
# Build the inputs, the per-annotation targets and the loss masks for both
# splits (see prepare_data for the layout of the returned arrays).
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics are computed per position of the last axis, on the training set
# only, and the same statistics are applied to the validation set.
# The channel count is read from the data instead of being hard-coded.
n_channels = train_X.shape[-1]
train_flat = train_X.reshape(-1, n_channels)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
# A channel that is constant over the training set has zero deviation; use 1
# there so the division below cannot produce NaN/inf values.
channel_stds = np.where(channel_stds == 0, 1.0, channel_stds)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Pipeline handed to the training AudioDataset, which calls it as
# transform(image=...) and reads the 'image' entry of the result.
albumentations_transform = Compose([
    # shift / scale / rotate, each bounded by the limit given here
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library's default arguments
    GridDistortion(),
    # final conversion of the image array to a tensor
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
# Only the training dataset receives the augmentation pipeline; validation
# samples are returned unchanged.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that are mixed together.
val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place when it offers ``reset_parameters``.

    Objects without a ``reset_parameters`` attribute are left untouched.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Index of the last child of `mv2.features` that Task5Model re-initialises.
# Placeholder value; the parameters cell that follows assigns the real one.
until_x = None

In [12]:
# Parameters
# Children 0..14 (inclusive) of `mv2.features` are re-initialised by
# Task5Model; the value is also part of the saved checkpoint's file name.
until_x = 14


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor with a 1-to-3 channel adapter and a dense head.

    Parameters
    ----------
    num_classes : int
        Number of output logits.

    Notes
    -----
    The module-level ``until_x`` selects how much of the pretrained feature
    extractor is re-initialised: children ``0..until_x`` (inclusive) of
    ``mv2.features`` have their parameters reset. ``until_x = None`` keeps
    all pretrained weights.
    """

    def __init__(self, num_classes):

        super().__init__()

        # Maps the single input channel to the 3 channels fed to mv2.features,
        # using 1x1 convolutions only.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2. Comparing an int with None raises a
        # TypeError, so None is handled explicitly as "reset nothing".
        if until_x is not None:
            for i, x in enumerate(self.mv2.features.children()):
                if i <= until_x:
                    x.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch of single-channel inputs."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max over the last two axes, leaving one value per feature map.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits; the model is moved to the device selected earlier.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam with amsgrad enabled over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss at the end of each epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them by the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
epochs = 100
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE averaged over all annotations of a batch.

    The element-wise loss between ``outputs`` and every annotation slice
    ``labels[:, :, k]`` is weighted by ``masks[:, :, k]``; the weighted
    losses are summed and divided by the sum of all mask values.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently, which yields
    # the pairs of batches that are mixed together.
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        # The same per-sample coefficients mix the targets and the masks.
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no gradients, no optimizer involvement) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Average of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Log before the early-stopping check so the last epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep the weights of the best epoch so far on disk.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping after 25 epochs without a new best validation loss.
    if epochs_without_new_lowest >= 25:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6325988560109526 0.4807093015738896
Epoch:  1


0.32476413209696076 0.20109247735568456
Epoch:  2


0.19180322378068357 0.1954064667224884
Epoch:  3


0.18084975754892504 0.18283421865531377
Epoch:  4


0.17747820430510752 0.1743600538798741
Epoch:  5


0.1755332854148504 0.16153926508767263
Epoch:  6


0.17284816382704554 0.16684495338371821
Epoch:  7


0.1739115815710377 0.1651990839413234
Epoch:  8


0.17161813458880862 0.1647997179201671
Epoch:  9


0.17145159155935855 0.1562189566237586
Epoch:  10


0.16963833167746262 0.16500570944377355
Epoch:  11


0.170117946089925 0.1509476751089096
Epoch:  12


0.16810922807938344 0.1501070601599557
Epoch:  13


0.16804330655046412 0.15312503278255463
Epoch:  14


0.1664508287165616 0.1511068493127823
Epoch:  15


0.16688224431630727 0.1525219657591411
Epoch:  16


0.16532999317388278 0.14362202158996037
Epoch:  17


0.1635167425548708 0.14488665759563446
Epoch:  18


0.16442361635130806 0.1396830603480339
Epoch:  19


0.16289694808624886 0.13951720616647176
Epoch:  20


0.16308240350839254 0.14148617003645217
Epoch:  21


0.16154515984896067 0.14306136114256723
Epoch:  22


0.161194085672095 0.13875032322747366
Epoch:  23


0.16162455484673782 0.1745657612170492
Epoch:  24


0.16111733865093542 0.1440295683486121
Epoch:  25


0.16016194868732142 0.13674102617161615
Epoch:  26


0.15974490384797793 0.13696985585348948
Epoch:  27


0.15840737642468633 0.14384915360382625
Epoch:  28


0.15907942765467875 0.15711438762290136
Epoch:  29


0.15786934462753502 0.1365709432533809
Epoch:  30


0.15843947191496152 0.1400451809167862
Epoch:  31


0.1597127882209984 0.14276016929319926
Epoch:  32


0.15989502740872874 0.14236335349934442
Epoch:  33


0.15930248031745087 0.18603181200368063
Epoch:  34


0.15868337613505287 0.14118785091808864
Epoch:  35


0.15778561661372315 0.14173043199947902
Epoch    35: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  36


0.1563938356734611 0.13219823368958064
Epoch:  37


0.1554854910921406 0.1304037379367011
Epoch:  38


0.15542224291208628 0.1301908226949828
Epoch:  39


0.15445799118763692 0.12987000069447927
Epoch:  40


0.15502841810922366 0.13052017028842652
Epoch:  41


0.15468175628700773 0.12941850189651763
Epoch:  42


0.15496755935050346 0.12955925507204874
Epoch:  43


0.15546353725162712 0.12973748679671968
Epoch:  44


0.15576482584347595 0.1289472888622965
Epoch:  45


0.15535943693405874 0.12951849613870894
Epoch:  46


0.15633868969775536 0.12962457537651062
Epoch:  47


0.1542738027669288 0.12962822509663446
Epoch:  48


0.153710322605597 0.12906975618430547
Epoch:  49


0.15401600099898674 0.12906441305364882
Epoch:  50


0.1541794413650358 0.12909585237503052
Epoch    50: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  51


0.15351207755707405 0.1290721329195159
Epoch:  52


0.1543262737828332 0.12884437612124852
Epoch:  53


0.15417550343113975 0.12921461462974548
Epoch:  54


0.1542782803645005 0.12913778424263
Epoch:  55


0.15402194736777125 0.12916436684983118
Epoch:  56


0.15376432078915672 0.12867906902517592
Epoch:  57


0.15386971952141942 0.1285313069820404
Epoch:  58


0.1530761307961232 0.12887388467788696
Epoch:  59


0.15266033161330866 0.12904516926833562
Epoch:  60


0.15401173805868304 0.12902927824429103
Epoch:  61


0.15299618163624326 0.12920027119772776
Epoch:  62


0.15399026951274356 0.1292435164962496
Epoch:  63


0.15492166941230362 0.12887400069407054
Epoch    63: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  64


0.1523598597661869 0.1294860371521541
Epoch:  65


0.15353084577096476 0.1288063728383609
Epoch:  66


0.15461262617562269 0.12876319885253906
Epoch:  67


0.15325604016716415 0.12934474859918868
Epoch:  68


0.1528246511478682 0.1287836730480194
Epoch:  69


0.15624108830013791 0.1288504653743335
Epoch    69: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  70


0.15405054470977267 0.12876178111348832
Epoch:  71


0.15482253359781728 0.12865486847502844
Epoch:  72


0.15365071312801257 0.12914313269512995
Epoch:  73


0.15341053822556058 0.1288454947727067
Epoch:  74


0.15465930265349312 0.12887008594615118
Epoch:  75


0.15278922061662417 0.12901751909937179
Epoch    75: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  76


0.15370054381924705 0.12885501555034093
Epoch:  77


0.15293741145649473 0.12947231318269456
Epoch:  78


0.1545603726361249 0.1293208407504218
Epoch:  79


0.15280427924684575 0.12884737764086043
Epoch:  80


0.15347006312898687 0.12878870431865966
Epoch:  81


0.15351563410179034 0.12921797590596334
Epoch:  82
