In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` the frame must have an
        ``audio_filename`` column (i.e. the index is expected to be named
        ``audio_filename``); every other column is treated as a label column.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of label
        columns whose mask entry is set to 0 on rows where the unknown column
        exceeds 0.5. The unknown columns are removed from the targets.
    logmelspec_dir : str
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int
        Number of leading rows kept from each transposed array. Arrays with
        fewer rows make the final concatenation fail.
    n_annotators : int
        Number of annotation rows (numbered 1..n_annotators in row order) used
        per recording. Recordings are assumed to have at least this many rows;
        any further rows are ignored.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)`` where ``n_bins`` is the first
        dimension of the stored arrays.
    y : numpy.ndarray
        Shape ``(n_files, n_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 everywhere except the entries zeroed through
        ``unknown_to_known``.
    """
    df = df.reset_index()
    # Number the rows of each recording 1, 2, 3, ... in their original order.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns away from the target columns.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and switch off the known labels that an
    # active "unknown" column makes unreliable for that annotation row.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that .loc[[k]] selects the k-th
    # annotation of every recording, ordered by file name.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_numbers = range(1, n_annotators + 1)
    y = np.concatenate(
        [df.loc[[k], :].values[..., np.newaxis] for k in annotation_numbers],
        axis=2)
    y_mask = np.concatenate(
        [y_mask.loc[[k], :].values[..., np.newaxis] for k in annotation_numbers],
        axis=2)

    # Load the features in the same (sorted) file order as the targets.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :],
            axis=0)
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance built with its default arguments.
# AudioDataset.__getitem__ looks this name up globally and applies it after
# the albumentations transforms.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (features, targets, weights) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Features, indexed along the first axis. Each item is treated as a
        3-D tensor and is cycled along its dimension 1
        (presumably ``(1, frames, bins)`` -- confirm against the caller).
    y : torch.Tensor
        Targets, indexed along the first axis.
    weights : torch.Tensor
        Per-target weights / masks, indexed along the first axis.
    transform : callable or None
        Albumentations-style callable invoked as ``transform(image=array)``
        and returning a dict with an ``'image'`` entry. When falsy, items are
        returned without any augmentation.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converter used to hand the sample to the image-based augmentations.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample if enabled."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has a zero range; clamping the denominator
            # avoids 0/0 = NaN. Such a sample scales to all zeros here and is
            # mapped back to its constant value by the revert step below.
            sample = (sample - this_min) / value_range.clamp(min=1e-12)

            # randomly cycle the file
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            # NOTE(review): the round trip through a PIL image presumably
            # quantises the scaled sample to 8 bits -- confirm.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # Re-add the leading axis and swap the last two axes back;
            # assumes the transform returns the 2-D image transposed -- TODO confirm.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: unpickling can execute arbitrary code, so metadata.pkl must come from
# a trusted source.
with open('../../data/metadata.pkl', 'rb') as pkl_file:
    metadata = pickle.load(pkl_file)

taxonomy_df = metadata['taxonomy_df']
# Fine classes whose fine_id is 'X' versus all remaining fine classes.
unknown_fine = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# Pair every 'X' fine class with the other fine classes sharing its coarse
# class, then collect those partners in a list per 'X' class.
fine_pairs = pd.merge(unknown_fine, known_fine, on='coarse', how='inner')
fine_pairs = fine_pairs.drop(columns='coarse')
unknown_to_known = fine_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_fine.fine.tolist()

# Coarse and fine annotation tables side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Any row whose label values sum to 37 is overwritten with zeros, in place,
# in both splits (37 is presumably the total number of label columns -- confirm).
for _split_df in (train_df, valid_df):
    _rows_to_blank = _split_df.sum(axis=1) == 37
    _split_df.loc[_rows_to_blank, :] = 0
del _split_df, _rows_to_blank

In [5]:
# Turn each annotation table into (features, targets, loss masks) arrays.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics come from the training split only and are applied to both splits.
# The features are flattened to rows of 128 values (the last axis is assumed
# to have length 128) so that one mean/std is computed per position.
_train_rows = train_X.reshape(-1, 128)
channel_means = _train_rows.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = _train_rows.std(axis=0).reshape(1, 1, 1, -1)
del _train_rows  # do not keep the un-normalised array alive through this view

train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Applied in order by AudioDataset.__getitem__ to the 2-D image of one sample:
# a small random shift/scale/rotation, a grid distortion, then conversion to a
# torch tensor. NOTE(review): rotate_limit is presumably in degrees, making
# 0.5 a very small rotation -- confirm against the albumentations docs.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are returned as stored.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is actually usable and fall back to the CPU
# otherwise, so the notebook does not fail on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it provides ``reset_parameters``.

    Objects without a ``reset_parameters`` attribute are left untouched.
    Always returns ``None``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default value of the run parameter; the "Parameters" cell below overrides it.
# Task5Model reads this global to decide which mv2.features children to reset.
until_x = None

In [12]:
# Parameters
# Task5Model re-initialises the children of mv2.features whose index is
# <= until_x; the value is also embedded in the checkpoint file name.
until_x = 2


In [13]:
class Task5Model(nn.Module):
    """Single-channel input classifier built on MobileNetV2 features.

    The input passes through ``bw2col`` (batch norm plus two 1x1 convolutions,
    1 -> 10 -> 3 channels), then ``mv2.features``; the result is max-pooled
    over its last two dimensions and fed to ``final`` (1280 -> 512 ->
    ``num_classes``). ``forward`` returns raw logits.

    Parameters
    ----------
    num_classes : int
        Number of output logits.

    Notes
    -----
    The constructor reads the module-level ``until_x``: children of
    ``mv2.features`` with index ``<= until_x`` are re-initialised with
    ``weight_reset``. ``until_x = None`` is treated as "reset nothing"
    (previously it raised ``TypeError`` on the ``int <= None`` comparison).
    """

    def __init__(self, num_classes):

        super().__init__()

        # Learned mapping from one input channel to the three channels that
        # mv2.features is applied to.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2
        self._reset_pretrained_blocks(until_x)

    def _reset_pretrained_blocks(self, last_index):
        """Re-initialise the ``mv2.features`` children with index <= ``last_index``.

        ``None`` leaves every child untouched.
        """
        if last_index is None:
            return
        for i, block in enumerate(self.mv2.features.children()):
            if i <= last_index:
                block.apply(weight_reset)

    def forward(self, x):
        """Return logits for ``x``; the feature map is max-pooled over its last two dims."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 is the number of output logits; the training loop compares the outputs
# element-wise with labels[:, :, k], so it must equal the label dimension of
# the target arrays.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped once per epoch with the validation loss in the training loop.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# weight them with the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
epochs = 100
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE summed over the three annotation sets, divided by ``masks.sum()``.

    ``labels`` and ``masks`` carry the annotation set on their last axis;
    ``outputs`` is compared against each of the first three sets using the
    module-level ``criterion``.
    """
    # calculate loss for each set of annotations
    per_set = [
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(3)
    ]
    return (per_set[0] + per_set[1] + per_set[2]) / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass: every step mixes one batch from each shuffled loader
    this_epoch_train_loss = 0
    for (x_a, y_a, m_a), (x_b, y_b, m_b) in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, x_a.shape[0])

        # One mixing coefficient per sample, shaped to broadcast over the
        # 4-D inputs and over the 3-D labels / masks respectively.
        lam_inputs = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        lam_targets = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))

        inputs = (lam_inputs * x_a) + ((1 - lam_inputs) * x_b)
        labels = (lam_targets * y_a) + ((1 - lam_targets) * y_b)
        masks = (lam_targets * m_a) + ((1 - lam_targets) * m_b)
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            model.train()
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
            this_epoch_train_loss += loss.detach().cpu().numpy()

    # ---- validation pass: no mixup, no gradient tracking
    this_epoch_valid_loss = 0
    for inputs, labels, masks in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        optimizer.zero_grad()
        with torch.set_grad_enabled(False):
            model.eval()
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.detach().cpu().numpy()

    # Average of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint on every new best validation loss and count the epochs since.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping after 25 epochs without a new best validation loss.
    if epochs_without_new_lowest >= 25:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6351700086851377 0.5018532659326281
Epoch:  1


0.3315922960236266 0.20545310420649393
Epoch:  2


0.1853505449520575 0.16077098463262832
Epoch:  3


0.17261416807367996 0.15476001373359136
Epoch:  4


0.16824054597197352 0.15039245358535222
Epoch:  5


0.16558672326642113 0.15421214593308313
Epoch:  6


0.16273091451541796 0.1412127667239734
Epoch:  7


0.16087157460483345 0.1543534334216799
Epoch:  8


0.1613913142197841 0.13736621609755925
Epoch:  9


0.15924518696359685 0.13575159651892527
Epoch:  10


0.15780773436700976 0.13490075511591776
Epoch:  11


0.15716090395643906 0.13461090517895563
Epoch:  12


0.15814641923517794 0.1345941722393036
Epoch:  13


0.1559480682418153 0.13200087738888605
Epoch:  14


0.15624376889821645 0.13751745862620218
Epoch:  15


0.15777116933384458 0.1396248127732958
Epoch:  16


0.157098235713469 0.13559681815760477
Epoch:  17


0.15528201774970904 0.12973009049892426
Epoch:  18


0.15404111308020516 0.12924292896475112
Epoch:  19


0.15525361976108035 0.13426791450807027
Epoch:  20


0.153911348532986 0.13366121692316874
Epoch:  21


0.15300519079775424 0.13202149953160966
Epoch:  22


0.1510104766568622 0.1314180482711111
Epoch:  23


0.15200654719326948 0.12770339527300426
Epoch:  24


0.15292156225926168 0.12928047989095962
Epoch:  25


0.15260972445075577 0.12827057923589433
Epoch:  26


0.151926558162715 0.12678854380335128
Epoch:  27


0.15335340515987292 0.12941239667790277
Epoch:  28


0.15091248782905373 0.1283472063285964
Epoch:  29


0.15179371108879913 0.1371114147560937
Epoch:  30


0.15393382512234352 0.15580778143235616
Epoch:  31


0.15134302707942757 0.12816388266427176
Epoch:  32


0.15090080652688 0.12867656775883266
Epoch    32: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  33


0.14931144706300786 0.12370709436280387
Epoch:  34


0.14775857490462227 0.1236084880573409
Epoch:  35


0.14726356074616714 0.12329697076763425
Epoch:  36


0.14803555929982984 0.1234274495925222
Epoch:  37


0.14697564695332502 0.12316052828516279
Epoch:  38


0.14681817671737155 0.12373299045222146
Epoch:  39


0.14705830168079687 0.1228693180850574
Epoch:  40


0.1462530908552376 0.1226984475340162
Epoch:  41


0.14639447629451752 0.12294230929442815
Epoch:  42


0.1464121736384727 0.12242263555526733
Epoch:  43


0.14611103889104482 0.12252499163150787
Epoch:  44


0.1450778104968973 0.12263033751930509
Epoch:  45


0.14748567423304995 0.12238917712654386
Epoch:  46


0.14656614935075915 0.12248451794896807
Epoch:  47


0.14522275409182986 0.12184025240795952
Epoch:  48


0.14600315206759684 0.12238576263189316
Epoch:  49


0.14551606613236504 0.12294054137808937
Epoch:  50


0.145338582428726 0.12171195873192378
Epoch:  51


0.14569909508163864 0.12185862021786827
Epoch:  52


0.14600433167573568 0.12212038891656059
Epoch:  53


0.14585345298857302 0.12224876561335155
Epoch:  54


0.1456541559986166 0.12213965931109019
Epoch:  55


0.14585008814528183 0.12205619897161211
Epoch:  56


0.14390487324547124 0.12248557380267552
Epoch    56: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  57


0.1450186480541487 0.12236561413322176
Epoch:  58


0.14289378717138962 0.12236569183213371
Epoch:  59


0.14388068463351275 0.12231329615627017
Epoch:  60


0.14477860484574293 0.12212764578206199
Epoch:  61


0.14653855401116447 0.12243605192218508
Epoch:  62


0.14349920644953446 0.12205718670572553
Epoch    62: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  63


0.14474112480073362 0.12203391641378403
Epoch:  64


0.1453140994181504 0.12213559768029622
Epoch:  65


0.144802045177769 0.12215809736933027
Epoch:  66


0.1438188887125737 0.12188167231423515
Epoch:  67


0.14546447227130066 0.12214378693274089
Epoch:  68


0.14359532175837336 0.12217983497040612
Epoch    68: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  69


0.14354427078285734 0.12224236875772476
Epoch:  70


0.14425116737146634 0.12253305315971375
Epoch:  71


0.14452300683872119 0.12203920526163918
Epoch:  72


0.1437146828786747 0.12226293342454093
Epoch:  73


0.14518222494705305 0.12215937674045563
Epoch:  74


0.1433074232694265 0.12227517579283033
Epoch    74: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  75
