In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec', n_frames=635):
    """Build model inputs, per-annotator targets and loss masks from a label frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table whose index is named ``audio_filename``. Each file is
        expected to appear once per annotation set; rows of a file are numbered
        1, 2, 3 in their order of appearance. The caller's frame is not
        modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        whose loss must be masked out when that unknown label is set (> 0.5).
        The unknown columns themselves are removed from the targets.
    logmelspec_dir : str, optional
        Directory containing one ``<audio_filename>.npy`` array per file.
        Defaults to the location that was previously hard-coded.
    n_frames : int, optional
        Number of leading rows kept from each (transposed) array. Defaults to
        the previously hard-coded 635.

    Returns
    -------
    X : numpy.ndarray
        Stacked arrays with shape ``(n_files, 1, frames, features)``.
    y : numpy.ndarray
        Targets with shape ``(n_files, n_known_labels, 3)``; the last axis
        indexes the annotation set.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where a label must be ignored, 1 elsewhere.
    """
    # reset_index returns a new frame, so nothing below touches the caller's df.
    df = df.reset_index()
    # Number the rows of each file 1..k in order of appearance.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that are covered
    # by an "unknown" label the annotator selected.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that each annotation set can be
    # selected as one block, with files in the same sorted order in each block.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_sets = (1, 2, 3)
    y = np.stack([df.loc[[k], :].values for k in annotation_sets], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in annotation_sets],
                      axis=2)

    # File order is taken from the first annotation set so X lines up with y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing augmenter built with its default constructor arguments.
# AudioDataset.__getitem__ looks this name up at module level and applies it to
# every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with optional on-the-fly augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs; the first axis indexes samples. The augmentation path slices
        each sample as ``sample[:, i:, :]``, i.e. it expects 3-D samples.
    y : torch.Tensor
        Targets, indexed along the first axis like ``X``.
    weights : torch.Tensor
        Per-target loss masks/weights, indexed like ``y``.
    transform : callable or None
        Albumentations-style callable invoked as ``transform(image=array)``
        and returning a dict with an ``'image'`` tensor. When falsy, samples
        are returned untouched.

    Notes
    -----
    The augmentation path also uses the module-level ``random_erasing``
    callable.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converts the scaled tensor to a PIL image before the image transforms.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for ``idx``, augmented if enabled."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole sample into NaN, so divide by 1 instead (scaled sample is
            # then all zeros and is mapped back to the constant below).
            scale = value_range if value_range > 0 else 1.0
            sample = (sample - this_min) / scale

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            # NOTE(review): the PIL round trip presumably quantises the scaled
            # values to 8 bits - confirm this precision loss is intended.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # Restore the leading axis and swap the last two axes back.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code - only open metadata
# files that come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
unknown_rows = taxonomy_df.loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy_df.loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']]

# Pair every "unknown" fine label (fine_id == 'X') with the known fine labels
# that share its coarse class; the merge suffixes them as fine_x / fine_y.
label_pairs = (
    pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = {
    unknown: list(known)
    for unknown, known in label_pairs.groupby('fine_x')['fine_y']}
known_labels = known_rows.fine.tolist()

# Coarse and fine label tables side by side, one frame per split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Rows whose labels sum to 37 are zeroed out in both splits.
# NOTE(review): 37 presumably equals the total number of label columns, i.e.
# an annotation with every label ticked - confirm.
for labels_df in (train_df, valid_df):
    all_set_rows = labels_df.sum(axis=1) == 37
    labels_df.loc[all_set_rows, :] = 0

In [5]:
# Build inputs (X), per-annotator targets (y) and loss masks (y_mask) for the
# training and validation splits with the same unknown -> known label mapping.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# The channel count is read from the last axis of the training data instead of
# being hard-coded (it used to be a literal 128), so a different feature size
# cannot silently mix channels in the reshape below.
n_channels = train_X.shape[-1]
train_flat = train_X.reshape(-1, n_channels)
# Statistics come from the training split only and are reused for validation.
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Pipeline applied by AudioDataset to training samples only (the validation
# dataset is built with transform=None): a small random shift/scale/rotate,
# then a grid distortion with its default settings, then conversion of the
# image array to a torch tensor.
# NOTE(review): units of the *_limit arguments follow the albumentations API
# (rotate_limit presumably in degrees) - confirm against the installed version.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain the sample pairs that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook still runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` when it exposes ``reset_parameters``.

    Objects without that attribute are left untouched, which makes the
    function suitable for ``Module.apply``. Returns ``None``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Index of the last child of the MobileNetV2 feature extractor whose weights
# Task5Model re-initialises (children 0..until_x are reset). The None default
# must be overridden before Task5Model is built, because the constructor
# compares an integer index against this value.
until_x = None

In [12]:
# Parameters
# Overrides the None default assigned in the previous cell.
# NOTE(review): this looks like a parameters cell injected by a notebook
# runner such as papermill - confirm before editing the value by hand.
until_x = 18


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel inputs.

    The network has three stages: a pair of 1x1 convolutions that expand the
    single input channel to three, the MobileNetV2 feature extractor, and a
    two-layer head that emits ``num_classes`` raw logits.

    The constructor reads the module-level ``until_x`` and re-initialises the
    feature-extractor children with index ``0..until_x``; the remaining
    children keep their pretrained weights.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Normalise the single-channel input and map it to 3 channels.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classification head on top of the 1280 pooled feature channels.
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Reset until ith layer of mv2
        for position, block in enumerate(self.mv2.features.children()):
            if position > until_x:
                break
            block.apply(weight_reset)

    def forward(self, x):
        """Return logits with one row per input sample."""
        three_channel = self.bw2col(x)
        feature_map = self.mv2.features(three_channel)
        # Global max pooling: reduce the last axis, then the new last axis.
        pooled, _ = feature_map.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [14]:
# Instantiate the model
# The head emits 31 logits per sample.
# NOTE(review): 31 presumably equals the number of label columns left after
# prepare_data drops the "unknown" ones - confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the epoch validation loss in the training loop
# and lowers the learning rate once that loss has plateaued (patience=5).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps one loss value per element so the training loop can
# multiply by the label masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
epochs = 100
# Stop once this many consecutive epochs fail to improve the validation loss.
early_stopping_patience = 25
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss averaged over every annotation set.

    ``outputs`` holds the logits; ``labels`` and ``masks`` carry one slice per
    annotation set along their last axis. The same logits are scored against
    each slice with the module-level ``criterion``, the element-wise losses
    are weighted by the matching mask slice, and the total is divided by the
    sum of all mask values.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently, so each step
    # receives two random batches to mix.
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One Beta(alpha, alpha) coefficient per sample in the batch.
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        # Labels and masks are blended with the same coefficients.
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            # calculate loss for each set of annotations
            loss = masked_bce_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no gradients, no parameter updates) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Average of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the final epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if this_epoch_valid_loss < lowest_val_loss:
        # New best model: checkpoint it and restart the patience counter.
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6271815348315883 0.4726466195923941
Epoch:  1


0.31557346276334813 0.20260353812149592
Epoch:  2


0.19170958770288005 0.18621987104415894
Epoch:  3


0.1825923102127539 0.1814122051000595
Epoch:  4


0.18169298848590335 0.21444301732948848
Epoch:  5


0.17782729139199127 0.1763803937605449
Epoch:  6


0.17541822911919774 0.1620198062488011
Epoch:  7


0.17390275484806783 0.16339303765978133
Epoch:  8


0.17498078821478663 0.17454523486750467
Epoch:  9


0.17318530179358818 0.15998118477208273
Epoch:  10


0.1730389776262077 0.17830651359898703
Epoch:  11


0.17103931710526749 0.16181006814752305
Epoch:  12


0.17069458075471827 0.1537052776132311
Epoch:  13


0.1683115387285078 0.15295656876904623
Epoch:  14


0.16829420142882578 0.15007738981928145
Epoch:  15


0.16844379458878492 0.15423414324011123
Epoch:  16


0.16776490493400678 0.14631557571036474
Epoch:  17


0.16639319263600014 0.161609981741224
Epoch:  18


0.1664701885468251 0.15900409647396632
Epoch:  19


0.16681820677744374 0.14969751877444132
Epoch:  20


0.16552605338998744 0.14290088734456471
Epoch:  21


0.1650820012028153 0.14470842906406947
Epoch:  22


0.16510529856424075 0.14263021626642772
Epoch:  23


0.1643164202168181 0.14222401806286403
Epoch:  24


0.16167214673918648 0.1404589212366513
Epoch:  25


0.16217648419173988 0.14129008459193365
Epoch:  26


0.16152385239665573 0.145247255052839
Epoch:  27


0.1622499215441781 0.1405923558132989
Epoch:  28


0.1609138342979792 0.14381604215928487
Epoch:  29


0.16018858350612022 0.1772365676505225
Epoch:  30


0.16053726665071538 0.1419819093176297
Epoch    30: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  31


0.1594297781183913 0.13412796599524363
Epoch:  32


0.15970841533428914 0.13347182742186955
Epoch:  33


0.15803729923995766 0.13416643227849687
Epoch:  34


0.15799871489808365 0.1334086177604539
Epoch:  35


0.15800826050139763 0.13300588726997375
Epoch:  36


0.15707978968684738 0.13224666565656662
Epoch:  37


0.15621413411320867 0.13284459177936828
Epoch:  38


0.1572800761944539 0.13214103132486343
Epoch:  39


0.15767694284787048 0.13229272727455413
Epoch:  40


0.15816716769257108 0.13228770664760045
Epoch:  41


0.15715484482211037 0.13249963096209935
Epoch:  42


0.1568230185154322 0.1319857154573713
Epoch:  43


0.15797376350776568 0.13239382739577973
Epoch:  44


0.15688888245337718 0.1312831523162978
Epoch:  45


0.15653597664188695 0.13186786323785782
Epoch:  46


0.15614018609394897 0.13137205583708628
Epoch:  47


0.15730984992272146 0.13159975303070887
Epoch:  48


0.15727876650320516 0.13161809955324447
Epoch:  49


0.1579349931027438 0.13220072644097464
Epoch:  50


0.15742632263415568 0.13178438054663794
Epoch    50: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  51


0.15739875951328794 0.1314262364591871
Epoch:  52


0.15799498396950798 0.13151880460126059
Epoch:  53


0.1547003307858029 0.1316223985382489
Epoch:  54


0.15494329421906858 0.13158579170703888
Epoch:  55


0.15610707490830808 0.13111615180969238
Epoch:  56


0.1564260664823893 0.13139977306127548
Epoch:  57


0.15539208739190488 0.1311011804001672
Epoch:  58


0.15611219849135424 0.13162754050322942
Epoch:  59


0.15588759328867938 0.131580294242927
Epoch:  60


0.15640585406406507 0.1312047275049346
Epoch:  61


0.15786351505163554 0.13137432719979966
Epoch:  62


0.15577129697477496 0.13115462767226355
Epoch:  63


0.15614346153027303 0.1311943115932601
Epoch    63: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  64


0.15607414092566516 0.13128303842885153
Epoch:  65


0.15645421356768222 0.1308250448533467
Epoch:  66


0.15649701010536504 0.1310010233095714
Epoch:  67


0.1572235943497838 0.13128251369510377
Epoch:  68


0.15658445736846408 0.1315117340002741
Epoch:  69


0.15726866029404304 0.1310753332717078
Epoch:  70


0.15584559215081706 0.13125888471092498
Epoch:  71


0.1560213827603572 0.13126343275819505
Epoch    71: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  72


0.1560205648074279 0.13122353915657317
Epoch:  73


0.15629888023879077 0.13140677873577392
Epoch:  74


0.15640224717758797 0.13077775601829802
Epoch:  75


0.15672121217121948 0.13110304410968507
Epoch:  76


0.15773298128231153 0.13131653943232127
Epoch:  77


0.1555066914171786 0.13134055584669113
Epoch:  78


0.1545720072211446 0.13100821099111012
Epoch:  79


0.15632784608248118 0.13131139427423477
Epoch:  80


0.15717415269967672 0.13098128246409552
Epoch    80: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  81


0.15683742028635902 0.1317380326134818
Epoch:  82


0.15504948553201314 0.13119672345263617
Epoch:  83


0.15560262066286965 0.13147436720984323
Epoch:  84


0.15649344913057378 0.13117741580520356
Epoch:  85


0.15689598789086212 0.13106238203389303
Epoch:  86


0.15716137193344734 0.13147631819759095
Epoch:  87


0.15499407939008764 0.13104680819170816
Epoch:  88


0.15633732765107541 0.13105632471186773
Epoch:  89


0.1567162065892606 0.13121707205261504
Epoch:  90


0.1571056931405454 0.13123801350593567
Epoch:  91


0.15667188731399742 0.1313322282263211
Epoch:  92


0.1566009634249919 0.13080785317080362
Epoch:  93


0.1553120045243083 0.13096965849399567
Epoch:  94


0.15618179899615212 0.13113390228578023
Epoch:  95


0.15621314781743126 0.13137538518224443
Epoch:  96


0.1553025571881114 0.13111116737127304
Epoch:  97


0.15475863741861806 0.13109158830983297
Epoch:  98


0.15670494211686625 0.13114125494446074
Epoch:  99
