In [1]:
# Parameters
# until_x: index into `mv2.features`. Task5Model re-initialises every child
# module whose index is greater than this value, and the value is also embedded
# in the checkpoint filename written by the training loop.
until_x = 1


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table. After ``reset_index`` it must expose an
        ``audio_filename`` column; rows sharing a filename are numbered
        1, 2, 3, ... in order of appearance and slots 1-3 are used.
    unknown_to_known : dict
        Maps each "unknown" label column to the known label column(s) that
        must be masked out wherever that unknown column exceeds 0.5.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_features)``; each file's array is
        loaded from disk, transposed and truncated to at most 635 frames.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, 3)``, one slice per annotation slot.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 everywhere except the masked entries (0).
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the annotation rows of every file and index by (file, slot).
    frame = df.reset_index()
    frame['slno'] = frame.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    frame = frame.set_index(['audio_filename', 'slno'])

    # Split the "unknown" columns off from the label columns that are predicted.
    unknown_part = frame.loc[:, unknown_cols].copy()
    labels = frame.drop(columns=unknown_cols)

    # The mask starts as all ones and is zeroed wherever an unknown flag is set.
    mask = labels.copy()
    mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        flagged = unknown_part[unknown] > 0.5
        mask.loc[flagged, known] = 0

    def by_slot(table):
        # Make the annotation slot the outer index level so a slot can be sliced out.
        return table.swaplevel(i=1, j=0, axis=0).sort_index()

    def stack_slots(table):
        # (n_files, n_labels) per slot -> (n_files, n_labels, 3)
        return np.stack([table.loc[[slot], :].values for slot in (1, 2, 3)], axis=2)

    labels = by_slot(labels)
    mask = by_slot(mask)

    y = stack_slots(labels)
    y_mask = stack_slots(mask)

    # Load one spectrogram per file, in the same order as the rows of slot 1.
    filenames = labels.loc[[1], :].reset_index(1).audio_filename.tolist()
    spectrograms = [
        np.load('../../data/logmelspec/{}.npy'.format(name)).T[:635, :]
        for name in filenames
    ]
    X = np.stack(spectrograms, axis=0)[:, np.newaxis]

    return X, y, y_mask


# Shared RandomErasing instance, constructed with its default arguments.
# AudioDataset.__getitem__ applies it to each sample when a transform is set.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (sample, labels, loss weights) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis; ``__getitem__`` cycles each sample
        along its axis 1 and treats it as a single-channel image.
    y : torch.Tensor
        Targets, indexed along the first axis in step with ``X``.
    weights : torch.Tensor
        Per-target loss weights/masks, indexed like ``y``.
    transform : callable or None
        Albumentations-style callable invoked as ``transform(image=array)`` and
        returning a dict with an ``'image'`` tensor. When falsy, samples are
        returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converts the tensor sample into a PIL image for the transform pipeline.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample if a transform is set."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole sample into NaN. Scale by 1 instead: the sample becomes all
            # zeros and the revert step below restores the constant value.
            scale = value_range if value_range > 0 else torch.ones_like(value_range)
            sample = (sample - this_min) / scale

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the last two axes of the transformed image
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only open a trusted metadata.pkl.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
label_cols = ['fine', 'coarse']
unknown_rows = taxonomy_df.loc[taxonomy_df.fine_id == 'X', label_cols]
known_rows = taxonomy_df.loc[taxonomy_df.fine_id != 'X', label_cols]

# Pair every fine label whose fine_id is 'X' with the other fine labels of the
# same coarse class; the merge suffixes name the two columns fine_x / fine_y.
unknown_known_pairs = (
    pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = (
    unknown_known_pairs
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_rows['fine'].tolist()

# Coarse and fine annotations side by side, one table per split.
train_parts = [metadata['coarse_train'], metadata['fine_train']]
valid_parts = [metadata['coarse_test'], metadata['fine_test']]
train_df = pd.concat(train_parts, axis=1, sort=True)
valid_df = pd.concat(valid_parts, axis=1, sort=True)

In [5]:
# manual correction for one data point
# (any row whose columns sum to exactly 37 is zeroed out, in both splits)
for split_df in (train_df, valid_df):
    rows_to_clear = split_df.sum(axis=1) == 37
    split_df.loc[rows_to_clear, :] = 0

In [6]:
# Build the model arrays for both splits. As returned by prepare_data:
#   *_X      -> (n_files, 1, n_frames, n_features), loaded from ../../data/logmelspec
#   *_y      -> (n_files, n_labels, 3), one slice per annotation slot
#   *_y_mask -> same shape as *_y; 0 where a label must not contribute to the loss
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed on the training split only and reused for validation.
n_channels = 128
stat_shape = (1, 1, 1, -1)  # broadcasts along the last axis of the 4-D arrays
channel_means = np.mean(train_X.reshape(-1, n_channels), axis=0).reshape(stat_shape)
channel_stds = np.std(train_X.reshape(-1, n_channels), axis=0).reshape(stat_shape)
train_X = np.divide(np.subtract(train_X, channel_means), channel_stds)
valid_X = np.divide(np.subtract(valid_X, channel_means), channel_stds)

In [8]:
# Define the data augmentation transformations
# Applied in order: small shift/scale/rotate, grid distortion, conversion to tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
def _as_tensors(*arrays):
    """Convert each numpy array into a torch.Tensor."""
    return tuple(torch.Tensor(array) for array in arrays)


# Only the training split is augmented; validation samples are served as-is.
train_dataset = AudioDataset(*_as_tensors(train_X, train_y, train_y_mask),
                             albumentations_transform)
valid_dataset = AudioDataset(*_as_tensors(valid_X, valid_y, valid_y_mask),
                             None)

batch_size = 64
val_loader = DataLoader(valid_dataset, batch_size, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain pairs of batches for mixup.
train_loader_1 = DataLoader(train_dataset, batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size, shuffle=True)

In [10]:
# Define the device to be used
# Use the GPU only when one is actually available, so the notebook also runs on
# CPU-only machines instead of failing at the first `.to(device)` call.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place when it offers ``reset_parameters``.

    Objects without that method are left untouched, which makes the function
    safe to pass to ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    A small 1x1-convolution stem maps the single input channel to three
    channels, the MobileNetV2 ``features`` stack processes the result, the
    feature map is max-pooled over its last two axes, and a two-layer head
    produces ``num_classes`` logits.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_after : int or None
        Child modules of ``mv2.features`` with an index greater than this value
        are re-initialised; the others keep their pretrained weights. ``None``
        (the default) falls back to the notebook-level ``until_x`` parameter,
        which is the behaviour of the original one-argument constructor.
    """

    def __init__(self, num_classes, reset_after=None):

        super().__init__()

        # Resolve the threshold up front so the class no longer depends on the
        # global implicitly when a value is passed in.
        if reset_after is None:
            reset_after = until_x

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset after ith layer of mv2
        for i, x in enumerate(self.mv2.features.children()):
            if i > reset_after:
                x.apply(weight_reset)

    def forward(self, x):
        """Return one logit per class for each input in the batch."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Max over the last axis, then over the new last axis: a global max-pool
        # of the feature map that leaves (batch, channels).
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [13]:
# Instantiate the model
# 31 output logits. NOTE(review): presumably the number of label columns left
# in train_y after prepare_data drops the "unknown" columns -- confirm.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Per-element BCE on logits (reduction='none') so the training loop can apply
# its masks before reducing.
criterion = nn.BCEWithLogitsLoss(reduction='none')
# Adam with the AMSGrad variant over every model parameter.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# Stepped with the validation loss once per epoch; verbose=True reports each reduction.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, patience=5, verbose=True)

In [15]:
# Training with mixup between two independently shuffled passes over the
# training set, a masked BCE loss over the annotation slots, checkpointing on
# the best validation loss, early stopping and LR-on-plateau scheduling.
epochs = 100
early_stopping_patience = 25  # stop after this many epochs without a new best validation loss
alpha = 1  # mixup coefficients are drawn from Beta(alpha, alpha)
checkpoint_path = './model_system1_until_{}'.format(until_x)

train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss summed over annotation slots.

    ``outputs`` are the model logits. ``labels`` and ``masks`` carry one slice
    per annotation slot on their last axis; every slice is scored against the
    same logits, weighted element-wise by its mask, and the grand total is
    divided by the sum of all mask values.
    """
    total = 0
    for slot in range(labels.shape[-1]):
        per_element = criterion(outputs, labels[:, :, slot]) * masks[:, :, slot]
        total = total + per_element.sum()
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs: one coefficient per example, shared by the input,
        # its labels and its masks
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.view(-1, 1, 1, 1)
        lam_targets = lam.view(-1, 1, 1)

        inputs = (lam_inputs * i1[0]) + ((1 - lam_inputs) * i2[0])
        labels = (lam_targets * i1[1]) + ((1 - lam_targets) * i2[1])
        masks = (lam_targets * i1[2]) + ((1 - lam_targets) * i2[2])
        # mixup ends

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no gradients, no optimizer involvement) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # NOTE: both epoch losses are means of per-batch losses, so a smaller final
    # batch carries the same weight as a full one.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Keep only the weights of the best epoch so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), checkpoint_path)
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6151559054851532 0.4423244765826634
Epoch:  1


0.2876567909040967 0.1981276422739029
Epoch:  2


0.19033270429920507 0.20681936187403543
Epoch:  3


0.18472215815170392 0.17374775452273233
Epoch:  4


0.1810339914785849 0.16994118690490723
Epoch:  5


0.17927119216403445 6.538023063114712
Epoch:  6


0.17740113469394478 0.1738112441131047
Epoch:  7


0.17682196280440768 0.1617745586803981
Epoch:  8


0.1750335749742147 0.15798657281058176
Epoch:  9


0.1759481691830867 0.1615989931992122
Epoch:  10


0.17368559740685127 0.1573089701788766
Epoch:  11


0.17341000043057106 0.15565712111336844
Epoch:  12


0.17132928000914083 0.15181608498096466
Epoch:  13


0.17016440267498428 0.15458111039229802
Epoch:  14


0.17115503066294901 0.15533763808863504
Epoch:  15


0.1700082377002046 0.15163375437259674
Epoch:  16


0.16794769707563761 0.14705672647271836
Epoch:  17


0.16696792518770373 0.15162513937268937
Epoch:  18


0.1661520882232769 0.15051514548914774
Epoch:  19


0.16544170959575757 0.15233320210661208
Epoch:  20


0.16395819710718618 0.15341079235076904
Epoch:  21


0.16332515188165614 0.14739163432802474
Epoch:  22


0.1607639346573804 0.1412774303129741
Epoch:  23


0.16114839871187467 0.14301399460860661
Epoch:  24


0.15980211423861013 0.14316203870943614
Epoch:  25


0.1612033332521851 0.13866992507662093
Epoch:  26


0.1603830944041948 0.1368569199528013
Epoch:  27


0.15958185453672666 0.1346737433757101
Epoch:  28


0.15963352532000155 0.13458471106631414
Epoch:  29


0.15876757293134122 0.13605685532093048
Epoch:  30


0.15834877539325404 0.13626798135893686
Epoch:  31


0.15754880897096685 0.13450103678873607
Epoch:  32


0.15772706793772207 0.13261159935167857
Epoch:  33


0.1559730897078643 0.13503631417240416
Epoch:  34


0.15761352551949992 0.13151487495218003
Epoch:  35


0.15586685409417023 0.1303795310003417
Epoch:  36


0.1558512344553664 0.13286165254456656
Epoch:  37


0.15518447595673637 0.13327227107116155
Epoch:  38


0.1549672778393771 0.1312959481562887
Epoch:  39


0.15556337704529632 0.1309399881533214
Epoch:  40


0.155314811983624 0.13114765605756215
Epoch:  41


0.1537608749963142 0.12939089110919408
Epoch:  42


0.1532579970520896 0.128399524305548
Epoch:  43


0.15462214962856188 0.12924428177731379
Epoch:  44


0.15392034963981524 0.12882648301976068
Epoch:  45


0.15341349711289276 0.12881227795566833
Epoch:  46


0.15415061003453023 0.13214451287473952
Epoch:  47


0.15247146865806063 0.12876681770597184
Epoch:  48


0.15379057502424395 0.12893706666571753
Epoch    48: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  49


0.15218753347525726 0.12690621614456177
Epoch:  50


0.15171840907754125 0.12593102987323487
Epoch:  51


0.15133877098560333 0.12589982684169496
Epoch:  52


0.1511775343804746 0.12586630880832672
Epoch:  53


0.1501420716176162 0.125041341142995
Epoch:  54


0.14983785434349164 0.1252447749887194
Epoch:  55


0.15005656070000417 0.12467186046498162
Epoch:  56


0.15097684836065448 0.12512779874461039
Epoch:  57


0.15169624984264374 0.1252616516181401
Epoch:  58


0.1496105749864836 0.12509571015834808
Epoch:  59


0.14915420235814275 0.12490877296243395
Epoch:  60


0.15055078711058642 0.12484517906393323
Epoch:  61


0.14843712142995885 0.12474855141980308
Epoch    61: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  62


0.1496453289244626 0.12476155800478798
Epoch:  63


0.1496338683205682 0.1245573205607278
Epoch:  64


0.15042757665788806 0.12454637885093689
Epoch:  65


0.14923460056652893 0.12448444323880332
Epoch:  66


0.14909223648341927 0.12444923605237689
Epoch:  67


0.15126640128122792 0.12452087551355362
Epoch:  68


0.1488974569617091 0.12444631223167692
Epoch:  69


0.14927481235684575 0.12464982803378787
Epoch:  70


0.1499554917619035 0.1246040261217526
Epoch:  71


0.15000578237546458 0.12448882524456296
Epoch:  72


0.14948493000623342 0.1242021034870829
Epoch:  73


0.15001983417047038 0.12435571104288101
Epoch:  74


0.14859810874268814 0.12427292125565666
Epoch:  75


0.1503316776172535 0.12442803382873535
Epoch:  76


0.14953046030289419 0.1246280510510717
Epoch:  77


0.14921849160581022 0.12434682143586022
Epoch:  78


0.14912568556295858 0.12425626282181058
Epoch    78: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  79


0.14897187980445656 0.12427520858389991
Epoch:  80


0.14950239698629122 0.12421639689377376
Epoch:  81


0.14831443292063637 0.12451875422682081
Epoch:  82


0.14974261739769498 0.12460429540702275
Epoch:  83


0.14930450876016874 0.12447793994631086
Epoch:  84


0.14995053531350316 0.12420920389039176
Epoch    84: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  85


0.14989352065163689 0.12432972767523356
Epoch:  86


0.14925820320039182 0.12458160838910512
Epoch:  87


0.14919626269791578 0.12452367373875209
Epoch:  88


0.15034433152224566 0.12436025163957051
Epoch:  89


0.1504714231233339 0.12424180975982121
Epoch:  90


0.14993766473757253 0.12406101929289955
Epoch:  91


0.14975618067625407 0.124345991228308
Epoch:  92


0.14987988206180367 0.12427189413990293
Epoch:  93


0.15074849692550865 0.12449621834925242
Epoch:  94


0.1486192776544674 0.12436490931681224
Epoch:  95


0.14872012347788424 0.12425863636391503
Epoch:  96


0.1508110584439458 0.12456500104495458
Epoch    96: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  97


0.15030251362839261 0.12426525886569705
Epoch:  98


0.14861379765175484 0.12484826147556305
Epoch:  99


0.14947511336287936 0.12447164207696915
