In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. The index must be named ``audio_filename``
        (it is turned into a column by ``reset_index``); the columns are the
        label columns, including every key of ``unknown_to_known``.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of label
        columns that must be masked out whenever that unknown label is set
        (value > 0.5) in an annotation.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotation slots per file. Rows of a file are numbered
        1, 2, ... in order of appearance and slots ``1..n_annotators`` are
        used; additional rows of a file are ignored. Every file is expected
        to have at least ``n_annotators`` rows, otherwise stacking fails.

    Returns
    -------
    X : numpy.ndarray, shape (n_files, 1, n_frames, n_features)
    y : numpy.ndarray, shape (n_files, n_labels, n_annotators)
    y_mask : numpy.ndarray, same shape as ``y``
        1 everywhere except 0 for the known labels listed in
        ``unknown_to_known`` when the matching unknown label is set in that
        annotation. Files are ordered by sorted ``audio_filename``.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the annotations of each file 1, 2, 3, ... in order of appearance.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Unknown-label columns only drive the mask; they are not targets.
    df_unknown = df.loc[:, unknown_cols]
    df = df.drop(columns=unknown_cols)

    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so one slot can be selected with .loc[[k]].
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotators = range(1, n_annotators + 1)
    y = np.stack([df.loc[[a], :].values for a in annotators], axis=2)
    y_mask = np.stack([y_mask.loc[[a], :].values for a in annotators], axis=2)

    # File order is taken from the first annotation slot, which matches y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level augmentation object, built with RandomErasing's default arguments
# (RandomErasing comes from the project-local module found via sys.path "..").
# AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples.

    Parameters
    ----------
    X : torch.Tensor
        Samples stacked along dim 0; each ``X[idx]`` is indexed as a 3-D
        tensor below and is expected to have a leading dim of size 1
        (the augmentation path converts it to a single-channel image).
    y, weights : torch.Tensor
        Per-sample targets and loss weights, indexed along dim 0 like ``X``.
    transform : callable or None
        Albumentations-style callable invoked as ``transform(image=array)``
        and returning a dict with an ``'image'`` entry. When falsy, samples
        are returned unchanged.
    erasing : callable or None
        Random-erasing callable applied after ``transform``. When ``None``
        the module-level ``random_erasing`` object is used.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation to [0, 1] (undone at the end)
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would give 0 / 0 = NaN; with a unit range
                # it normalises to zeros and is restored to its constant value.
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along dim 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on an image array)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add the leading dim back and swap the last two axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing on a detached copy
            eraser = self.erasing if self.erasing is not None else random_erasing
            sample = eraser(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: unpickling can execute arbitrary code; only open metadata files you trust.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the fine labels flagged with fine_id 'X' and all the others.
taxonomy_df = metadata['taxonomy_df']
unknown_fine = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# For every 'X' fine label, collect the other fine labels sharing its coarse class.
unknown_to_known = (
    unknown_fine
    .merge(known_fine, on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_fine.fine.tolist()

# Put the coarse and fine annotation columns side by side for each split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [4]:
# manual correction for one data point
# Rows whose label values sum to exactly 37 are zeroed out in both splits.
# NOTE(review): 37 presumably equals the total number of label columns
# (i.e. an annotation with every label set) — confirm.
for frame in (train_df, valid_df):
    row_sums_to_37 = frame.sum(axis=1) == 37
    frame.loc[row_sums_to_37, :] = 0

In [5]:
# Build the inputs (X), per-annotator targets (y) and loss masks (y_mask)
# for the training and validation splits.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Mean and standard deviation are computed per position of the last axis
# (size 128) on the training split only, then applied to both splits.
channel_means, channel_stds = (
    statistic(train_X.reshape(-1, 128), axis=0).reshape(1, 1, 1, -1)
    for statistic in (np.mean, np.std))
train_X, valid_X = (
    (split - channel_means) / channel_stds
    for split in (train_X, valid_X))

In [7]:
# Define the data augmentation transformations
# Steps run in list order; the final step converts the image to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served unchanged.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` via its ``reset_parameters`` method, if it has one.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function suitable for ``module.apply(weight_reset)``.
    Returns ``None``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default value of the `until_x` notebook parameter: Task5Model re-initialises
# the children of its MobileNetV2 feature extractor whose index is <= until_x.
# The "Parameters" cell that follows reassigns it.
until_x = None

In [12]:
# Parameters
# NOTE(review): looks like an injected parameters cell (papermill style) — confirm.
# Index of the last child of mv2.features whose weights Task5Model resets.
until_x = 10


In [13]:
class Task5Model(nn.Module):
    """Single-channel input classifier built on MobileNetV2 features.

    The input goes through a small 1x1-convolution stem that maps 1 channel
    to 3, then through ``mobilenet_v2(...).features``. The resulting feature
    map is max-pooled over its last two dims and fed to a two-layer head that
    returns ``num_classes`` logits.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_until : int or None, optional
        Children of ``mv2.features`` with index ``<= reset_until`` get their
        parameters re-initialised (pretrained weights discarded). When
        ``None`` the module-level ``until_x`` is used; if that is ``None``
        too, no layer is reset.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # Backward compatible default: fall back to the notebook parameter.
        if reset_until is None:
            reset_until = until_x

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2.
        # Comparing an int with None raises TypeError, so None means
        # "keep every pretrained layer".
        if reset_until is not None:
            for i, x in enumerate(self.mv2.features.children()):
                if i <= reset_until:
                    x.apply(weight_reset)

    def forward(self, x):
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # global max pooling over the last two dims
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 is passed as num_classes, i.e. the number of output logits.
# NOTE(review): presumably the number of label columns kept in y — confirm.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad variant enabled, over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The training loop steps this scheduler with the epoch's validation loss;
# verbose=True prints a message whenever the learning rate is reduced.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps element-wise losses so the training loop can multiply
# them by the annotation masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss before stopping
mixup_alpha = 1               # both parameters of the Beta distribution for mixup weights

train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss accumulated over every annotation set.

    ``outputs`` holds one row of logits per sample; ``labels`` and ``masks``
    carry one slice per annotation set along their last dim. For each slice
    the element-wise ``criterion`` loss is multiplied by the matching mask;
    the grand total is divided by the sum of all mask entries.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[-1]))
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently, so each step
    # pairs two random batches of equal size.
    for (x1, y1, m1), (x2, y2, m2) in zip(train_loader_1, train_loader_2):

        # mixup: one coefficient per sample, shared by inputs, labels and masks
        mixup_vals = torch.Tensor(
            np.random.beta(mixup_alpha, mixup_alpha, x1.shape[0]))
        lam = mixup_vals.reshape(-1, 1, 1, 1)
        inputs = (lam * x1) + ((1 - lam) * x2)

        lam = mixup_vals.reshape(-1, 1, 1)
        labels = (lam * y1) + ((1 - lam) * y2)
        masks = (lam * m1) + ((1 - lam) * m2)

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # average of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6327662870690629 0.5320009589195251
Epoch:  1


0.329580381915376 0.20617420971393585
Epoch:  2


0.19017086681481954 0.18758322511400496
Epoch:  3


0.17952435443530212 0.16198973783424922
Epoch:  4


0.17701637462989703 0.16425051433699472
Epoch:  5


0.17372521113704992 0.16188198115144456
Epoch:  6


0.17245658950225726 0.17526050124849593
Epoch:  7


0.1707124790629825 0.1546628177165985
Epoch:  8


0.16854572054502126 0.15324050400938308
Epoch:  9


0.1679845182476817 0.1507048521723066
Epoch:  10


0.16733849531895406 0.147569654243333
Epoch:  11


0.16604647040367126 0.149406584245818
Epoch:  12


0.16533089086816116 0.14748728275299072
Epoch:  13


0.1642644022767608 0.14659352813448226
Epoch:  14


0.16340832855250384 0.14253641345671245
Epoch:  15


0.16338567395468015 0.14200356709105627
Epoch:  16


0.16468627952240608 0.14566914524350846
Epoch:  17


0.16113323456532247 0.1402006202510425
Epoch:  18


0.16162325683477763 0.1425186118909291
Epoch:  19


0.1612076557971336 0.14077721323285783
Epoch:  20


0.15957197909419601 0.13920471923691885
Epoch:  21


0.16097674619507146 0.14341600665024348
Epoch:  22


0.15957794076687581 0.15367809789521353
Epoch:  23


0.1587192299398216 0.13562780086483275
Epoch:  24


0.1591272245387773 0.1319168997662408
Epoch:  25


0.15931167956945058 0.13514038707528794
Epoch:  26


0.1586310339940561 0.13403779800449098
Epoch:  27


0.15927705893645416 0.13479772955179214
Epoch:  28


0.15699061630545436 0.2321298164980752
Epoch:  29


0.15803369597808734 0.13586553079741343
Epoch:  30


0.15751706587301717 0.13296753061669214
Epoch    30: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  31


0.1557940848775812 0.12886015006474086
Epoch:  32


0.15454261286838636 0.1281344879950796
Epoch:  33


0.15435204795888952 0.12780294780220305
Epoch:  34


0.1541769073621647 0.12766661175659724
Epoch:  35


0.1542797612177359 0.12714424303599767
Epoch:  36


0.15425463383262222 0.12672735218490874
Epoch:  37


0.1545315570122487 0.12777811288833618
Epoch:  38


0.15377272021126104 0.12745881719248636
Epoch:  39


0.15268026090957024 0.12722368219069072
Epoch:  40


0.15245304840642052 0.12719769988741195
Epoch:  41


0.15238184824183182 0.12689996191433497
Epoch:  42


0.15421122916646907 0.1266572049685887
Epoch:  43


0.15266613138688578 0.12657743053776876
Epoch:  44


0.15313161547119553 0.12651655077934265
Epoch:  45


0.15358658497397965 0.12622871462787902
Epoch:  46


0.15363431299054944 0.12634656258991786
Epoch:  47


0.15224205843500188 0.12649096654994146
Epoch:  48


0.15228830640380447 0.12693633352007186
Epoch:  49


0.15231094730866923 0.12639468801873072
Epoch:  50


0.1527137877167882 0.1254639827779361
Epoch:  51


0.1522033444127521 0.12651643795626505
Epoch:  52


0.15251746330712293 0.12604858300515584
Epoch:  53


0.15141060344270757 0.12596996128559113
Epoch:  54


0.15216470610451055 0.12539816434894288
Epoch:  55


0.15069518097349116 0.12565013340541295
Epoch:  56


0.15144610566061897 0.12607387346880777
Epoch:  57


0.15162096434348338 0.1261435885514532
Epoch:  58


0.15192901765978015 0.12587058969906398
Epoch:  59


0.15101126440473506 0.1259875340121133
Epoch:  60


0.15143359150435473 0.12708030853952682
Epoch    60: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  61


0.15101401950861956 0.12600092164107732
Epoch:  62


0.1515957221791551 0.12576702662876674
Epoch:  63


0.15085725808465802 0.1261867540223258
Epoch:  64


0.15173332030708725 0.12568134282316482
Epoch:  65


0.15248303437555158 0.12544195247547968
Epoch:  66


0.15095348575630704 0.1256039642861911
Epoch    66: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  67


0.15120513938568733 0.1255269848874637
Epoch:  68


0.152142980614224 0.12542476398604258
Epoch:  69


0.1508081031812204 0.12554704078606196
Epoch:  70


0.1506136394030339 0.12561542647225515
Epoch:  71


0.1503122912065403 0.1253282789673124
Epoch:  72


0.1507114691508783 0.12519395351409912
Epoch:  73


0.15132845454924815 0.12535314900534494
Epoch:  74


0.14995535483231415 0.12561675373997008
Epoch:  75


0.15138538060961543 0.12581240279333933
Epoch:  76


0.15345483414224675 0.12552540749311447
Epoch:  77


0.15080044076249405 0.12542340478726796
Epoch:  78


0.1507781300995801 0.12536882076944625
Epoch    78: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  79


0.15036529584510908 0.1251803476895605
Epoch:  80


0.1509792498640112 0.12568533846310206
Epoch:  81


0.15107553794577316 0.1254086537020547
Epoch:  82


0.1513931010220502 0.125871941447258
Epoch:  83


0.1509810810959017 0.12551753968000412
Epoch:  84


0.15104271793687665 0.12577698166881288
Epoch:  85


0.1515088057195818 0.1252634972333908
Epoch    85: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  86


0.15084145681278124 0.1256483175924846
Epoch:  87


0.15035711752401815 0.12567794535841262
Epoch:  88


0.1515872111997089 0.12557678350380488
Epoch:  89


0.15083899208017298 0.1255903978432928
Epoch:  90


0.1507666392906292 0.12571041073117936
Epoch:  91


0.15014982747065053 0.12540911244494574
Epoch:  92


0.15186047916476791 0.12553509963410242
Epoch:  93


0.15137416245164098 0.12545186281204224
Epoch:  94


0.1509264075272792 0.12543890518801554
Epoch:  95


0.15002869270943306 0.12546601040022715
Epoch:  96


0.1520277136886442 0.12531496158667974
Epoch:  97


0.15098614507430308 0.12538140480007445
Epoch:  98


0.15175260643701297 0.12541691746030534
Epoch:  99


0.15078182679575844 0.12544392155749456
