In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 max_frames=635, n_annotations=3):
    """Turn a label table into model inputs, per-annotation targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table with several rows per audio file. After ``reset_index()``
        it must expose an ``audio_filename`` column, and it must contain every
        column named in ``unknown_to_known`` (keys and values).
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be ignored whenever that unknown label is set (> 0.5).
    data_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    max_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotations : int, optional
        Number of label rows consumed per file (rows numbered 1..n_annotations
        in their original order). Every file must have at least this many rows.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, frames, bins)``: the transposed, truncated arrays.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label counts towards the loss and 0
        where an "unknown" label makes the known label unusable.

    Raises
    ------
    ValueError
        If the per-annotation tables or the loaded arrays do not share one
        shape (raised by ``np.stack``).
    """
    df = df.reset_index()
    # Number the rows of every file 1, 2, 3, ... in their original order.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split off the "unknown" columns; only the known labels become targets.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start with every label enabled, then switch off the known labels that an
    # "unknown" annotation makes ambiguous for that particular row.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so .loc[[k]] selects the k-th row of
    # every file, with the files in sorted (hence identical) order each time.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in slots], axis=2)

    # Load the arrays in the same file order as the rows of y / y_mask.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack(
        [np.load('{}/{}.npy'.format(data_dir, name)).T[:max_frames, :]
         for name in filenames],
        axis=0)
    X = np.expand_dims(X, axis=1)  # add a singleton channel axis

    return X, y, y_mask


# Notebook-level RandomErasing instance built with its default arguments.
# AudioDataset.__getitem__ applies it to every augmented training sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (input, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis. When ``transform`` is set each
        item is sliced as ``sample[:, i:, :]``, so items must be 3-D torch
        tensors (``torch.cat`` is applied to the slices).
    y, weights :
        Indexable along the first axis like ``X``; returned unchanged.
    transform : callable, optional
        Called as ``transform(image=array)`` and expected to return a mapping
        with an ``'image'`` entry. ``None`` disables all augmentation.
    erasing : callable, optional
        Applied to the augmented tensor. Defaults to the notebook-level
        ``random_erasing`` object, looked up only when augmentation runs.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting ``sample`` if a transform is set."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole sample into NaNs, so scale by 1 in that case instead.
            scale = value_range if value_range > 0 else torch.ones_like(value_range)
            sample = (sample - this_min) / scale

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            # NOTE(review): the ToPILImage round-trip presumably quantises the
            # values to 8 bits, and the transform output is assumed to be a 2-D
            # tensor with the two axes swapped (hence the permute) - confirm.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (injected callable, else the notebook-level one)
            erasing = self.erasing if self.erasing is not None else random_erasing
            sample = erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open metadata.pkl when it comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
unknown_rows = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# Pair every fine label whose fine_id is 'X' with the other fine labels that
# share its coarse class, then collect those labels into one list per 'X' label.
unknown_known_pairs = unknown_rows.merge(known_rows, on='coarse', how='inner')
unknown_known_pairs = unknown_known_pairs.drop(columns='coarse')
unknown_to_known = unknown_known_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()

known_labels = known_rows.fine.tolist()

# One wide label table per split: coarse columns followed by fine columns.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [4]:
# manual correction for one data point
# Any row whose label values sum to exactly 37 is blanked out (all labels set to 0).
for labels_df in (train_df, valid_df):
    row_is_saturated = labels_df.sum(axis=1) == 37
    labels_df.loc[row_is_saturated, :] = 0

In [5]:
# Build the inputs (X), per-annotation targets (y) and loss masks (y_mask)
# for the training and validation label tables.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics come from the training split only and are reused for validation.
# NOTE(review): train_X / valid_X are overwritten, so running this cell twice
# normalises already-normalised data.
train_flat = train_X.reshape(-1, 128)  # one row per frame, 128 values per row
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)


def _standardize(batch):
    """Subtract the per-channel training mean and divide by the training std."""
    return (batch - channel_means) / channel_stds


train_X = _standardize(train_X)
valid_X = _standardize(valid_X)

In [7]:
# Define the data augmentation transformations
# Steps run in list order: small shift/scale/rotate, then grid distortion,
# then conversion of the result to a tensor.
geometric_jitter = ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5)
grid_warp = GridDistortion()
to_tensor = ToTensor()

albumentations_transform = Compose([geometric_jitter, grid_warp, to_tensor])

In [8]:
# Create the datasets and the dataloaders
def _as_dataset(X, y, y_mask, transform):
    """Wrap the numpy arrays as torch tensors inside an AudioDataset."""
    return AudioDataset(torch.Tensor(X), torch.Tensor(y), torch.Tensor(y_mask), transform)


# Only the training split is augmented; validation samples are served unchanged.
train_dataset = _as_dataset(train_X, train_y, train_y_mask, albumentations_transform)
valid_dataset = _as_dataset(valid_X, valid_y, valid_y_mask, None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the same training set; the training loop
# zips them together to obtain the pairs of batches that mixup blends.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU only when PyTorch can actually see one; otherwise fall back
# to the CPU so the notebook still runs on machines without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` through its own ``reset_parameters`` method.

    Objects that do not define ``reset_parameters`` are left untouched, which
    makes the function safe to pass to ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default for the reset depth used by Task5Model; the "Parameters" cell that
# follows overrides it.
# NOTE(review): looks like a papermill-style default/injected parameter pair - confirm.
until_x = None

In [12]:
# Parameters
# Task5Model re-initialises the children of mv2.features with index <= until_x;
# the value is also part of the checkpoint file name.
until_x = 17


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor with a single-channel front end and a small classifier head.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_until : int, optional
        Children of ``mv2.features`` with index ``<= reset_until`` have their
        parameters re-initialised (the pretrained weights are discarded for
        those layers). When omitted, the notebook-level ``until_x`` is used;
        if that is ``None`` as well, no layer is reset. Pass ``-1`` to keep
        every pretrained layer regardless of ``until_x``.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # Map the 1-channel input to the 3 channels the MobileNetV2 stem expects.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2
        if reset_until is None:
            reset_until = until_x
        self._reset_features(self.mv2.features, reset_until)

    @staticmethod
    def _reset_features(features, last_index):
        """Re-initialise the children of ``features`` with index ``<= last_index``.

        ``None`` means "reset nothing"; comparing an index with ``None`` would
        otherwise raise a ``TypeError``.
        """
        if last_index is None:
            return
        for i, layer in enumerate(features.children()):
            if i <= last_index:
                layer.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch of inputs accepted by ``bw2col``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: collapse the last two axes to their maximum.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits - presumably one per known label column of train_y;
# TODO confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over every model parameter.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The training loop steps the scheduler with the validation loss; the learning
# rate is reduced once that loss has stopped improving for more than 5 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the label masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
epochs = 100
alpha = 1                      # Beta(alpha, alpha) parameter of the mixup coefficients
early_stopping_patience = 25   # epochs without a new best validation loss before stopping

train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE averaged over every usable (sample, label, annotation) entry.

    ``outputs`` holds one logit per (sample, label); ``labels`` and ``masks``
    carry an extra trailing axis with one slice per set of annotations. The
    same logits are scored against every slice, each element-wise loss is
    weighted by its mask, and the total is divided by the sum of the masks.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    # The clamp only matters for a fully masked batch, where it turns 0 / 0
    # (NaN) into a loss of 0; any other batch is unaffected.
    return total / masks.sum().clamp(min=1e-8)


for i in range(epochs):
    print('Epoch: ', i)

    # ---------------- training ----------------
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One coefficient per sample, shared by inputs, labels and masks.
        mixup_vals = torch.as_tensor(
            np.random.beta(alpha, alpha, i1[0].shape[0]), dtype=torch.float32)

        lam = mixup_vals.view(-1, 1, 1, 1)
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = mixup_vals.view(-1, 1, 1)
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---------------- validation ----------------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss seen so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6284521657067377 0.5542198930467878
Epoch:  1


0.3219464864279773 0.20606547381196702
Epoch:  2


0.1924938897828798 0.18815155114446366
Epoch:  3


0.1828096629800023 0.18536054023674556
Epoch:  4


0.1808730311490394 0.1816467387335641
Epoch:  5


0.1799369551040031 0.173423775604793
Epoch:  6


0.17887916234699455 0.17109085406575883
Epoch:  7


0.17844063486601855 0.17199583990233286
Epoch:  8


0.1778572491697363 0.1725329522575651
Epoch:  9


0.17796392013897766 0.16741718990462168
Epoch:  10


0.17633859652119713 0.1700656477894102
Epoch:  11


0.176252562452007 0.1722170753138406
Epoch:  12


0.17453142074314323 0.165227296096938
Epoch:  13


0.17541473741466934 0.16740734449454717
Epoch:  14


0.175296304596437 0.17023319005966187
Epoch:  15


0.17351891382320508 0.16789400790418899
Epoch:  16


0.17270924996685338 0.1600914363350187
Epoch:  17


0.17286722241221247 0.164595650775092
Epoch:  18


0.17070285009371267 0.17455092711108072
Epoch:  19


0.17166537328346357 0.16485618480614253
Epoch:  20


0.17010925347740585 0.1627359539270401
Epoch:  21


0.1678012565181062 0.1549605748483113
Epoch:  22


0.16921461071517016 0.15372550061770848
Epoch:  23


0.1671942347610319 0.1524641705410821
Epoch:  24


0.16693413781153188 0.1465052930372102
Epoch:  25


0.16770910129353805 0.14912601241043635
Epoch:  26


0.1658044131220998 0.14910719011511123
Epoch:  27


0.1655937513789615 0.1473114107336317
Epoch:  28


0.16542508151080157 0.1447158924170903
Epoch:  29


0.1651642097009195 0.15098225006035396
Epoch:  30


0.16447767534771482 0.14669778730188096
Epoch:  31


0.16351327461165352 0.14138881436416081
Epoch:  32


0.16307671287575284 0.13965829142502376
Epoch:  33


0.16321714544618451 0.14153044990130834
Epoch:  34


0.16221642212287798 0.13884558635098593
Epoch:  35


0.16205303894506917 0.14053168360676085
Epoch:  36


0.1614223050104605 0.14238107523747853
Epoch:  37


0.16189996938447695 0.13752430038792746
Epoch:  38


0.15971082972513662 0.14122168400457927
Epoch:  39


0.1594290125208932 0.13821827513831003
Epoch:  40


0.15946213176121582 0.14164902482713973
Epoch:  41


0.1596366734923543 0.13605713844299316
Epoch:  42


0.15899843339984482 0.13797786299671447
Epoch:  43


0.15749207743116328 0.13557021000555583
Epoch:  44


0.15843191984537486 0.13402220287493297
Epoch:  45


0.15883180378256617 0.13735201529094151
Epoch:  46


0.1582212331327232 0.1402945699436324
Epoch:  47


0.15792230939542926 0.13576055956738337
Epoch:  48


0.1581048075411771 0.13566278772694723
Epoch:  49


0.1567012011199384 0.1345326772757939
Epoch:  50


0.1569715404027217 0.13447269584451402
Epoch    50: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  51


0.15570322323489832 0.1321114386831011
Epoch:  52


0.15489188158834302 0.13232980881418502
Epoch:  53


0.15497767603075183 0.13103636567081725
Epoch:  54


0.1551452911383397 0.13152288964816503
Epoch:  55


0.1542249732726329 0.13111189539943421
Epoch:  56


0.1539289065309473 0.13095707978521073
Epoch:  57


0.1529606576706912 0.130916742341859
Epoch:  58


0.15453712561646024 0.13091795997960226
Epoch:  59


0.1539618642749013 0.13029337035758154
Epoch:  60


0.15414820692023715 0.1304700310741152
Epoch:  61


0.15280729371148186 0.1303314662405423
Epoch:  62


0.15420599523428324 0.13066567799874715
Epoch:  63


0.15230264373727748 0.13071042725018092
Epoch:  64


0.15395331906305776 0.13022333596433913
Epoch:  65


0.1536004349992082 0.13000607596976416
Epoch:  66


0.15359631986231417 0.1301083532827241
Epoch:  67


0.15350355489833936 0.12968054520232336
Epoch:  68


0.15361564908478711 0.12961947385753905
Epoch:  69


0.1533322060430372 0.12981328581060683
Epoch:  70


0.1532970050702224 0.12910917294876917
Epoch:  71


0.15210385983054703 0.12939955294132233
Epoch:  72


0.15326468042425206 0.12939311351094926
Epoch:  73


0.15302837257449692 0.12931253973926818
Epoch:  74


0.15434883575181704 0.12907164650303976
Epoch:  75


0.1536210690801208 0.12921498396566936
Epoch:  76


0.15345774590969086 0.12935039826801845
Epoch:  77


0.1531749456315427 0.12962126093251364
Epoch:  78


0.15266083019810753 0.1293065090264593
Epoch:  79


0.1528218388557434 0.1292985709650176
Epoch:  80


0.15321106886541522 0.12949571439198085
Epoch    80: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  81


0.1535163827844568 0.12954956080232347
Epoch:  82


0.1524808467239947 0.12958705531699316
Epoch:  83


0.15207042444396662 0.12926227386508668
Epoch:  84


0.15262436141838898 0.12934793106147222
Epoch:  85


0.15263871365302317 0.12930542337042944
Epoch:  86


0.15253897091826876 0.1294107979961804
Epoch    86: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  87


0.15247978713061358 0.12913146082844054
Epoch:  88


0.15165815723908915 0.12935967317649297
Epoch:  89


0.1526935982543069 0.12921124803168432
Epoch:  90


0.151607003163647 0.12912310553448542
Epoch:  91


0.1520138382911682 0.1290777793952397
Epoch:  92


0.15328995802917997 0.12942558101245336
Epoch    92: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  93


0.15111227373819094 0.1295532158442906
Epoch:  94


0.15320331703972173 0.1293904653617314
Epoch:  95


0.15175158147876328 0.12900735863617488
Epoch:  96


0.15242006609568726 0.12905018350907735
Epoch:  97


0.15189416021914096 0.12922706242118562
Epoch:  98


0.15264137894720645 0.12879132692302978
Epoch:  99


0.15148018099166252 0.129607087799481
