In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column; all remaining columns are label columns.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        it makes ambiguous. The unknown columns are removed from the targets.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotations used per file (the first ones in row order).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``, files in sorted name order.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the annotation set an unknown label
        above 0.5 for the corresponding known labels, 1 elsewhere.

    Raises
    ------
    ValueError
        If ``n_annotators`` is not positive or a file has fewer than
        ``n_annotators`` annotations (the slices could not be aligned).
    """
    if n_annotators < 1:
        raise ValueError('n_annotators must be a positive integer')

    df = df.reset_index()
    # Number the annotations of every file 1, 2, ... in order of appearance.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Every file must provide each annotation slot, otherwise the per-slot
    # slices stacked below would have different lengths.
    per_file = df.groupby(level='audio_filename').size()
    too_few = per_file[per_file < n_annotators]
    if len(too_few) > 0:
        raise ValueError(
            'files with fewer than {} annotations: {}'.format(
                n_annotators, too_few.index.tolist()))

    # Start from an all-ones mask and switch off the known labels that an
    # annotation replaced with its "unknown" counterpart.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so each slot is a contiguous slice with
    # the files in the same sorted order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotators + 1)
    y = np.stack([df.loc[[k], :].values for k in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in slots], axis=2)

    filenames = (df.loc[[1], :].index
                 .get_level_values('audio_filename').tolist())
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    # Add the singleton channel axis.
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (constructed with its default arguments);
# AudioDataset.__getitem__ calls it on each sample that goes through the
# augmentation path.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (sample, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Samples stacked along the first axis; ``X[idx]`` is one sample.
        The augmentation path cycles along axis 1 of a sample and converts
        it to a PIL image, so it presumably expects ``(1, time, bins)``
        samples — confirm against the caller.
    y : torch.Tensor
        Targets, indexed along the first axis like ``X``.
    weights : torch.Tensor
        Per-target weights/masks, indexed along the first axis like ``X``.
    transform : callable, optional
        Called as ``transform(image=array)`` and expected to return a mapping
        with an ``'image'`` tensor. When falsy, samples are returned as is.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample
        when a transform was supplied."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            scale = this_max - this_min
            if scale == 0:
                # A flat sample has no range; dividing by zero would turn the
                # whole sample into NaNs, so scale by one instead.
                scale = torch.ones_like(scale)
            sample = (sample - this_min) / scale

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on numpy images)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the leading singleton axis and swap the last two axes
            # back to the layout used before the image round trip
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level RandomErasing instance)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * scale) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only open a trusted metadata file.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# For every taxonomy row with fine_id == 'X', collect the fine labels of the
# other rows that share its coarse label: {unknown fine label: [known fine labels]}.
unknown_to_known = (
    metadata['taxonomy_df']
    .loc[lambda t: t.fine_id == 'X', ['fine', 'coarse']]
    .merge(
        metadata['taxonomy_df'].loc[lambda t: t.fine_id != 'X', ['fine', 'coarse']],
        on='coarse',
        how='inner',
    )
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict()
)
# Fine labels of all taxonomy rows whose fine_id is not 'X'.
known_labels = metadata['taxonomy_df'].loc[lambda t: t.fine_id != 'X', 'fine'].tolist()

# Put the coarse and fine annotation tables side by side (column-wise).
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point:
# every row whose values add up to exactly 37 is overwritten with zeros
train_df.loc[train_df.sum(axis=1).eq(37), :] = 0
valid_df.loc[valid_df.sum(axis=1).eq(37), :] = 0

In [5]:
# Build the input array, target array and loss-mask array for the training
# and validation annotation tables.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics are computed on the training set only and reused for validation.
# The channel count is read from the last axis instead of being hard-coded.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
# A channel that is constant over the training set has zero spread; divide by
# one there instead of producing inf/NaN.
channel_stds = np.where(channel_stds == 0, 1.0, channel_stds)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Pipeline applied by AudioDataset to training samples: a small random
# shift/scale/rotation, then a grid distortion, then conversion to a tensor.
# NOTE(review): rotate_limit is presumably expressed in degrees — confirm
# against the installed albumentations version.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
# Only the training dataset receives the augmentation pipeline.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain the pairs of batches it mixes together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when requested AND actually available; otherwise fall back
# to the CPU instead of failing later on the first `.to(device)` call.
use_cuda = True  # set to False to force CPU execution
cuda = use_cuda and torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Call ``layer.reset_parameters()`` when the layer provides it.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function usable as a callback for ``Module.apply``.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        return
    reset()

In [11]:
# Default value of the notebook parameter `until_x`, read by Task5Model as the
# last index of the mv2.features children to re-initialise. The "# Parameters"
# cell that follows overrides it.
# NOTE(review): the model compares `i <= until_x`, which fails for None on
# Python 3 — presumably a value is always injected; confirm.
until_x = None

In [12]:
# Parameters
# Overrides the default set in the previous cell: Task5Model re-initialises the
# mv2.features children with index 0..10, and the value is also embedded in
# the checkpoint filename written by the training loop.
until_x = 10


In [13]:
class Task5Model(nn.Module):
    """Pretrained MobileNetV2 feature extractor with a small classifier head.

    The single-channel input is first mapped to three channels by a learned
    1x1-convolution stem, passed through ``mobilenet_v2().features``, reduced
    with a global max over the last two axes and classified by a two-layer
    head that outputs ``num_classes`` logits.

    The first ``until_x + 1`` children of ``mv2.features`` are re-initialised
    (``until_x`` is the notebook-level parameter); when ``until_x`` is None
    no layer is reset and all pretrained weights are kept.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    """

    def __init__(self, num_classes):

        super().__init__()

        # 1 -> 10 -> 3 channels with 1x1 convolutions, so the result can be
        # fed to mv2.features.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Head operating on the 1280 pooled feature channels.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2. A None limit means "reset nothing";
        # comparing an int with None would raise TypeError.
        if until_x is not None:
            for i, x in enumerate(self.mv2.features.children()):
                if i > until_x:
                    break
                x.apply(weight_reset)

    def forward(self, x):
        """Return the ``(batch, num_classes)`` logits for input batch ``x``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: max over the last axis, then over the new last axis.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits — presumably one per label column of train_y; TODO confirm.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad variant over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped with the validation loss at the end of each epoch of the training loop.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the annotation masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def _masked_bce(criterion, outputs, labels, masks):
    """Masked loss averaged over all annotation sets.

    ``labels`` and ``masks`` hold one annotation set per index of their last
    axis. ``outputs`` is scored against every set with ``criterion`` (which
    must return element-wise losses), each result is weighted by its mask, and
    the total is divided by the sum of the masks.
    """
    per_annotator = (
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[-1]))
    # The clamp turns the 0/0 of a fully masked batch into 0 instead of NaN.
    return sum(per_annotator) / masks.sum().clamp(min=1e-8)


epochs = 100
early_stop_patience = 25  # epochs allowed without a new lowest validation loss
alpha = 1                 # mixup weights are drawn from Beta(alpha, alpha)
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass -------------------------------------------------
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One mixing weight per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = _masked_bce(criterion, outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no gradients, no optimizer interaction) -------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_bce(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Report before the early-stopping check so the final epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stop_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6362075370711249 0.49663811922073364
Epoch:  1


0.3298353564900321 0.20512679644993373
Epoch:  2


0.19251239783055074 0.17591652699879237
Epoch:  3


0.1808876358979457 0.16897263484341757
Epoch:  4


0.17680451555832014 0.1802133321762085
Epoch:  5


0.17577521422424833 0.16172278566019876
Epoch:  6


0.173763977917465 0.1632773748465947
Epoch:  7


0.17282209726604256 0.15721910340445383
Epoch:  8


0.17174241913331523 0.15153226256370544
Epoch:  9


0.16894391541545456 0.15488938135760172
Epoch:  10


0.168595081245577 0.15260872031961167
Epoch:  11


0.16858312165414965 0.154229394027165
Epoch:  12


0.16738676904021083 0.14576663502625056
Epoch:  13


0.16476128794051506 0.15179778635501862
Epoch:  14


0.1635152294829085 0.1577553004026413
Epoch:  15


0.1635896627967422 0.14768612597669875
Epoch:  16


0.16249599126545158 0.14113522320985794
Epoch:  17


0.16242000259257652 0.14156217447349004
Epoch:  18


0.16081056844543767 0.14085973054170609
Epoch:  19


0.16072426655808011 0.13600171038082667
Epoch:  20


0.16088979872497353 0.13811585307121277
Epoch:  21


0.15936302695725416 0.13723456114530563
Epoch:  22


0.15942364729739525 0.14436005162341253
Epoch:  23


0.16163514473953763 0.1352735704609326
Epoch:  24


0.15879488394067093 0.1389506193143981
Epoch:  25


0.15742270567932645 0.13757859063999994
Epoch:  26


0.15733156977473078 0.13801545862640655
Epoch:  27


0.15857349134780266 0.13405935572726385
Epoch:  28


0.15841126562775792 0.13390897533723287
Epoch:  29


0.15699403672604947 0.13238484199557984
Epoch:  30


0.15766878989902702 0.13673093702111924
Epoch:  31


0.15682990607377645 0.13402629111494338
Epoch:  32


0.15600605550650004 0.1337359451821872
Epoch:  33


0.1565117098995157 0.13128790365798132
Epoch:  34


0.15494593575194077 0.1321087373154504
Epoch:  35


0.15511087912159996 0.13366257292883738
Epoch:  36


0.15512116092282371 0.12927702175719397
Epoch:  37


0.1549952517490129 0.1311911621264049
Epoch:  38


0.15444069456409765 0.1312386297753879
Epoch:  39


0.15467729157692678 0.1332348265818187
Epoch:  40


0.15449875029357704 0.1313552867088999
Epoch:  41


0.15252762992639798 0.12836797535419464
Epoch:  42


0.1536625591484276 0.12917181636605943
Epoch:  43


0.15372698129834356 0.13080901971885137
Epoch:  44


0.15435744177650762 0.12876040169170924
Epoch:  45


0.15407816421341253 0.13394454228026526
Epoch:  46


0.15261601274077957 0.1288554466196469
Epoch:  47


0.15318712712945165 0.1314775464790208
Epoch    47: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  48


0.15230975320210327 0.12586419603654317
Epoch:  49


0.1493725756535659 0.12711025029420853
Epoch:  50


0.150137955675254 0.12927659388099397
Epoch:  51


0.149781127233763 0.1255463265946933
Epoch:  52


0.15061169822473783 0.125258526631764
Epoch:  53


0.15063217161475 0.12593182282788412
Epoch:  54


0.1489594635125753 0.12529902585915156
Epoch:  55


0.14997356405129303 0.12745630315371922
Epoch:  56


0.15025421574309067 0.12451078423431941
Epoch:  57


0.14810357182412534 0.1253571127142225
Epoch:  58


0.15037464008137985 0.12479396696601595
Epoch:  59


0.14932686291836403 0.12429924522127424
Epoch:  60


0.14893654311025464 0.12409307701247078
Epoch:  61


0.1495521982779374 0.12526711182934896
Epoch:  62


0.14878670951804598 0.12523712004934037
Epoch:  63


0.14777545510111628 0.12538360697882517
Epoch:  64


0.14814496926359227 0.12446057690041405
Epoch:  65


0.14804033854523221 0.12457817792892456
Epoch:  66


0.14688995359717189 0.12440786510705948
Epoch    66: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  67


0.1476458448010522 0.1240798607468605
Epoch:  68


0.1471856816394909 0.12409858937774386
Epoch:  69


0.14876344397261337 0.12425860975469862
Epoch:  70


0.1488285580196896 0.12433905793087822
Epoch:  71


0.14871819236794034 0.1246914171746799
Epoch:  72


0.1485556078118247 0.12432308069297246
Epoch:  73


0.1479834926289481 0.12410481061254229
Epoch    73: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  74


0.1480319713418548 0.12404115285192217
Epoch:  75


0.14754081577868075 0.12416106675352369
Epoch:  76


0.14750606425710627 0.1244560820715768
Epoch:  77


0.14940343877753695 0.12433746137789317
Epoch:  78


0.14783678546145157 0.12422709379877363
Epoch:  79


0.14876103360910672 0.12433859493051257
Epoch:  80


0.14847400985859535 0.12440924346446991
Epoch    80: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  81


0.14823995570878726 0.12444573960133962
Epoch:  82


0.14897590392344706 0.12411191101585116
Epoch:  83


0.1478890901481783 0.12429895464863096
Epoch:  84


0.14835303015000112 0.1242616634283747
Epoch:  85


0.1472374929769619 0.12421737717730659
Epoch:  86


0.1486338151467813 0.12412127320255552
Epoch    86: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  87


0.1486554568683779 0.12428176828793117
Epoch:  88


0.1491926441321502 0.12445299114499773
Epoch:  89


0.14839337725897092 0.1242770573922566
Epoch:  90


0.14875262814599113 0.12440962663718633
Epoch:  91


0.1479068930084641 0.12440648994275502
Epoch:  92


0.14836424508610288 0.12397630938461848
Epoch:  93


0.1481304724474211 0.12415873897927147
Epoch:  94


0.14888130168657046 0.12404316450868334
Epoch:  95


0.14847418303425247 0.12433065793343953
Epoch:  96


0.14783708026280273 0.12414000396217618
Epoch:  97


0.1482570972797033 0.12434203177690506
Epoch:  98


0.14762783453271194 0.1246107007775988
Epoch:  99


0.1479747520910727 0.1244892765368734
