In [1]:
# Parameters
# until_x: index into the MobileNetV2 feature blocks. When the model is built,
# every feature block whose index is greater than until_x gets its parameters
# re-initialised; the value is also embedded in the checkpoint file name.
until_x = 13


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 path_template='../../data/logmelspec/{}.npy',
                 n_frames=635, n_annotations=3):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation, indexed by ``audio_filename``; every column is
        a label. The first ``n_annotations`` rows of each file are used.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of label
        columns that must be masked out whenever that unknown column is set.
        The unknown columns themselves are removed from the targets.
    path_template : str, optional
        Format string that receives the audio file name and yields the path of
        the stored spectrogram array.
    n_frames : int, optional
        Number of leading rows kept from each transposed spectrogram.
    n_annotations : int, optional
        Number of annotation rows per file stacked along the last target axis.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``, assuming every stored array
        is 2-D with at least ``n_frames`` columns.
    y : numpy.ndarray
        Shape ``(n_files, n_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where a label is masked out, 1 elsewhere.

    Raises
    ------
    ValueError
        If any file has fewer than ``n_annotations`` annotation rows.
    """
    df = df.reset_index()

    # Number the annotations of every file 1, 2, 3, ... in order of appearance.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1

    # Fail early with a readable message instead of a shape mismatch later on.
    annotations_per_file = df.groupby('audio_filename').size()
    if (annotations_per_file < n_annotations).any():
        incomplete = annotations_per_file[annotations_per_file < n_annotations]
        raise ValueError(
            'Expected at least {} annotations per file, but {} file(s) have '
            'fewer (e.g. {!r})'.format(
                n_annotations, len(incomplete), incomplete.index[0]))

    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" columns off; they only drive the mask.
    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns]
    df = df.drop(columns=unknown_columns)

    # Start from an all-ones mask and zero the known labels of every row
    # whose matching unknown column is switched on.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that one annotation slot can be
    # selected at a time, with files in the same sorted order for every slot.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    # Load the spectrograms in the same file order as the targets.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load(path_template.format(name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance built with its default arguments; it is
# applied to augmented samples inside AudioDataset.__getitem__.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples.

    Parameters
    ----------
    X : tensor
        Samples, indexed along the first axis. The augmentation path slices
        each sample as a 3-D tensor and cycles it along its second axis
        (presumably ``(1, frames, bins)`` -- confirm against the caller).
    y : tensor
        Targets, indexed along the first axis in step with ``X``.
    weights : tensor
        Per-target weights / masks, indexed in step with ``X``.
    transform : callable, optional
        Called as ``transform(image=array)`` and expected to return a mapping
        with an ``'image'`` tensor. When falsy, samples are returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the (optionally augmented) sample with its target and weight."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would fill the
            # sample with NaNs, so scale by one instead.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: cut at a random position along axis 1
            # and swap the two pieces
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on an image array)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the leading channel axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the "unknown" fine labels (fine_id == 'X') and the
# remaining, known fine labels.
taxonomy = metadata['taxonomy_df']
unknown_fine = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# Map every unknown fine label to the list of known fine labels that share its
# coarse class (the merge suffixes the two 'fine' columns with _x / _y).
same_coarse = (
    unknown_fine
    .merge(known_fine, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = (
    same_coarse
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_fine.fine.tolist()

# Put the coarse and fine annotation columns side by side for each split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point: any row whose label columns sum to
# exactly 37 is reset to all zeros
# (presumably a row with every label switched on -- TODO confirm)
for annotations in (train_df, valid_df):
    rows_to_clear = annotations.sum(axis=1) == 37
    annotations.loc[rows_to_clear, :] = 0

In [6]:
# Build the inputs (X), targets (y) and loss masks (y_mask) for each split.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# One mean / standard deviation per position of the last axis, estimated on
# the training set only and reused unchanged for the validation set.
n_channels = train_X.shape[-1]
train_flat = train_X.reshape(-1, n_channels)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
# A channel that never varies has std 0; divide by 1 there instead of
# producing NaN / inf values.
channel_stds = np.where(channel_stds == 0, 1.0, channel_stds)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# The pipeline is handed to the training dataset only; the validation dataset
# is created with transform=None.
albumentations_transform = Compose([
    # random shift / scale / rotation, bounded by the limits given here
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library's default arguments
    GridDistortion(),
    # last step; AudioDataset treats the resulting 'image' entry as a tensor
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset receives the augmentation pipeline.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two separately shuffled loaders over the same training set; the training
# loop draws one batch from each and blends them (mixup).
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place when it offers ``reset_parameters``.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function suitable for ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    The network consists of three stages:

    * ``bw2col`` -- batch norm followed by two 1x1 convolutions that turn the
      single input channel into three channels;
    * ``mv2`` -- a torchvision MobileNetV2 created with ``pretrained=True``,
      of which only ``features`` is used in the forward pass;
    * ``final`` -- a two-layer head mapping 1280 pooled features to
      ``num_classes`` outputs.

    Feature blocks of ``mv2`` whose index is greater than the module-level
    ``until_x`` are re-initialised with ``weight_reset`` at construction time.
    """

    def __init__(self, num_classes):
        super().__init__()

        channel_adapter = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*channel_adapter)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        classifier_head = [
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*classifier_head)

        # Keep the weights of blocks 0..until_x, reset every later block.
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return one output per class for every item of the batch ``x``."""
        three_channel = self.bw2col(x)
        feature_map = self.mv2.features(three_channel)
        # Global max pooling: maximum over the last axis, then over the axis
        # that has become the last one.
        last_axis_max, _ = feature_map.max(dim=-1)
        pooled, _ = last_axis_max.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 is passed as num_classes, i.e. the output size of the final linear layer;
# the model is then moved to the selected device.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad option over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Lowers the learning rate when the value passed to scheduler.step() (the
# validation loss in the training loop) has stopped improving.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them by the masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
# Stop once this many consecutive epochs bring no new best validation loss.
early_stopping_patience = 25
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Return the masked loss averaged over every annotation set.

    The element-wise ``criterion`` is evaluated against each slice
    ``labels[:, :, k]``, weighted by ``masks[:, :, k]`` and summed; the total
    is divided by ``masks.sum()``.
    """
    weighted_total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return weighted_total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One Beta(alpha, alpha) weight per sample blends the batch from the
        # first loader with the batch from the second loader.
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        # shaped to broadcast over the 4-D inputs
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        # shaped to broadcast over the 3-D labels and masks
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Average the per-batch losses over the number of batches.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is shown too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6062931378145475 0.4254253251211984
Epoch:  1


0.2697611853883073 0.18441648142678396
Epoch:  2


0.17186162882560008 0.14790681110961096
Epoch:  3


0.1633653532008867 0.15100895506995066
Epoch:  4


0.15937314001289574 0.1430238772715841
Epoch:  5


0.15706137869809125 0.13633699608700617
Epoch:  6


0.15643733216298594 0.1320842770593507
Epoch:  7


0.15468960758802053 0.13871236039059504
Epoch:  8


0.15444805090491837 0.13022176495620183
Epoch:  9


0.15510902493386655 0.13200256122010096
Epoch:  10


0.15294153585627274 0.13923072602067674
Epoch:  11


0.15183155198354978 0.12859525850840978
Epoch:  12


0.15146113046117732 0.12786725695644105
Epoch:  13


0.1511573360578434 0.1263951967869486
Epoch:  14


0.1501130050904042 0.12758283104215348
Epoch:  15


0.15039579006465706 0.1279729072536741
Epoch:  16


0.15004182627072205 0.12630873599222728
Epoch:  17


0.1505607358507208 0.1278770693710872
Epoch:  18


0.14927582241393425 0.13029522448778152
Epoch:  19


0.1487966630909894 0.13180262808288848
Epoch:  20


0.1483745220545176 0.12474064741815839
Epoch:  21


0.14850909323305697 0.1285064113991601
Epoch:  22


0.14935900472305916 0.12820823916367122
Epoch:  23


0.1479082904957436 0.12451855731861931
Epoch:  24


0.1481831971857999 0.1252713661108698
Epoch:  25


0.14768948184477315 0.1265670018536704
Epoch:  26


0.14791985339409597 0.12804593784468515
Epoch:  27


0.1460154801607132 0.12529792423759187
Epoch:  28


0.14666171492757024 0.13193826803139277
Epoch:  29


0.1475678521233636 0.12936239796025412
Epoch    29: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  30


0.1454387170237464 0.12202061819178718
Epoch:  31


0.14457830867251834 0.12127651274204254
Epoch:  32


0.14374717022921588 0.12116002291440964
Epoch:  33


0.1428301994865005 0.1214674540928432
Epoch:  34


0.14338471680074125 0.1205243808882577
Epoch:  35


0.14212812402763883 0.12042011427027839
Epoch:  36


0.14222811605479266 0.12070799406085696
Epoch:  37


0.1427435605107127 0.12032347172498703
Epoch:  38


0.14254922339239637 0.12028477021626063
Epoch:  39


0.14318478026905576 0.12037988007068634
Epoch:  40


0.14156098099979195 0.12008683170591082
Epoch:  41


0.14174219643747485 0.12061384320259094
Epoch:  42


0.1409859677424302 0.12020989400999886
Epoch:  43


0.14279114877855456 0.12034347121204649
Epoch:  44


0.14142403207920692 0.12026847898960114
Epoch:  45


0.14119876478169416 0.1200526665363993
Epoch:  46


0.1410529524893374 0.1205233803817204
Epoch:  47


0.14161232939443072 0.12014547841889518
Epoch:  48


0.14157463811539314 0.1200716570019722
Epoch:  49


0.14102174462498845 0.11989461311272212
Epoch:  50


0.14099811621614405 0.12013968293155942
Epoch:  51


0.14211662919134707 0.12021641113928386
Epoch:  52


0.14078123062043577 0.12013187791619982
Epoch:  53


0.1406052076333278 0.11990956855671746
Epoch:  54


0.14156309656194738 0.12076196606670107
Epoch:  55


0.14072853286524076 0.12088334134646825
Epoch    55: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  56


0.1405459678656346 0.12058811954089574
Epoch:  57


0.13917122338269208 0.12052650962557111
Epoch:  58


0.13965491064496943 0.12047777112041201
Epoch:  59


0.1402811335550772 0.12028955348900386
Epoch:  60


0.14079417328576785 0.120006905070373
Epoch:  61


0.1395535521410607 0.1200799931372915
Epoch    61: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  62


0.13966696004609805 0.12006769329309464
Epoch:  63


0.14065236094835643 0.12015121110848018
Epoch:  64


0.13935057295335307 0.12014820958886828
Epoch:  65


0.1385265882756259 0.12042489967175893
Epoch:  66


0.1413276960720887 0.11995218374899455
Epoch:  67


0.13933282325396668 0.1200575658253261
Epoch    67: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  68


0.1402337283298776 0.12013525302921023
Epoch:  69


0.139786688057152 0.12025306480271476
Epoch:  70


0.13986025770773758 0.12017686665058136
Epoch:  71


0.14020451218695254 0.12023022664444787
Epoch:  72


0.13921342951220436 0.12021854413407189
Epoch:  73


0.14001190380470172 0.12007961847952434
Epoch    73: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  74
