In [1]:
# Parameters
# until_x: position among the children of MobileNetV2's `features`; Task5Model
# re-initialises every child whose position is greater than this value, and
# the training cell embeds the value in the saved checkpoint's file name.
until_x = 8


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table with one row per annotation. ``reset_index`` must
        expose an ``audio_filename`` column (i.e. the index is named
        ``audio_filename``). All other columns are label columns, including
        the "unknown" columns named by the keys of ``unknown_to_known``.
        The frame passed in is not modified.
    unknown_to_known : dict
        Maps each "unknown" column name to the list of known label columns
        that must be masked out whenever that unknown column exceeds 0.5.
    data_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotations per recording stacked on the last axis; the
        first ``n_annotators`` rows of each recording (in row order) are used.

    Returns
    -------
    X : numpy.ndarray
        Loaded arrays, shape ``(n_recordings, 1, n_frames, n_columns)``.
    y : numpy.ndarray
        Known-label values, shape ``(n_recordings, n_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where an annotation's label is masked by an
        "unknown" flag, 1 elsewhere.

    Raises
    ------
    ValueError
        If any recording has fewer than ``n_annotators`` annotations.
    """
    unknown_columns = list(unknown_to_known.keys())

    df = df.reset_index()

    # Fail early with a readable message instead of a shape mismatch deep
    # inside the stacking below.
    annotation_counts = df.groupby('audio_filename').size()
    incomplete = annotation_counts[annotation_counts < n_annotators]
    if len(incomplete) > 0:
        raise ValueError(
            'Expected at least {} annotations per recording; too few for: {}'
            .format(n_annotators, incomplete.index.tolist()))

    # Number the annotations of each recording 1, 2, 3, ... in row order.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" flags off from the known label columns.
    df_unknown = df.loc[:, unknown_columns]
    df = df.drop(columns=unknown_columns)

    # Every label counts by default; an annotation that raised an "unknown"
    # flag has the related known labels excluded.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first and sort, so that every annotation
    # slice lists the recordings in the same (filename) order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotators = range(1, n_annotators + 1)
    y = np.stack([df.loc[[k], :].values for k in annotators], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in annotators], axis=2)

    # Load the arrays in the same recording order as the rows of y.
    filenames = (df.loc[[1], :].index
                 .get_level_values('audio_filename').tolist())
    X = np.stack([
        np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)  # add the channel axis

    return X, y, y_mask


# Module-level RandomErasing instance, built with that class's default
# arguments; AudioDataset.__getitem__ applies it as its last augmentation step.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Indexable collection of samples with matching targets and weights.

    Without a ``transform`` the stored sample is returned untouched. With a
    ``transform`` each access rescales the sample to [0, 1], rotates it
    cyclically by a random offset along axis 1, runs it through the
    transform as an image, applies the module-level ``random_erasing`` and
    finally maps the result back to the sample's original value range.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        # Converter used to hand the tensor to the image-based transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        n_samples = self.X.shape[0]
        return n_samples

    def __getitem__(self, idx):
        target = self.y[idx, ...]
        weight = self.weights[idx, ...]
        sample = self.X[idx, ...]

        # No augmentation requested: hand back the stored sample as is
        if not self.transform:
            return sample, target, weight

        # min-max transformation
        this_min, this_max = sample.min(), sample.max()
        value_range = this_max - this_min
        sample = (sample - this_min) / value_range

        # randomly cycle the file: the part from `offset` onwards moves to
        # the front and the leading part wraps around to the end
        offset = np.random.randint(sample.shape[1])
        leading, trailing = sample[:, :offset, :], sample[:, offset:, :]
        sample = torch.cat([trailing, leading], dim=1)

        # apply albumentations transforms
        image = np.array(self.pil(sample))
        augmented = self.transform(image=image)['image']
        augmented = augmented[None, :, :].permute(0, 2, 1)

        # apply random erasing
        erased = random_erasing(augmented.clone().detach())

        # revert min-max transformation
        restored = (erased * value_range) + this_min
        return restored, target, weight

In [4]:
# Load and prepare data
# NOTE: pickle executes arbitrary code on load; only open metadata files you trust.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# Pair every "X" fine label with the other fine labels of the same coarse
# class; merge suffixes give `fine_x` (the "X" label) and `fine_y` (its siblings)
siblings = pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
siblings = siblings.drop(columns='coarse')
unknown_to_known = siblings.groupby('fine_x')['fine_y'].apply(list).to_dict()

known_labels = taxonomy.loc[taxonomy.fine_id != 'X', 'fine'].tolist()

# Coarse and fine annotation columns side by side, one frame per split
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Zero out every row whose values sum to 37 (presumably an annotation with
# every label column set at once -- TODO confirm against the taxonomy)
for split_df in (train_df, valid_df):
    saturated_rows = split_df.sum(axis=1) == 37
    split_df.loc[saturated_rows, :] = 0

In [6]:
# Turn each annotation frame into arrays: X holds the loaded spectrogram crops
# with a channel axis, y the label values stacked per annotator on the last
# axis, and y_mask the matching 0/1 mask derived from the "unknown" columns.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics come from the training split only and are reused for validation.
train_bins = train_X.reshape(-1, 128)  # one row per (sample, frame), 128 columns
channel_means = train_bins.mean(axis=0)[np.newaxis, np.newaxis, np.newaxis, :]
channel_stds = train_bins.std(axis=0)[np.newaxis, np.newaxis, np.newaxis, :]
del train_bins  # drop the extra reference so the unnormalised array can be freed

train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in order: small shift/scale/rotation, grid distortion, then
# conversion of the augmented image to a tensor
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
train_tensors = [torch.Tensor(array) for array in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(array) for array in (valid_X, valid_y, valid_y_mask)]

# Only the training split is augmented
train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size=96, shuffle=False)
# Two independently shuffled views of the training set; the training loop
# zips them to obtain the pairs of batches it blends with mixup
train_loader_1 = DataLoader(train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is usable and fall back to the CPU otherwise, so
# the notebook still runs on machines without a GPU
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it offers ``reset_parameters``.

    Objects without a ``reset_parameters`` attribute are left untouched,
    which makes the function suitable as the callback of ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel inputs.

    A short stack of 1x1 convolutions turns the single input channel into
    three channels, which are fed to the feature extractor of a pretrained
    MobileNetV2. The resulting feature map is max-pooled over its two
    trailing axes and a two-layer head emits ``num_classes`` raw scores.
    """

    def __init__(self, num_classes):
        """Assemble the network and re-initialise the late MobileNetV2 stages.

        Every child of ``mv2.features`` whose position is greater than the
        module-level ``until_x`` gets its parameters reset through
        ``weight_reset``; earlier children keep their pretrained weights.
        """
        super().__init__()

        # Channel adapter: batch norm, then 1 -> 10 -> 3 channels
        channel_adapter = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*channel_adapter)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classification head on top of the 1280 pooled features
        head = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head)

        # Reset after ith layer of mv2
        for position, stage in enumerate(self.mv2.features.children()):
            if position > until_x:
                stage.apply(weight_reset)

    def forward(self, x):
        """Return one score per class for every sample of the batch ``x``."""
        feature_map = self.mv2.features(self.bw2col(x))
        # Global max pooling: collapse the last axis, then the new last axis
        pooled, _ = feature_map.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 output scores per sample; the training loss compares them elementwise
# with the label columns, so this has to equal the label dimension of train_y
num_classes = 31
model = Task5Model(num_classes)
model = model.to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# reduction='none' keeps the elementwise losses so the training loop can
# weight them with the label masks before averaging
criterion = nn.BCEWithLogitsLoss(reduction='none')
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# Lowers the learning rate once the monitored value stops improving for more
# than `patience` epochs
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1  # both parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked loss averaged over every annotator slice.

    ``labels`` and ``masks`` carry one slice per annotator on their last
    axis. Each slice is scored against the same ``outputs`` with the
    elementwise ``criterion`` and weighted by its mask; the summed result is
    divided by the sum of the whole mask.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[-1]))
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0.0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup the inputs: blend two independently shuffled batches with one
        # coefficient per sample, applied to inputs, labels and masks alike
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam * batch_a[0]) + ((1 - lam) * batch_b[0])

        lam = lam.reshape(-1, 1, 1)
        labels = (lam * batch_a[1]) + ((1 - lam) * batch_b[1])
        masks = (lam * batch_a[2]) + ((1 - lam) * batch_b[2])
        # mixup ends

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the final epoch is logged too
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep a checkpoint of the weights with the lowest validation loss so far
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6547028112411499 0.6100871205329895
Epoch:  1


0.4299746346473694 0.2868035674095154
Epoch:  2


0.22102107882499694 0.18576436042785643
Epoch:  3


0.17744070112705232 0.17195015549659728
Epoch:  4


0.1679113972187042 0.15624642074108125
Epoch:  5


0.16358402073383332 0.18241272270679473
Epoch:  6


0.16246035933494568 0.15743678212165832
Epoch:  7


0.15988361179828645 0.14904858469963073
Epoch:  8


0.15799904227256775 0.14154568314552307
Epoch:  9


0.15593400955200196 0.15365981459617614
Epoch:  10


0.15510185539722443 0.13493596911430358
Epoch:  11


0.15380106270313262 0.13374371081590652
Epoch:  12


0.15258139252662659 0.13093588203191758
Epoch:  13


0.15426074981689453 0.1300164818763733
Epoch:  14


0.15243081688880922 0.13114407509565354
Epoch:  15


0.1517045396566391 0.1295907214283943
Epoch:  16


0.15161058843135833 0.13393742442131043
Epoch:  17


0.15099238097667694 0.13237356692552565
Epoch:  18


0.15171542406082153 0.14768484532833098
Epoch:  19


0.15082812905311585 0.13044968396425247
Epoch:  20


0.1498231327533722 0.12967312186956406
Epoch:  21


0.1501512336730957 0.12975349128246308
Epoch    21: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  22


0.1484696352481842 0.12487250864505768
Epoch:  23


0.14815812826156616 0.12539795339107512
Epoch:  24


0.14763683199882507 0.1253590241074562
Epoch:  25


0.14760584235191346 0.12515412867069245
Epoch:  26


0.1466414773464203 0.12424067407846451
Epoch:  27


0.14710260033607483 0.12422017753124237
Epoch:  28


0.1483535224199295 0.12407328486442566
Epoch:  29


0.1462991267442703 0.12443678975105285
Epoch:  30


0.1451125568151474 0.12388677448034287
Epoch:  31


0.14565772652626038 0.12410460710525513
Epoch:  32


0.1459022033214569 0.12403839975595474
Epoch:  33


0.14625074625015258 0.12394070774316787
Epoch:  34


0.14593218505382538 0.12398508340120315
Epoch:  35


0.14612395346164703 0.12374519258737564
Epoch:  36


0.1463016128540039 0.12342000156641006
Epoch:  37


0.1457749432325363 0.12409847527742386
Epoch:  38


0.14564402163028717 0.1235476478934288
Epoch:  39


0.14595740914344787 0.12377070486545563
Epoch:  40


0.14599685192108156 0.12396318316459656
Epoch:  41


0.14668907046318055 0.12440060526132583
Epoch:  42


0.1439524358510971 0.12331307083368301
Epoch:  43


0.14472788095474243 0.12434803545475007
Epoch:  44


0.1454545259475708 0.12317954450845718
Epoch:  45


0.1453282779455185 0.12343730032444
Epoch:  46


0.1439456307888031 0.12343295216560364
Epoch:  47


0.144935063123703 0.12373304218053818
Epoch:  48


0.14437031686306 0.12401542067527771
Epoch:  49


0.14435523390769958 0.12450489103794098
Epoch:  50


0.14372035086154938 0.12362837493419647
Epoch    50: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  51


0.14434853792190552 0.1235044851899147
Epoch:  52


0.14446037352085114 0.12323648631572723
Epoch:  53


0.14494448006153107 0.12308465242385865
Epoch:  54


0.14422557711601258 0.12304691076278687
Epoch:  55


0.14369018673896788 0.12316633611917496
Epoch:  56


0.14351142406463624 0.12332393527030945
Epoch:  57


0.14441049456596375 0.12331050783395767
Epoch:  58


0.14473244190216064 0.12336587756872178
Epoch:  59


0.14341238677501678 0.12336071878671646
Epoch:  60


0.14379007935523988 0.12330330312252044
Epoch    60: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  61


0.1444043254852295 0.12323810011148453
Epoch:  62


0.14440339267253877 0.12325513064861297
Epoch:  63


0.14389639496803283 0.12307260334491729
Epoch:  64


0.14327495992183686 0.12324106693267822
Epoch:  65


0.14342283964157104 0.12332886755466461
Epoch:  66


0.14353633880615235 0.12318104058504105
Epoch    66: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  67


0.14426064848899842 0.1231509193778038
Epoch:  68


0.1441189992427826 0.12314208894968033
Epoch:  69


0.14346494972705842 0.12325625121593475
Epoch:  70


0.14384352505207063 0.12312952876091003
Epoch:  71


0.1428307616710663 0.1233099952340126
Epoch:  72


0.1429625391960144 0.12317227572202682
Epoch    72: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  73


0.143568554520607 0.12333352714776993
Epoch:  74


0.14318990647792817 0.1232986405491829
Epoch:  75


0.14416904747486115 0.12308970093727112
Epoch:  76


0.14398366332054138 0.12306294590234756
Epoch:  77


0.14392282605171203 0.12315240502357483
Epoch:  78


0.1446743893623352 0.12311240583658219
Epoch:  79
