In [1]:
# Parameters
# until_x: threshold used by Task5Model -- every child of the MobileNetV2
# feature extractor whose position index is greater than this value has its
# weights re-initialised. The value is also embedded in the checkpoint filename.
until_x = 4


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 data_dir='../../data/logmelspec', max_frames=635, n_annotators=3):
    """Build model inputs, targets and target masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column (i.e. the index is expected to be named
        ``audio_filename``). Each file needs at least ``n_annotators`` rows.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known label
        columns that must be masked out whenever that unknown label is set
        (value > 0.5) in a row.
    data_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per file.
    max_frames : int, optional
        Number of leading rows kept from each transposed spectrogram.
    n_annotators : int, optional
        Number of annotation sets (rows per file) stacked on the last axis.

    Returns
    -------
    X : numpy.ndarray, shape (n_files, 1, frames, n_bins)
    y : numpy.ndarray, shape (n_files, n_known_labels, n_annotators)
    y_mask : numpy.ndarray, same shape as ``y``; 0 where a label is masked out.

    The caller's ``df`` is not modified.
    """
    # Number the annotations of every file 1, 2, 3, ... in row order.
    df = df.reset_index()
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split off the "unknown" columns; only the known labels become targets.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start with every label unmasked, then zero the known labels that are
    # ambiguous because the related "unknown" label was annotated.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so that one annotation set can be picked
    # with a single .loc; sorting keeps the files in the same order in each set.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotators + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in annotation_ids], axis=2)

    # Load the spectrograms in the same file order as the first annotation set.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(data_dir, name)).T[:max_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared augmentation object built with RandomErasing's default arguments.
# AudioDataset.__getitem__ calls it on every sample when a transform is set.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When a
    ``transform`` is given, every sample is augmented on the fly; otherwise the
    stored sample is returned unchanged.

    Parameters
    ----------
    X : torch.Tensor
        Samples; the augmentation path cycles along dim 1 of each sample, so
        each sample is assumed to be (1, time, bins) -- TODO confirm for
        callers other than ``prepare_data``.
    y : torch.Tensor
        Targets, returned as ``y[idx, ...]``.
    weights : torch.Tensor
        Per-target weights (masks), returned as ``weights[idx, ...]``.
    transform : callable or None
        Called as ``transform(image=array)`` and expected to return a mapping
        with an ``'image'`` entry holding a tensor.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the (optionally augmented) sample with its target and weight."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation to [0, 1]; a constant sample has zero
            # range, so fall back to a unit span instead of producing NaNs.
            this_min = sample.min()
            this_max = sample.max()
            span = this_max - this_min
            if span == 0:
                span = torch.ones_like(span)
            sample = (sample - this_min) / span

            # randomly cycle the file: rotate along dim 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on numpy images)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # Add the leading axis back and swap the last two axes --
            # presumably undoing the axis move done by the tensor conversion
            # in the transform pipeline; TODO confirm.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation with the same span used above
            sample = (sample * span) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can run arbitrary code -- only open metadata files
# that come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
is_unknown_label = taxonomy_df.fine_id == 'X'
is_known_label = taxonomy_df.fine_id != 'X'

# Pair every "unknown" fine label (fine_id == 'X') with the known fine labels
# sharing its coarse class: {unknown fine label: [known fine labels]}.
unknown_known_pairs = pd.merge(
    taxonomy_df.loc[is_unknown_label, ['fine', 'coarse']],
    taxonomy_df.loc[is_known_label, ['fine', 'coarse']],
    on='coarse', how='inner',
).drop(columns='coarse')
unknown_to_known = unknown_known_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()

known_labels = taxonomy_df.loc[is_known_label].fine.tolist()

# Coarse and fine annotations side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Rows whose labels sum to 37 are zeroed out in both splits (37 is presumably
# the total number of label columns, i.e. "every label set" -- TODO confirm).
for split_df in (train_df, valid_df):
    all_labels_set = split_df.sum(axis=1) == 37
    split_df.loc[all_labels_set, :] = 0

In [6]:
# Build inputs, targets and target masks for both splits. As produced by
# prepare_data, X carries a singleton axis at position 1 and y / y_mask stack
# the three annotation sets along their last axis.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are taken per position on the last axis, from the training split
# only, and then applied to both splits. The channel count is read from the
# data instead of being hard-coded, so a different last-axis size cannot
# silently mix channels in the reshape.
# NOTE: this cell overwrites train_X / valid_X, so re-running it without
# re-running the cell that builds them normalises the data a second time.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# The pipeline is called by AudioDataset.__getitem__ as transform(image=array)
# and its 'image' entry is used as a tensor afterwards.
albumentations_transform = Compose([
    # Small random shift / scale / rotation; rotate_limit is presumably in
    # degrees -- TODO confirm against the albumentations documentation.
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # Grid distortion with the library's default settings.
    GridDistortion(),
    # Final step: hand the augmented array back as a tensor.
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the same training set: the training loop
# zips them to obtain the pairs of batches that are blended by mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook still runs on machines without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` if it offers ``reset_parameters``; else do nothing.

    Intended to be passed to ``nn.Module.apply`` so that it visits every
    submodule of a network.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor adapted to single-channel inputs.

    Pipeline: a small 1x1-convolution stem maps the single input channel to
    three channels, the MobileNetV2 ``features`` stack processes the result,
    the feature map is max-pooled over its last two axes, and a two-layer head
    produces ``num_classes`` logits.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_after : int or None, optional
        Children of ``mv2.features`` whose position index is greater than this
        value are re-initialised (their pretrained weights are discarded).
        ``None`` (the default) falls back to the notebook-level ``until_x``
        parameter, which is what the class always used before.
    """

    def __init__(self, num_classes, reset_after=None):

        super().__init__()

        if reset_after is None:
            # Backward-compatible default: the notebook's parameter cell.
            reset_after = until_x

        # 1 -> 10 -> 3 channels with 1x1 convolutions, so the single-channel
        # input matches the three channels MobileNetV2 expects.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # 1280 is the input width of the head; it must match the channel
        # count produced by mv2.features.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Keep the pretrained weights up to and including block `reset_after`
        # and re-initialise every later block.
        for position, block in enumerate(self.mv2.features.children()):
            if position > reset_after:
                block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch ``x`` (4-D, one channel on dim 1)."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis twice.
        x, _ = x.max(dim=-1)
        x, _ = x.max(dim=-1)
        return self.final(x)

In [13]:
# Instantiate the model
# 31 is the number of output logits. It has to equal the number of label
# columns in train_y / valid_y because the loss compares logits and labels
# element-wise (presumably the count of known labels -- TODO confirm).
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad variant enabled, over every parameter of the model.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Reduces the learning rate when the value passed to scheduler.step() (the
# validation loss in the training loop) has stopped improving; verbose=True
# prints a message on every reduction.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them by the label masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def masked_bce_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss averaged over the unmasked label entries.

    ``labels`` and ``masks`` hold one annotation set per slice of their last
    axis. The element-wise loss of ``outputs`` against each set is weighted by
    the matching mask slice, the weighted losses are summed over all sets, and
    the total is divided by ``masks.sum()``. Uses the notebook-level
    ``criterion`` (which must return un-reduced losses).
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


epochs = 100
early_stop_patience = 25  # stop after this many epochs without a new best validation loss
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass -------------------------------------------------
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches with one
        # Beta(alpha, alpha) coefficient per sample
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        # same coefficients, reshaped to broadcast over the 3-D labels / masks
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass -----------------------------------------------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are means of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so that the final epoch's losses
    # are logged as well.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stop_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6050390206478737 0.45745579259736197
Epoch:  1


0.2742361160548958 0.21451182876314437
Epoch:  2


0.18101556357499715 0.16438796477658407
Epoch:  3


0.17313920847467473 0.17127234169415065
Epoch:  4


0.1691390255818496 0.1520776982818331
Epoch:  5


0.167705293845486 0.15402300655841827
Epoch:  6


0.16565026060954943 0.15304787456989288
Epoch:  7


0.16233615899408185 0.14168690677200044
Epoch:  8


0.16192526068236376 0.1424912161060742
Epoch:  9


0.16174071262011658 0.14057573250361852
Epoch:  10


0.15893411072524818 0.13683775705950602
Epoch:  11


0.15933478200757825 0.13683059066534042
Epoch:  12


0.1569124045404228 0.13451488528932845
Epoch:  13


0.15869399542744095 0.13170901153768813
Epoch:  14


0.15626571951685725 0.13283426527466094
Epoch:  15


0.15529039582690676 0.13141698709556035
Epoch:  16


0.15499990050857132 0.13546377633299148
Epoch:  17


0.15366536862141378 0.13308992769036973
Epoch:  18


0.15378565643284772 0.13345631531306676
Epoch:  19


0.15474372014806076 0.13438216064657485
Epoch:  20


0.15435715180796547 0.12966090334313257
Epoch:  21


0.1533160644608575 0.1276800941143717
Epoch:  22


0.1533263156542907 0.13318553354058946
Epoch:  23


0.15286375703038396 0.12762838921376637
Epoch:  24


0.15326415405080124 0.13032221581254685
Epoch:  25


0.15130490265988014 0.13046416214534215
Epoch:  26


0.15379903525919528 0.1301658856017249
Epoch:  27


0.15145341608975385 0.1295436450413295
Epoch:  28


0.14968427653248245 0.12824697260345733
Epoch:  29


0.15141847608862696 0.12910680366413935
Epoch    29: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  30


0.14972911050190796 0.1255055825625147
Epoch:  31


0.14937806169728976 0.125221420611654
Epoch:  32


0.1487244697841438 0.12456323206424713
Epoch:  33


0.148351732540775 0.12443863387618746
Epoch:  34


0.14870852352799596 0.12459019358669009
Epoch:  35


0.1485300776926247 0.12440755537578038
Epoch:  36


0.14778705182913188 0.123884950365339
Epoch:  37


0.14824455090471217 0.12390878690140587
Epoch:  38


0.14803222868893598 0.12359535694122314
Epoch:  39


0.14792541716549848 0.12409391679934093
Epoch:  40


0.14885906915406924 0.12344335338899068
Epoch:  41


0.14741420826396426 0.12316392574991498
Epoch:  42


0.14707223869658806 0.12297279281275612
Epoch:  43


0.14634525212081703 0.12372781016996928
Epoch:  44


0.14727695447367592 0.1234732483114515
Epoch:  45


0.146696826896152 0.12331141105719975
Epoch:  46


0.1483744568115956 0.12322573470217842
Epoch:  47


0.14687714343135422 0.12378653777497155
Epoch:  48


0.1479291436639992 0.12340312876871654
Epoch    48: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  49


0.14745535963290446 0.1234722403543336
Epoch:  50


0.14751573470798698 0.12333717835800988
Epoch:  51


0.1455067025648581 0.12343345476048333
Epoch:  52


0.14689404134814804 0.1233307101896831
Epoch:  53


0.1469608424483119 0.12300518367971693
Epoch:  54


0.14610147637289925 0.12331485109669822
Epoch    54: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  55


0.14714491407613497 0.12314820502485548
Epoch:  56


0.14690319348025965 0.12321570302758898
Epoch:  57


0.1462862002285751 0.12346238004309791
Epoch:  58


0.1470638876025741 0.12297820832048144
Epoch:  59


0.14782223951172185 0.12312130842890058
Epoch:  60


0.14655099889716586 0.12328362252031054
Epoch    60: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  61


0.14631241157248215 0.12315929255315236
Epoch:  62


0.1462477449629758 0.12331195920705795
Epoch:  63


0.14677500563698845 0.1230510686125074
Epoch:  64


0.14690832954806252 0.12306998244353703
Epoch:  65


0.14766457918527964 0.1233234458736011
Epoch:  66


0.1470796353108174 0.12340693920850754
Epoch    66: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  67
