In [1]:
# Parameters
# Index of the last child block of the MobileNetV2 feature extractor whose
# weights are re-initialised when Task5Model is built (blocks 0..until_x are
# reset). It is also used as the suffix of the checkpoint filename written by
# the training loop.
until_x = 12


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation, indexed by an index named ``audio_filename``.
        The k-th row of a given file (in row order) is treated as annotation
        number k. Columns are label indicators and must include every key of
        ``unknown_to_known``. The frame passed in is not modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        whose targets are masked out whenever that unknown column is > 0.5.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array. Every array
        needs at least this many rows, otherwise stacking fails.
    n_annotators : int, optional
        Number of annotations per file that are stacked along the last axis.
        Extra annotations of a file are ignored.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the target is masked, 1 elsewhere.

    Raises
    ------
    ValueError
        If any file has fewer than ``n_annotators`` annotation rows.
    """
    unknown_cols = list(unknown_to_known.keys())
    annotators = list(range(1, n_annotators + 1))

    # Number the annotations of every file 1, 2, 3, ... in row order.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()

    # slno is contiguous per file, so the number of rows with slno == k is the
    # number of files that have at least k annotations.
    n_files = int((df['slno'] == 1).sum())
    for k in annotators:
        if int((df['slno'] == k).sum()) != n_files:
            raise ValueError(
                'every audio file needs at least {} annotations; '
                'some have fewer than {}'.format(n_annotators, k))

    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns from the known label columns.
    df_unknown = df.loc[:, unknown_cols]
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that an annotator
    # flagged through the matching "unknown" column.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that .loc[[k]] selects the k-th
    # annotation of every file, sorted by filename.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    y = np.concatenate(
        [df.loc[[k], :].values[..., np.newaxis] for k in annotators], axis=2)
    y_mask = np.concatenate(
        [mask_df.loc[[k], :].values[..., np.newaxis] for k in annotators],
        axis=2)

    # Load the arrays in the same (sorted) file order as the rows of y.
    filenames = df.loc[[1], :].reset_index(level=1).audio_filename.tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :],
            axis=0)
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (imported from the project-local
# RandomErasing module) built with its default arguments. It is called by
# AudioDataset.__getitem__ on every sample that goes through the augmentation
# branch.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Indexable dataset yielding ``(sample, target, weight)`` triples.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When a
    ``transform`` is supplied each sample is augmented on the fly; without one
    the stored sample is returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        # Converter used to hand the tensor to the image-based transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        spec = self.X[idx, ...]
        if self.transform:
            spec = self._augment(spec)
        return spec, self.y[idx, ...], self.weights[idx, ...]

    def _augment(self, spec):
        """Apply the random augmentation chain to a single sample."""
        # Scale to [0, 1] with the sample's own range; the range is restored
        # at the end. NOTE(review): a constant sample (max == min) would
        # divide by zero here.
        lowest, highest = spec.min(), spec.max()
        span = highest - lowest
        spec = (spec - lowest) / span

        # Circularly shift the sample by a random offset along axis 1
        offset = np.random.randint(spec.shape[1])
        head, tail = spec[:, offset:, :], spec[:, :offset, :]
        spec = torch.cat([head, tail], dim=1)

        # Run the image-based transform on the PIL/numpy version of the sample
        image = np.array(self.pil(spec))
        spec = self.transform(image=image)['image']
        # Add a leading axis and swap the last two axes of the result
        spec = spec[None, :, :].permute(0, 2, 1)

        # Random erasing works on a detached copy
        spec = random_erasing(spec.clone().detach())

        # Undo the min-max scaling
        return (spec * span) + lowest


def weight_reset(layer):
    """Re-initialise ``layer`` if it exposes ``reset_parameters``.

    Objects without that attribute are left untouched. Intended to be passed
    to ``Module.apply`` so that it visits every sub-module.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()


class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel inputs.

    The network is made of three parts:

    * ``bw2col``: batch norm followed by two 1x1 convolutions that turn the
      single input channel into three channels;
    * ``mv2``: a pretrained torchvision MobileNetV2, of which only
      ``features`` is used. The feature blocks with index ``0..until_x``
      (module-level ``until_x``) are re-initialised with ``weight_reset``;
    * ``final``: a two-layer head mapping the 1280 pooled features to
      ``num_classes`` logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Single channel -> three channels
        to_three_channels = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*to_three_channels)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Discard the pretrained weights of the first until_x + 1 feature blocks
        for position, block in enumerate(self.mv2.features.children()):
            if position > until_x:
                break
            block.apply(weight_reset)

        head = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head)

    def forward(self, x):
        """Return the ``num_classes`` logits for a batch ``x``."""
        feature_maps = self.mv2.features(self.bw2col(x))
        # Global max pooling: reduce the last axis, then the new last axis
        reduced_last, _ = feature_maps.max(dim=-1)
        pooled, _ = reduced_last.max(dim=-1)
        return self.final(pooled)

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open a metadata file that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy = metadata['taxonomy_df']
label_columns = ['fine', 'coarse']

# Rows with fine_id == 'X' are the "unknown" fine labels; pair each of them
# with every other fine label that shares its coarse class.
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', label_columns]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', label_columns]
unknown_to_known = (
    unknown_rows
    .merge(known_rows, on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = taxonomy.loc[taxonomy.fine_id != 'X', 'fine'].tolist()

# Coarse and fine annotations side by side, one frame per split
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Every row whose label columns sum to 37 is zeroed out, in both splits.
# NOTE(review): presumably 37 is the total number of label columns, i.e. an
# annotation with every label set - confirm against the metadata.
for split_df in (train_df, valid_df):
    every_label_set = split_df.sum(axis=1) == 37
    split_df.loc[every_label_set, :] = 0

In [6]:
# Build inputs, targets and loss masks for each split. prepare_data returns X
# with a singleton axis inserted at position 1, and y / y_mask with the three
# annotations of a file stacked along the last axis.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# One mean / std per position of the last axis, estimated on the training set
# only and reused for the validation set. The size of that axis is read from
# the data instead of being hard-coded.
# NOTE: this cell overwrites train_X / valid_X, so running it twice normalises
# the data twice - re-run the prepare_data cell first.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Pipeline applied by AudioDataset to the image version of each training
# sample: a random shift / scale / rotation with the limits given below, a
# grid distortion with its default settings, then conversion of the result to
# a tensor. The dataset calls it as transform(image=...) and reads the
# 'image' entry of the returned dict.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
BATCH_SIZE = 96

# Only the training set is augmented; the validation set is served as stored.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that mixup blends.
train_loader_1 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when PyTorch can see one and fall back to the CPU
# otherwise, so the notebook also runs on a machine without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 is the number of output logits. The training loop scores the outputs
# against labels[:, :, k], so this has to equal the size of axis 1 of
# train_y / valid_y.
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the epoch validation loss by the training
# loop; verbose=True makes it log each learning-rate reduction.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps one loss value per (sample, label) so that the
# training loop can multiply it by the mask before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
def masked_bce_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over the total mask weight.

    ``outputs`` holds one row of logits per sample; ``labels`` and ``masks``
    carry one slice per annotation set along their last axis. The logits are
    scored against every slice with ``criterion`` (expected to return
    unreduced, per-element losses), each element is weighted by its mask
    value, and the grand total is divided by ``masks.sum()``.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


epochs = 100
alpha = 1  # both shape parameters of the Beta distribution used by mixup
early_stopping_patience = 15  # epochs without a new best validation loss
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass -------------------------------------------------
    model.train()
    this_epoch_train_loss = 0.0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup: blend the two batches sample by sample with one random
        # coefficient per sample
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        # labels and masks have one axis less than the inputs
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_bce_loss(criterion, outputs, labels, masks)
            loss.backward()
            optimizer.step()
        # .item() yields a plain Python float, so the running total does not
        # depend on NumPy's scalar promotion rules
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no mixup, no gradients) ------
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Average of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so that the losses of the last
    # epoch are logged as well
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep the weights of the best epoch so far
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6543286871910096 0.5834323525428772
Epoch:  1


0.43672601342201234 0.2960379958152771
Epoch:  2


0.23043519556522368 0.20577068924903869
Epoch:  3


0.1868918663263321 0.1773516982793808
Epoch:  4


0.17758744657039643 0.18445484936237336
Epoch:  5


0.17520052909851075 0.16231802105903625
Epoch:  6


0.17295012712478638 0.15621947944164277
Epoch:  7


0.17240698039531707 0.15099193453788756
Epoch:  8


0.16962726294994354 0.1527821958065033
Epoch:  9


0.1707518082857132 0.1733124077320099
Epoch:  10


0.16947850465774536 0.16099117696285248
Epoch:  11


0.16738925695419313 0.15970514714717865
Epoch:  12


0.16745359539985658 0.1442786604166031
Epoch:  13


0.16630713284015655 0.15497478544712068
Epoch:  14


0.1639251947402954 0.14336372911930084
Epoch:  15


0.16239386558532715 0.1486599177122116
Epoch:  16


0.1636151546239853 0.13739128559827804
Epoch:  17


0.1630744767189026 0.14587612748146056
Epoch:  18


0.16290782272815704 0.13844244480133056
Epoch:  19


0.16159988343715667 0.15598607659339905
Epoch:  20


0.16236871123313903 0.15342487692832946
Epoch:  21


0.160343154668808 0.13967240750789642
Epoch:  22


0.15857169806957244 0.1458379715681076
Epoch    22: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  23


0.15863722503185274 0.13465801030397415
Epoch:  24


0.15728444159030913 0.13335544764995574
Epoch:  25


0.15712900042533876 0.13316142112016677
Epoch:  26


0.15738314747810364 0.1323407530784607
Epoch:  27


0.15577354907989502 0.13226161152124405
Epoch:  28


0.1566461890935898 0.1319911450147629
Epoch:  29


0.15556844890117646 0.13170355558395386
Epoch:  30


0.15568695664405824 0.13147073239088058
Epoch:  31


0.15635798871517181 0.13208586871623992
Epoch:  32


0.1566368317604065 0.13117985129356385
Epoch:  33


0.15591140747070312 0.13124041110277176
Epoch:  34


0.15686643064022066 0.13261274695396424
Epoch:  35


0.1540312772989273 0.13025051206350327
Epoch:  36


0.15624141991138457 0.13119718432426453
Epoch:  37


0.15625284612178802 0.1304018348455429
Epoch:  38


0.15580207109451294 0.1306810885667801
Epoch:  39


0.15576818108558654 0.1301541730761528
Epoch:  40


0.1546204787492752 0.1298504501581192
Epoch:  41


0.1545655357837677 0.13148215562105178
Epoch:  42


0.15623551547527315 0.130823914706707
Epoch:  43


0.1551105958223343 0.13046556562185288
Epoch:  44


0.1547502690553665 0.1312392070889473
Epoch:  45


0.1543239724636078 0.13059448301792145
Epoch:  46


0.15485971331596374 0.12998433262109757
Epoch    46: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  47


0.15495248198509215 0.1293675795197487
Epoch:  48


0.15290486812591553 0.12926264554262162
Epoch:  49


0.15504810988903045 0.12935971170663835
Epoch:  50


0.15465627431869508 0.12966016232967376
Epoch:  51


0.15384508848190306 0.12972710877656937
Epoch:  52


0.1537429291009903 0.12909443080425262
Epoch:  53


0.1549519717693329 0.129559525847435
Epoch:  54


0.15464443564414979 0.12922135591506959
Epoch:  55


0.15417649388313293 0.1291843369603157
Epoch:  56


0.15455546021461486 0.12909353226423265
Epoch:  57


0.15452047765254975 0.12933339178562164
Epoch:  58


0.15453279078006743 0.12902319580316543
Epoch:  59


0.15407941341400147 0.12957578152418137
Epoch:  60


0.15423597395420074 0.12863233238458632
Epoch:  61


0.15361433565616608 0.12902256846427917
Epoch:  62


0.15227414071559905 0.12920767217874526
Epoch:  63


0.1544711720943451 0.12901560217142105
Epoch:  64


0.15334204375743865 0.12920631170272828
Epoch:  65


0.15424650251865388 0.12903871834278108
Epoch:  66


0.15366355180740357 0.12909362018108367
Epoch    66: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  67


0.15369996309280395 0.129030279815197
Epoch:  68


0.15420077860355377 0.12888015061616898
Epoch:  69


0.15321365892887115 0.12889156937599183
Epoch:  70


0.1536901032924652 0.12898061871528627
Epoch:  71


0.15325831532478332 0.1293891742825508
Epoch:  72


0.15356655180454254 0.12898222208023072
Epoch    72: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  73


0.1548883056640625 0.12876279056072235
Epoch:  74


0.15446190178394317 0.12900107800960542
Epoch:  75
