In [1]:
# Parameters
# until_x: index of the last block in MobileNetV2's `features` that keeps its
# pretrained weights. Task5Model re-initialises every block with a larger index,
# and the value is also embedded in the checkpoint file name.
until_x = 2


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. The index must be named ``audio_filename``;
        the columns are label columns, including the "unknown" columns that
        appear as keys of ``unknown_to_known``. The caller's frame is not
        modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that become unreliable when that unknown label is set.
    data_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Maximum number of rows kept from each transposed array.
    n_annotations : int, optional
        Number of annotations used per file (the first ``n_annotations`` rows
        of each file, in table order).

    Returns
    -------
    X : numpy.ndarray
        Stacked arrays with shape ``(n_files, 1, frames, bins)`` where
        ``frames <= n_frames``.
    y : numpy.ndarray
        Known-label targets with shape ``(n_files, n_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the annotation set the related unknown
        label above 0.5, 1 elsewhere.

    Raises
    ------
    ValueError
        If any file has fewer than ``n_annotations`` annotation rows.
    """
    df = df.reset_index()
    # Number each file's annotations 1, 2, 3, ... in row order.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    per_file = df.groupby(level='audio_filename').size()
    if (per_file < n_annotations).any():
        raise ValueError(
            'every audio file needs at least {} annotations'.format(n_annotations))

    # Split the "unknown" columns off; they only drive the mask.
    unknown_cols = list(unknown_to_known)
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        # An annotation that set the unknown label says nothing reliable
        # about the related known labels, so they are masked out of the loss.
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so each annotation number selects one
    # row per file, sorted by file name.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.concatenate(
        [df.loc[[k], :].values[..., np.newaxis] for k in annotation_ids],
        axis=2)
    y_mask = np.concatenate(
        [y_mask.loc[[k], :].values[..., np.newaxis] for k in annotation_ids],
        axis=2)

    # Load the arrays in the same file order as the rows of y / y_mask.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :], axis=0)
        for name in filenames])
    X = np.expand_dims(X, axis=1)  # add a singleton channel axis

    return X, y, y_mask


# Module-level RandomErasing instance built with its default arguments.
# AudioDataset.__getitem__ applies it to a sample whenever a transform is set.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with labels and per-label loss weights.

    Parameters
    ----------
    X : torch.Tensor
        Samples stacked along axis 0. Each ``X[idx]`` is augmented as a
        single-channel image; axis 1 of a sample is the axis that gets
        cyclically shifted (assumes samples are ``(1, frames, bins)`` —
        TODO confirm against the caller).
    y : torch.Tensor
        Targets, indexed along axis 0 like ``X``.
    weights : torch.Tensor
        Loss weights / masks, indexed along axis 0 like ``X``.
    transform : callable or None
        Albumentations-style callable invoked as ``transform(image=array)`` and
        returning a dict with an ``'image'`` entry. When falsy, samples are
        returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of axis 0 of ``X``)."""
        return self.X.shape[0]

    @staticmethod
    def _min_max_scale(sample):
        """Rescale ``sample`` to [0, 1]; return ``(scaled, offset, span)``.

        A constant sample has ``max == min``; dividing by that zero range
        would fill the sample with NaN, so the range is replaced by 1 and the
        scaled sample is all zeros. ``scaled * span + offset`` restores the
        original values in both cases.
        """
        offset = sample.min()
        span = sample.max() - offset
        if span == 0:
            span = span + 1
        return (sample - offset) / span, offset, span

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample
        when a transform is configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation: the image conversion below presumably
            # expects values in [0, 1] — TODO confirm against ToPILImage
            sample, offset, span = self._min_max_scale(sample)

            # randomly cycle the file: rotate along axis 1 from a random start
            start = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, start:, :],
                sample[:, :start, :]],
                dim=1)

            # apply albumentations transforms on the image representation
            image = np.array(self.pil(sample))
            sample = self.transform(image=image)['image']
            # Add the channel axis back and swap the last two axes;
            # NOTE(review): this assumes the transform returns the 2-D image
            # transposed — confirm against the ToTensor in use.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * span) + offset

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load a metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy = metadata['taxonomy_df']
is_unknown = taxonomy.fine_id == 'X'
is_known = taxonomy.fine_id != 'X'

# Pair every "unknown" fine label (fine_id 'X') with the known fine labels that
# share its coarse class, then collect the known labels per unknown label.
unknown_known_pairs = pd.merge(
    taxonomy.loc[is_unknown, ['fine', 'coarse']],
    taxonomy.loc[is_known, ['fine', 'coarse']],
    on='coarse', how='inner'
).drop(columns='coarse')
unknown_to_known = (
    unknown_known_pairs
    .groupby('fine_x')['fine_y']
    .apply(lambda labels: list(labels))
    .to_dict())
known_labels = taxonomy.loc[is_known].fine.tolist()

# Coarse and fine label columns side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [5]:
# manual correction for one data point
# Rows whose label columns sum to 37 are zeroed out in place
# (presumably 37 is the total number of label columns, i.e. every label set —
# TODO confirm).
for frame in (train_df, valid_df):
    all_labels_set = frame.sum(axis=1) == 37
    frame.loc[all_labels_set, :] = 0

In [6]:
# Build inputs (X), targets (y) and loss masks (y_mask) for both splits; the
# same unknown -> known label mapping drives the masks of each split.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean and std are computed per bin of the last axis on the training set only
# and reused for the validation set, so both splits share one scale.
n_bins = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_bins).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_bins).std(axis=0).reshape(1, 1, 1, -1)
# A bin that is constant over the whole training set has zero std; dividing by
# 1 instead keeps the normalised values finite.
channel_stds[channel_stds == 0] = 1
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in order: a small random shift/scale/rotation, a grid distortion,
# and finally the conversion of the augmented image to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served as-is.
train_tensors = [torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask)]

train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size=96, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to draw the pairs of batches it mixes together.
train_loader_1 = DataLoader(train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` if it exposes ``reset_parameters``.

    Meant to be passed to ``Module.apply``; objects without a
    ``reset_parameters`` attribute are left untouched.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor adapted to single-channel inputs.

    A small stack of 1x1 convolutions maps the single input channel to three
    channels, the MobileNetV2 ``features`` stack processes the result, the
    feature map is max-pooled over its last two axes, and a two-layer head
    produces ``num_classes`` logits.

    Blocks of ``mv2.features`` whose index is greater than the module-level
    ``until_x`` are re-initialised with ``weight_reset``; the earlier blocks
    keep their pretrained weights.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels with 1x1 convolutions.
        channel_adapter = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, kernel_size=1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, kernel_size=1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*channel_adapter)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # The head takes the 1280 pooled features.
        head = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head)

        # Keep pretrained weights up to and including block `until_x`;
        # reset everything after it.
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch ``x`` of single-channel inputs."""
        feature_map = self.mv2.features(self.bw2col(x))
        # Global max pooling: reduce the last axis, then the new last axis.
        pooled, _ = feature_map.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 output logits: the training loop scores the outputs element-wise against
# labels[:, :, k], so this must equal the size of axis 1 of train_y
# (presumably len(known_labels) — TODO confirm).
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# AMSGrad variant of Adam over every model parameter.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch and lowers
# the learning rate after 5 epochs without improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)
# Element-wise (unreduced) loss so each element can be masked before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def masked_bce_loss(outputs, labels, masks, criterion):
    """Masked mean of an element-wise loss over every annotation set.

    ``labels`` and ``masks`` hold one slice per annotation on their last axis.
    The same ``outputs`` are scored against each slice with ``criterion``
    (which must return an unreduced, element-wise loss), every element is
    weighted by its mask, and the total is divided by the sum of the masks.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
alpha = 1                     # Beta(alpha, alpha) parameter of the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0.0
    # The two loaders shuffle the same dataset independently, so each step
    # mixes two different random batches.
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One mixing weight per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks, criterion)
            loss.backward()
            optimizer.step()
        # .item() gives a Python float, so the running sum is accumulated in
        # double precision regardless of the numpy version.
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks, criterion)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the unweighted mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint only the best model seen so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping; note that the losses of the stopping epoch are recorded
    # in the histories but not printed.
    if epochs_without_new_lowest >= early_stopping_patience:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6503826189041138 0.5627627730369568
Epoch:  1


0.43302180409431457 0.26539074778556826
Epoch:  2


0.22852799713611602 0.19692403078079224
Epoch:  3


0.18475649416446685 0.3065942466259003
Epoch:  4


0.177573863863945 0.1695920616388321
Epoch:  5


0.17439380466938018 0.17513583600521088
Epoch:  6


0.17160847067832946 0.1568127304315567
Epoch:  7


0.16893279314041137 1.1454694867134094
Epoch:  8


0.16993558704853057 0.15556360483169557
Epoch:  9


0.16798988819122315 0.15299683511257173
Epoch:  10


0.1666567075252533 0.14546068608760834
Epoch:  11


0.16635370910167693 0.14504075646400452
Epoch:  12


0.16531588971614838 0.1469820111989975
Epoch:  13


0.16494353175163268 0.14578846096992493
Epoch:  14


0.16406561613082885 0.14430294632911683
Epoch:  15


0.16415078938007355 0.14405450820922852
Epoch:  16


0.16198182582855225 0.14034900367259978
Epoch:  17


0.16243912220001222 0.14061622321605682
Epoch:  18


0.1619707316160202 0.14212168157100677
Epoch:  19


0.16017780780792237 0.13775486648082733
Epoch:  20


0.15966288447380067 0.13821967840194702
Epoch:  21


0.15907949984073638 0.13537538945674896
Epoch:  22


0.15878795802593232 0.13247872442007064
Epoch:  23


0.1579084461927414 0.1319470852613449
Epoch:  24


0.15857396841049196 0.13861316442489624
Epoch:  25


0.15776340901851654 0.13111107796430588
Epoch:  26


0.15836692452430726 0.1331631734967232
Epoch:  27


0.1576722139120102 0.1352653130888939
Epoch:  28


0.15854275107383728 0.1356338828802109
Epoch:  29


0.1557937890291214 0.13348150998353958
Epoch:  30


0.1567878645658493 0.12987834066152573
Epoch:  31


0.1558743506669998 0.13187503516674043
Epoch:  32


0.15390956103801728 0.12952140271663665
Epoch:  33


0.15397088587284088 0.13257379680871964
Epoch:  34


0.15543702483177185 0.130134479701519
Epoch:  35


0.15354542195796966 0.1290270447731018
Epoch:  36


0.15276631832122803 0.12832626551389695
Epoch:  37


0.15267661154270173 0.1314850255846977
Epoch:  38


0.1534235519170761 0.1331736534833908
Epoch:  39


0.15367596983909607 0.12896399050951005
Epoch:  40


0.15203442454338073 0.12903160154819487
Epoch:  41


0.15329118967056274 0.12871628254652023
Epoch:  42


0.1525617653131485 0.128507362306118
Epoch    42: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  43


0.15187288105487823 0.1257081925868988
Epoch:  44


0.15047051548957824 0.12511446326971054
Epoch:  45


0.1498885601758957 0.12483199238777161
Epoch:  46


0.1500914579629898 0.12479897141456604
Epoch:  47


0.14976615011692046 0.1247070699930191
Epoch:  48


0.150844886302948 0.125098916888237
Epoch:  49


0.14912760972976685 0.12456323504447937
Epoch:  50


0.14937748849391938 0.12423686236143112
Epoch:  51


0.14891734063625336 0.12423840016126633
Epoch:  52


0.14842282593250275 0.12468475252389907
Epoch:  53


0.1477559471130371 0.12442714124917983
Epoch:  54


0.14878777027130127 0.12449538707733154
Epoch:  55


0.147899090051651 0.12501713037490844
Epoch:  56


0.14781049907207489 0.12463330030441284
Epoch    56: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  57


0.14789870619773865 0.12442089319229126
Epoch:  58


0.14824595749378205 0.12434835582971573
Epoch:  59


0.14883545219898223 0.12417117804288864
Epoch:  60


0.14822565793991088 0.12433676570653915
Epoch:  61


0.14826203346252442 0.124165478348732
Epoch:  62


0.14877331137657165 0.12408123463392258
Epoch:  63


0.14878800570964812 0.1243191733956337
Epoch:  64


0.14812885999679565 0.12407634407281876
Epoch:  65


0.14792850017547607 0.12438485771417618
Epoch:  66


0.14866222381591798 0.12426240593194962
Epoch:  67


0.14767113626003264 0.12418492138385773
Epoch:  68


0.14787052392959596 0.12429383248090745
Epoch    68: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  69


0.1480274146795273 0.12412910461425782
Epoch:  70


0.14810058534145354 0.12417230755090714
Epoch:  71


0.14902882814407348 0.12412609606981277
Epoch:  72


0.14856787025928497 0.12413323372602462
Epoch:  73


0.14799357950687408 0.12424558848142624
Epoch:  74


0.14854374647140503 0.12416302710771561
Epoch    74: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  75


0.14798919022083282 0.12409057915210724
Epoch:  76


0.14931531190872194 0.12427627742290497
Epoch:  77


0.1487017524242401 0.12436246275901794
Epoch:  78


0.1477186745405197 0.12426326721906662
Epoch:  79


0.14814718902111054 0.1243025466799736
Epoch:  80


0.1479215842485428 0.12431416213512421
Epoch    80: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  81


0.14939987599849702 0.1244481086730957
Epoch:  82


0.14944648146629333 0.12424779385328293
Epoch:  83


0.1477648824453354 0.12413091361522674
Epoch:  84


0.14812242984771729 0.12420864105224609
Epoch:  85


0.14725703179836272 0.12403445988893509
Epoch:  86


0.14888474464416504 0.1241547241806984
Epoch:  87


0.14803578436374665 0.12420102059841157
Epoch:  88


0.14866901695728302 0.12422135472297668
Epoch:  89


0.14791036427021026 0.12417863309383392
Epoch:  90


0.149217888712883 0.1241076946258545
Epoch:  91


0.14806274712085724 0.1240760400891304
Epoch:  92


0.14899651527404786 0.1240509495139122
Epoch:  93


0.1478257530927658 0.12412291318178177
Epoch:  94


0.14952123165130615 0.12418544292449951
Epoch:  95


0.14825773894786834 0.12422669529914857
Epoch:  96


0.14859852254390715 0.1242931216955185
Epoch:  97


0.14845252335071563 0.12419487535953522
Epoch:  98


0.14863061368465424 0.12422053217887878
Epoch:  99


0.1481653481721878 0.124067784845829
