In [1]:
# Parameters
# until_x: Task5Model re-initialises every MobileNetV2 feature block whose
# index is <= until_x; the value is also embedded in the checkpoint filename
# written by the training loop.
until_x = 9


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, n_annotations=3, n_frames=635,
                 spec_dir='../../data/logmelspec'):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. The index is reset and the resulting
        ``audio_filename`` column identifies the file each row belongs to.
        All remaining columns are label columns and must include every key
        of ``unknown_to_known``. The caller's frame is not modified.
    unknown_to_known : dict
        Maps an "unknown" label column to the list of known label columns
        that are masked out (mask value 0) on rows where the unknown column
        is greater than 0.5. The unknown columns themselves are dropped from
        the targets.
    n_annotations : int, optional
        Number of annotation rows used per file. Rows of a file are numbered
        1..k in order of appearance and rows 1..n_annotations are kept, so
        every file needs at least this many rows. Defaults to 3.
    n_frames : int, optional
        Each loaded spectrogram is transposed and truncated to its first
        ``n_frames`` rows. Defaults to 635.
    spec_dir : str, optional
        Directory containing one ``<audio_filename>.npy`` array per file.

    Returns
    -------
    X : numpy.ndarray
        Stacked spectrograms with a singleton axis inserted at position 1,
        ordered by sorted ``audio_filename``.
    y : numpy.ndarray
        Targets of shape (n_files, n_label_columns, n_annotations).
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target contributes to the loss and
        0 where it was masked by an "unknown" annotation.
    """
    df = df.reset_index()
    # Number the annotation rows of each file 1, 2, 3, ... in order of
    # appearance so they can be addressed as an index level.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that an
    # annotator flagged as "unknown".
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put slno first and sort, so that every per-annotation slice lists the
    # files in the same (sorted) order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack(
        [y_mask.loc[[k], :].values for k in annotation_ids], axis=2)

    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level augmentation instance built with the defaults of the
# project-local RandomErasing class (its definition is not part of this
# notebook). AudioDataset.__getitem__ applies it to every transformed sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (sample, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Samples, indexed along the first axis. The augmentation path cycles
        along axis 1 of a single sample and indexes it with three axes, so a
        sample must be 3-D there.
    y : torch.Tensor
        Targets, indexed along the first axis.
    weights : torch.Tensor
        Per-target weights (loss masks), indexed along the first axis.
    transform : callable, optional
        Called as ``transform(image=array)`` and expected to return a mapping
        with an ``'image'`` tensor. When falsy, samples are returned
        untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting if enabled."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation to [0, 1]
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole sample into NaN. Scale by 1 instead: the sample becomes
            # all zeros and the inverse step below restores the constant.
            if value_range > 0:
                scale = value_range
            else:
                scale = torch.ones_like(value_range)
            sample = (sample - this_min) / scale

            # randomly cycle the file: circular shift along axis 1
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they take a NumPy image, so
            # go through a PIL image first)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the leading singleton axis and the original axis order
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level ``random_erasing`` instance)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Re-initialise ``layer`` if it exposes ``reset_parameters``.

    Objects without that attribute are left untouched, which makes the
    function safe to pass to ``nn.Module.apply``.
    """
    missing = object()
    reset = getattr(layer, 'reset_parameters', missing)
    if reset is missing:
        return
    reset()


class Task5Model(nn.Module):
    """MobileNetV2 feature extractor adapted to single-channel inputs.

    Pipeline: a 1x1-convolution stem (``bw2col``) maps the single input
    channel to 3 channels, ``mv2.features`` extracts feature maps, a global
    max over the last two axes pools them, and ``final`` maps the 1280
    pooled features to ``num_classes`` raw outputs (no activation applied).

    Parameters
    ----------
    num_classes : int
        Size of the output layer.
    reset_until : int, optional
        Every child of ``mv2.features`` with index <= ``reset_until`` has its
        parameters re-initialised via ``weight_reset``; later children keep
        the pretrained weights. When omitted, the notebook-level ``until_x``
        parameter is used, which was the only behaviour previously available.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        if reset_until is None:
            # Backward-compatible default: fall back to the notebook parameter.
            reset_until = until_x

        # Normalise the single input channel, then expand 1 -> 10 -> 3
        # channels with 1x1 convolutions.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        # NOTE(review): newer torchvision releases deprecate ``pretrained=``
        # in favour of ``weights=``; kept as-is for the pinned environment.
        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Reset until ith layer of mv2
        for i, x in enumerate(self.mv2.features.children()):
            if i <= reset_until:
                x.apply(weight_reset)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

    def forward(self, x):
        """Return the ``num_classes`` raw outputs for a batch ``x``."""
        x = self.bw2col(x)
        # Only the convolutional feature extractor of MobileNetV2 is used.
        x = self.mv2.features(x)
        # Global max pooling: max over the last axis, then over the new last.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load metadata.pkl from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map every fine label whose fine_id is 'X' to the list of fine labels with
# fine_id != 'X' that share its coarse category. The self-merge on 'coarse'
# yields the columns fine_x (fine_id == 'X' side) and fine_y (other side).
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# All fine labels whose fine_id is not 'X'.
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Place coarse and fine label columns side by side; the validation frame is
# built from the '*_test' entries of the metadata.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Rows whose label columns sum to exactly 37 are zeroed in both splits.
for labels_df in (train_df, valid_df):
    labels_df.loc[labels_df.sum(axis=1).eq(37), :] = 0

In [6]:
# Turn both annotation tables into arrays: X holds the stacked spectrograms
# (singleton axis at position 1), y the targets with one trailing slot per
# annotation, and y_mask the matching 0/1 loss mask.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed per position of the last axis, on the training set
# only, and reused unchanged for the validation set. The size of that axis is
# read from the data instead of being hard-coded.
# NOTE: this cell overwrites train_X / valid_X; re-running it normalises the
# already-normalised arrays again.
channel_means = train_X.reshape(-1, train_X.shape[-1]).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, train_X.shape[-1]).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# This pipeline is passed only to the training dataset; the validation
# dataset is created with transform=None. ToTensor() comes last so that the
# result stored under the 'image' key is a tensor, which
# AudioDataset.__getitem__ then permutes back to the model's axis order.
# NOTE(review): units of the limits (fractions / degrees) follow the
# albumentations documentation - confirm for the installed version.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# The NumPy arrays are converted to tensors up front. Only the training
# dataset receives the augmentation pipeline.
train_dataset = AudioDataset(torch.Tensor(train_X),
                             torch.Tensor(train_y),
                             torch.Tensor(train_y_mask),
                             albumentations_transform)
valid_dataset = AudioDataset(torch.Tensor(valid_X),
                             torch.Tensor(valid_y),
                             torch.Tensor(valid_y_mask),
                             None)

# All loaders use batches of 96. Two independently shuffled loaders over the
# same training dataset are zipped in the training loop to supply the pairs
# of batches that mixup blends together.
val_loader = DataLoader(valid_dataset, 96, shuffle=False)
train_loader_1 = DataLoader(train_dataset, 96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, 96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 is the size of the output layer; it has to match the number of label
# columns in the targets produced by prepare_data, because the training loop
# compares the outputs element-wise with labels[:, :, k].
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) starting at lr=0.001.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch and lowers
# the learning rate after 5 epochs without improvement; verbose=True produces
# the "reducing learning rate" lines in the training output.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the per-element losses so the training loop can
# multiply them by the masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
def _masked_loss(outputs, labels, masks):
    """Masked BCE averaged over all annotation sets.

    ``labels`` and ``masks`` carry one slot per annotation along their last
    axis. The element-wise ``criterion`` loss of ``outputs`` against each
    slot is weighted by the matching mask, summed, and divided by the total
    mask weight.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model = model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently; zipping them
    # yields the pairs of batches that mixup blends.
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One mixing coefficient per sample, drawn from Beta(alpha, alpha).
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        # Reshaped to broadcast over the 4-D inputs ...
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        # ... and over the 3-D labels and masks.
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = _masked_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model = model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the final epoch's losses are
    # shown as well.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping: give up after 15 epochs without a new best.
    if epochs_without_new_lowest >= 15:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6587359142303467 0.5883835196495056
Epoch:  1


0.4392207193374634 0.2918616235256195
Epoch:  2


0.2292348998785019 0.18838481605052948
Epoch:  3


0.1857980263233185 0.16497713029384614
Epoch:  4


0.17677350640296935 0.17220637202262878
Epoch:  5


0.17405702531337738 0.15697377920150757
Epoch:  6


0.17239597260951997 0.36321288645267485
Epoch:  7


0.17035561740398408 0.16935224533081056
Epoch:  8


0.16973202764987946 0.18366333842277527
Epoch:  9


0.16872823417186736 0.1725533425807953
Epoch:  10


0.16680540263652802 0.1496794492006302
Epoch:  11


0.166375852227211 0.1683986872434616
Epoch:  12


0.16506725788116455 0.1507927656173706
Epoch:  13


0.16401248335838317 0.1539863109588623
Epoch:  14


0.16310350775718688 0.16299068033695222
Epoch:  15


0.16303162932395934 0.15354839861392974
Epoch:  16


0.16280887246131898 0.14312218725681305
Epoch:  17


0.16178772568702698 0.1402403384447098
Epoch:  18


0.16120694279670716 0.14089326560497284
Epoch:  19


0.15965511679649352 0.141723108291626
Epoch:  20


0.16004110515117645 0.13798002302646636
Epoch:  21


0.15888152539730072 0.1411157429218292
Epoch:  22


0.15975120663642883 0.13670848906040192
Epoch:  23


0.1593480771780014 0.1360570967197418
Epoch:  24


0.15829259574413299 0.14973950982093812
Epoch:  25


0.1573382806777954 0.14156381487846376
Epoch:  26


0.15812863349914552 0.14182053804397582
Epoch:  27


0.15693121314048766 0.13525510281324388
Epoch:  28


0.15832501113414765 0.1602180629968643
Epoch:  29


0.15592752277851105 0.1352705791592598
Epoch:  30


0.15662881314754487 0.1318500444293022
Epoch:  31


0.1551241821050644 0.16206844747066498
Epoch:  32


0.1562410092353821 0.13368071913719176
Epoch:  33


0.1543529200553894 0.13326553255319595
Epoch:  34


0.15518709361553193 0.13866526484489441
Epoch:  35


0.1547038722038269 0.13290049731731415
Epoch:  36


0.15548249065876008 0.1773837298154831
Epoch    36: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  37


0.1522914409637451 0.12666646242141724
Epoch:  38


0.15216138899326326 0.12626692950725554
Epoch:  39


0.1515072041749954 0.12611613869667054
Epoch:  40


0.15265326976776122 0.1258099317550659
Epoch:  41


0.15234839498996736 0.12721793353557587
Epoch:  42


0.1515764808654785 0.12610256522893906
Epoch:  43


0.15181540369987487 0.12581271529197693
Epoch:  44


0.15158931493759156 0.12548792660236358
Epoch:  45


0.15108478784561158 0.12619297802448273
Epoch:  46


0.1509614509344101 0.1264099135994911
Epoch:  47


0.15177266240119935 0.12588272243738174
Epoch:  48


0.1506862610578537 0.12554099410772324
Epoch:  49


0.15030734598636628 0.12552817314863204
Epoch:  50


0.15291886687278747 0.12575821876525878
Epoch    50: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  51


0.15011807799339294 0.12544677406549454
Epoch:  52


0.15015196323394775 0.12543182373046874
Epoch:  53


0.1504284507036209 0.12553820312023162
Epoch:  54


0.15021516740322113 0.12536699920892716
Epoch:  55


0.15104528963565828 0.1253685861825943
Epoch:  56


0.14973298847675323 0.12544019669294357
Epoch:  57


0.1511532473564148 0.12542904168367386
Epoch:  58


0.1501842510700226 0.12550510913133622
Epoch:  59


0.1505367398262024 0.12539847195148468
Epoch:  60


0.15078581392765045 0.12543295174837113
Epoch    60: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  61


0.15000875651836396 0.12556353360414504
Epoch:  62


0.15196162581443787 0.12533216029405594
Epoch:  63


0.1510659223794937 0.12514224797487258
Epoch:  64


0.1498968195915222 0.12530662566423417
Epoch:  65


0.1491846776008606 0.12535765767097473
Epoch:  66


0.15036206543445588 0.12525895833969117
Epoch:  67


0.15147121489048004 0.12538850158452988
Epoch:  68


0.15094719111919402 0.1255550801753998
Epoch:  69


0.1493292647600174 0.12545238733291625
Epoch    69: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  70


0.15036742389202118 0.12521835565567016
Epoch:  71


0.14996911942958832 0.1252521365880966
Epoch:  72


0.14977041006088257 0.12534946501255034
Epoch:  73


0.15052822828292847 0.12531960606575013
Epoch:  74


0.1500977087020874 0.12555303424596786
Epoch:  75


0.15174854099750518 0.12564848661422728
Epoch    75: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  76


0.15085187077522277 0.12548169642686843
Epoch:  77


0.15067133903503419 0.12536414861679077
Epoch:  78
