In [1]:
# Parameters
# Index of the last child of `mv2.features` that Task5Model re-initialises
# (children 0..until_x are reset); also embedded in the checkpoint file name.
until_x = 3


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, n_annotations=3, n_frames=635,
                 spec_path_template='../../data/logmelspec/{}.npy'):
    """Turn an annotation frame into model inputs, targets and loss masks.

    Rows of `df` that share the same `audio_filename` index value are numbered
    1, 2, ... in their order of appearance; each number is one annotation slot.

    Parameters
    ----------
    df : pandas.DataFrame
        Label frame indexed by `audio_filename` (the index must carry that
        name), with one row per annotation of a file.
    unknown_to_known : dict
        Maps a column of `df` to the column label(s) whose mask is set to 0
        wherever that column exceeds 0.5. The key columns are removed from the
        targets.
    n_annotations : int, optional
        Number of annotation slots stacked along the last axis of `y` and
        `y_mask` (default 3). Every slot must contain the same set of files,
        otherwise stacking fails.
    n_frames : int, optional
        Number of leading rows kept from each transposed spectrogram
        (default 635).
    spec_path_template : str, optional
        `str.format` template that receives the audio file name and yields the
        path of its `.npy` spectrogram.

    Returns
    -------
    X : numpy.ndarray
        Spectrograms stacked as (files, 1, rows, columns), files in sorted
        file-name order.
        NOTE(review): assumes each stored array is 2-D with time on its second
        axis, so that the transpose puts time first -- confirm.
    y : numpy.ndarray
        Targets shaped (files, labels, n_annotations).
    y_mask : numpy.ndarray
        Same shape as `y`; 1 everywhere except the entries masked through
        `unknown_to_known`, which are 0.
    """
    # Number the annotations of each file: 1, 2, 3, ...
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns from the actual targets.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols]
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that an
    # "unknown" annotation (> 0.5) makes unreliable.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slot, file) so each slot can be pulled out as one sorted slab.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    # File order is taken from slot 1, which is sorted by file name.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load(spec_path_template.format(name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance built with its default arguments;
# AudioDataset.__getitem__ applies it to a sample whenever a transform is set.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (sample, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis; each item is treated as a
        (1, rows, columns) tensor by the augmentation path.
    y, weights : indexable
        Targets and per-entry weights, indexed in step with `X`.
    transform : callable or None
        Called as `transform(image=array)` and expected to return a mapping
        with an `'image'` entry. When falsy, samples are returned untouched.

    The augmentation path also uses the module-level `random_erasing` callable.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of items, i.e. the size of the first axis of `X`."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return `(sample, y[idx], weights[idx])`, augmenting `sample` if a transform is set."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation; a constant sample has zero range, which
            # would turn the division into NaN, so fall back to a unit range
            this_min = sample.min()
            value_range = sample.max() - this_min
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            shift = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, shift:, :],
                sample[:, :shift, :]],
                dim=1)

            # apply albumentations transforms (they operate on a numpy image)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add the leading axis back and swap the last two axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation with the same range used above
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Re-initialise `layer` through its own `reset_parameters`, if it has one.

    Objects that do not expose `reset_parameters` are left untouched, which
    makes the function usable as a callback for `nn.Module.apply`.
    """
    try:
        reinitialise = layer.reset_parameters
    except AttributeError:
        return
    reinitialise()


class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel inputs.

    Pipeline: a 1x1-convolution stem maps the single input channel to three
    channels, the MobileNetV2 feature extractor runs on that, the feature map
    is max-pooled over its last two axes, and a small fully connected head
    produces `num_classes` raw outputs (no final activation is applied).

    Parameters
    ----------
    num_classes : int
        Size of the output layer.
    reset_until : int or None, optional
        Children of `mv2.features` with index <= `reset_until` get their
        weights re-initialised, discarding the pretrained values there.
        When None (default) the module-level `until_x` is used, which is what
        the constructor always did before this parameter existed.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        if reset_until is None:
            # Backward-compatible default: fall back to the notebook parameter.
            reset_until = until_x

        # 1 -> 10 -> 3 channels with 1x1 convolutions, so that the 3-channel
        # backbone can consume a single-channel input.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        # NOTE(review): newer torchvision releases replace `pretrained=` with
        # `weights=`; left as is to match the version this notebook targets.
        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Reset until ith layer of mv2
        for i, x in enumerate(self.mv2.features.children()):
            if i <= reset_until:
                x.apply(weight_reset)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

    def forward(self, x):
        """Return one raw output per class for a batch of single-channel 4-D inputs."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis, then the new last axis.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [4]:
# Load and prepare data
# NOTE(review): pickle.load executes arbitrary code from the file; only use a
# metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map every fine label whose fine_id is 'X' to the list of fine labels with
# the same 'coarse' value whose fine_id is not 'X'. Both merge inputs have a
# 'fine' column, so the merged frame exposes them as 'fine_x' / 'fine_y'.
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# All fine labels whose fine_id is not 'X' (not referenced in the cells visible here).
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Put coarse and fine label columns side by side; the validation frame is
# built from the '*_test' entries of the metadata.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Every row whose labels sum to exactly 37 has all of its labels set to 0,
# in both splits.
# NOTE(review): 37 is presumably the total number of label columns, i.e. a row
# with every label switched on -- confirm against the frame's width.
train_df.loc[(train_df.sum(axis=1) == 37).copy(), :] = 0
valid_df.loc[(valid_df.sum(axis=1) == 37).copy(), :] = 0

In [6]:
# Build inputs, targets and loss masks for both splits; prepare_data loads one
# .npy spectrogram per audio file, so this cell reads from disk.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean and standard deviation are computed per position of the last axis, on
# the training split only, and then applied to both splits. The channel count
# is read from the data instead of being hard-coded.
# NOTE: train_X / valid_X are overwritten, so re-running this cell normalises
# the already normalised arrays again; re-run the prepare_data cell first.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in order by AudioDataset to each training sample: a small random
# shift/scale/rotate, a grid distortion, then conversion to a torch tensor.
# NOTE(review): rotate_limit is presumably expressed in degrees, which would
# make 0.5 a very small rotation -- confirm this is intended.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; the validation dataset gets no transform.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

# Two independently shuffled loaders over the same training data: the training
# loop zips them to obtain the pairs of batches it blends together.
val_loader = DataLoader(valid_dataset, batch_size=96, shuffle=False)
train_loader_1 = DataLoader(train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook still runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# One output per label column of the prepared targets (axis 1 of train_y):
# the loss compares the model output with label slices of exactly that width,
# so the size is derived from the data rather than hard-coded.
model = Task5Model(train_y.shape[1]).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad variant over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the epoch's validation loss in the training loop.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps per-element losses so they can be multiplied by the
# masks before being summed.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
early_stopping_patience = 15  # epochs without a new best validation loss before stopping
mixup_alpha = 1  # both parameters of the Beta distribution the mixup weights are drawn from
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def _masked_annotation_loss(outputs, labels, masks):
    """Masked loss averaged over the unmasked label entries.

    `outputs` has one value per (sample, class) and is compared against every
    annotation slice along the last axis of `labels`. Each element-wise loss is
    multiplied by the matching entry of `masks`, and the total is divided by
    `masks.sum()`.
    """
    logits = outputs.unsqueeze(-1).expand_as(labels)
    return (criterion(logits, labels) * masks).sum() / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for (x1, y1, m1), (x2, y2, m2) in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches with one random
        # weight per sample; targets and masks are blended the same way
        lam = torch.Tensor(np.random.beta(mixup_alpha, mixup_alpha, x1.shape[0]))
        lam_x = lam.view(-1, 1, 1, 1)
        lam_y = lam.view(-1, 1, 1)
        inputs = ((lam_x * x1) + ((1 - lam_x) * x2)).to(device)
        labels = ((lam_y * y1) + ((1 - lam_y) * y2)).to(device)
        masks = ((lam_y * m1) + ((1 - lam_y) * m2)).to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = _masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no mixup, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Average of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6567560577392578 0.58302001953125
Epoch:  1


0.4414819717407227 0.2995952069759369
Epoch:  2


0.22573905766010285 0.16448284387588502
Epoch:  3


0.1766502106189728 0.15945940911769868
Epoch:  4


0.16913988292217255 0.15038475394248962
Epoch:  5


0.16447876811027526 0.1527401328086853
Epoch:  6


0.16297100722789765 0.1407250553369522
Epoch:  7


0.15956954896450043 0.1393130362033844
Epoch:  8


0.15874445736408233 0.1412087082862854
Epoch:  9


0.15899997353553771 0.14035780727863312
Epoch:  10


0.15847420394420625 0.13813005685806273
Epoch:  11


0.15803078889846803 0.13173198252916335
Epoch:  12


0.15674375951290132 0.1371247425675392
Epoch:  13


0.15524216175079344 0.13523351550102233
Epoch:  14


0.15598948299884796 0.13577932715415955
Epoch:  15


0.1568218672275543 0.13277364224195481
Epoch:  16


0.15472049236297608 0.13304833620786666
Epoch:  17


0.15462761998176575 0.1315385565161705
Epoch:  18


0.15463962018489838 0.1350701108574867
Epoch:  19


0.15345924437046052 0.13310312926769258
Epoch:  20


0.1528605830669403 0.13084886223077774
Epoch:  21


0.15222039997577666 0.13063486516475678
Epoch:  22


0.15277581214904784 0.12820155322551727
Epoch:  23


0.15252468645572662 0.14128448963165283
Epoch:  24


0.1535150068998337 0.12922823429107666
Epoch:  25


0.15208152711391448 0.12726948261260987
Epoch:  26


0.15123666763305665 0.1280772104859352
Epoch:  27


0.15056484937667847 0.12779928296804427
Epoch:  28


0.1501064383983612 0.12577348500490187
Epoch:  29


0.15159461677074432 0.12818150967359543
Epoch:  30


0.1500217056274414 0.12707479298114777
Epoch:  31


0.14955231487751008 0.12612724155187607
Epoch:  32


0.15130705416202544 0.12979990243911743
Epoch:  33


0.14896874129772186 0.12796325832605362
Epoch:  34


0.14949635088443755 0.1285214900970459
Epoch    34: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  35


0.1470815658569336 0.12342004328966141
Epoch:  36


0.14793681502342224 0.12268780767917634
Epoch:  37


0.14634529769420623 0.1229078158736229
Epoch:  38


0.1462879067659378 0.12270490527153015
Epoch:  39


0.14603114187717436 0.1226965069770813
Epoch:  40


0.1463611614704132 0.1225129321217537
Epoch:  41


0.14579423785209655 0.12213049679994584
Epoch:  42


0.14514870464801788 0.12254151403903961
Epoch:  43


0.14492656350135802 0.12271147817373276
Epoch:  44


0.14431745886802674 0.12234140336513519
Epoch:  45


0.1462956440448761 0.12245650887489319
Epoch:  46


0.14511637806892394 0.1226694330573082
Epoch:  47


0.14445168972015382 0.12230780273675919
Epoch    47: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  48


0.1447041916847229 0.12212101817131042
Epoch:  49


0.14472612500190735 0.12210755795240402
Epoch:  50


0.14368273437023163 0.12219059467315674
Epoch:  51


0.14489708364009857 0.12223571538925171
Epoch:  52


0.1452515208721161 0.1224021553993225
Epoch:  53


0.14427359223365785 0.12209633737802505
Epoch:  54


0.1438467162847519 0.12237496078014373
Epoch:  55


0.14488088428974152 0.12236437052488328
Epoch    55: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  56


0.1450035762786865 0.12223707288503646
Epoch:  57


0.1447390902042389 0.12237509489059448
Epoch:  58


0.14482672691345214 0.12229903191328048
Epoch:  59


0.1446194714307785 0.12227283120155334
Epoch:  60


0.14409579932689667 0.12227837443351745
Epoch:  61


0.1451320868730545 0.12219811081886292
Epoch    61: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  62


0.14378900766372682 0.1225961059331894
Epoch:  63


0.1446105271577835 0.12249300181865692
Epoch:  64


0.1456017291545868 0.12231151908636093
Epoch:  65


0.14528596460819243 0.12217615395784379
Epoch:  66


0.14548191785812378 0.12231216579675674
Epoch:  67


0.14370192348957062 0.1222723364830017
Epoch    67: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  68
