In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table. After ``reset_index()`` it must expose an
        ``audio_filename`` column (i.e. the index is named ``audio_filename``),
        with several rows per file -- presumably one row per annotator.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        it makes uncertain. The unknown columns are removed from the targets.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per file.
    n_frames : int, optional
        Number of leading frames kept from every (transposed) spectrogram.
        Files with fewer frames cannot be stacked and make ``np.stack`` fail.
    n_annotations : int, optional
        Number of annotation rows used per file (rows numbered
        ``1..n_annotations`` in order of appearance).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_features)``; assumes each ``.npy``
        file is stored as ``(n_features, time)`` -- TODO confirm.
    y : numpy.ndarray
        Shape ``(n_files, n_labels, n_annotations)`` with the label values.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 everywhere except 0 where the matching
        "unknown" column of that annotation row is > 0.5.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number each file's annotation rows 1, 2, 3, ... in order of appearance.
    df = df.reset_index()
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns away from the real targets.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start with every target trusted, then switch off the known labels that a
    # positive "unknown" flag makes unreliable for that annotation row.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so every annotation slot lists the files
    # in the same sorted order.
    df = df.swaplevel(0, 1).sort_index()
    y_mask = y_mask.swaplevel(0, 1).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    # Load the spectrograms in the same file order as the rows of y / y_mask.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack(
        [np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
         for name in filenames],
        axis=0)
    X = np.expand_dims(X, axis=1)  # add the single image channel

    return X, y, y_mask


# Module-level RandomErasing instance (constructed with its defaults) that
# AudioDataset applies to samples after the albumentations transforms.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (spectrogram, targets, loss weights) triples.

    When a ``transform`` is supplied every fetched spectrogram is augmented
    on the fly; without one the stored tensors are returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        """Store the tensors and the optional albumentations-style transform.

        ``transform`` is called as ``transform(image=...)`` and must return a
        mapping with an ``'image'`` entry.
        """
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples, taken from the first dimension of ``X``."""
        return self.X.shape[0]

    def _augment(self, sample):
        """Return a randomly augmented copy of ``sample`` in its original value range."""
        # min-max scale so the sample can round-trip through an image
        lo = sample.min()
        hi = sample.max()
        span = hi - lo
        scaled = (sample - lo) / span

        # cyclic shift along dim 1 by a random offset
        offset = np.random.randint(scaled.shape[1])
        head = scaled[:, offset:, :]
        tail = scaled[:, :offset, :]
        rolled = torch.cat([head, tail], dim=1)

        # albumentations works on numpy images, hence the PIL detour
        image = np.array(self.pil(rolled))
        augmented = self.transform(image=image)['image']
        augmented = augmented[None, :, :].permute(0, 2, 1)

        # random erasing is applied to a detached copy
        erased = random_erasing(augmented.clone().detach())

        # undo the min-max scaling
        return (erased * span) + lo

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample if configured."""
        sample = self.X[idx, ...]
        if self.transform:
            sample = self._augment(sample)
        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: pickle.load executes arbitrary code -- only open a metadata.pkl you trust.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the "unknown" fine labels (fine_id == 'X') and the rest.
taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# For every "unknown" fine label, list the known fine labels that share its coarse class.
same_coarse_pairs = unknown_rows.merge(known_rows, on='coarse', how='inner')
same_coarse_pairs = same_coarse_pairs.drop(columns='coarse')
unknown_to_known = (
    same_coarse_pairs
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = taxonomy.loc[taxonomy.fine_id != 'X'].fine.tolist()

# Put coarse and fine annotations side by side for each split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Rows whose labels sum to exactly 37 are zeroed out in both splits
# (presumably 37 means "every label set" -- verify against the column count).
for split_df in (train_df, valid_df):
    all_set_rows = split_df.sum(axis=1) == 37
    split_df.loc[all_set_rows, :] = 0

In [5]:
# Build spectrogram inputs, targets and loss masks for the training and validation splits.
train_X, train_y, train_y_mask = prepare_data(
    df=train_df, unknown_to_known=unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(
    df=valid_df, unknown_to_known=unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics are computed over the last axis (128 channels) of the training
# set only and then applied to both splits.
train_X_flat = train_X.reshape(-1, 128)
channel_means = train_X_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X_flat.std(axis=0).reshape(1, 1, 1, -1)
del train_X_flat  # do not keep the un-normalised training array alive

train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
augmentation_steps = [
    # small random shift / zoom / rotation
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library defaults
    GridDistortion(),
    # convert the augmented image back to a torch tensor
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain pairs of batches for mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook does not fail later at `.to(device)` on a CPU-only machine.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it provides ``reset_parameters()``.

    Objects without that method are left untouched, which makes the function
    suitable for ``nn.Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default for the notebook-level `until_x` setting; the "# Parameters" cell
# that follows reassigns it. Task5Model reads this global when it is built.
# NOTE(review): looks like a papermill-style parameter default -- confirm.
until_x = None

In [12]:
# Parameters
# Value actually used for this run. Task5Model re-initialises the children of
# mv2.features whose index is <= until_x, and the checkpoint file name embeds it.
until_x = 0


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    Pipeline: a learned 1 -> 3 channel adapter (``bw2col``), the
    ``features`` part of torchvision's pretrained MobileNetV2, a global max
    over the two trailing dimensions, and a small fully connected head that
    emits ``num_classes`` logits.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_until : int or None, optional
        Children of ``mv2.features`` with index ``<= reset_until`` are
        re-initialised (their pretrained weights are discarded). When omitted,
        the notebook-level ``until_x`` is used, as before. If the effective
        value is ``None``, no layer is reset; previously a ``None`` value
        failed with a ``TypeError`` on ``i <= None``.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # Map the single input channel to the 3 channels MobileNetV2 expects.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # 1280 = channel count produced by mv2.features.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Fall back to the notebook parameter when no explicit value was given.
        if reset_until is None:
            reset_until = until_x

        # Reset until ith layer of mv2 (None means: keep every pretrained layer).
        if reset_until is not None:
            for i, layer in enumerate(self.mv2.features.children()):
                if i > reset_until:
                    break
                layer.apply(weight_reset)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for input ``x``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling over the last two (spatial) dimensions.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits; the model is then moved to the selected device.
model = Task5Model(num_classes=31)
model = model.to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(params=model.parameters(), lr=0.001, amsgrad=True)

# Lowers the learning rate once the monitored value (the validation loss passed
# to scheduler.step) has not improved for `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)

# reduction='none' keeps one loss value per element so the training loop can
# multiply by the label masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
epochs = 100
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked mean of the element-wise loss over all annotation sets.

    ``labels`` and ``masks`` carry one slice per annotation set on their last
    axis. The same ``outputs`` are scored against every slice with the
    notebook-level ``criterion``, masked-out entries contribute nothing, and
    the total is divided by the sum of the masks.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for (x1, y1, m1), (x2, y2, m2) in zip(train_loader_1, train_loader_2):

        # mixup: blend each sample of the first batch with the sample at the
        # same position of the second, independently shuffled, batch
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, x1.shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_x = lam.reshape(-1, 1, 1, 1)  # broadcasts over the 4-D inputs
        lam_y = lam.reshape(-1, 1, 1)     # broadcasts over 3-D labels / masks

        inputs = ((lam_x * x1) + ((1 - lam_x) * x2)).to(device)
        labels = ((lam_y * y1) + ((1 - lam_y) * y2)).to(device)
        masks = ((lam_y * m1) + ((1 - lam_y) * m2)).to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            this_epoch_valid_loss += masked_bce_loss(outputs, labels, masks).item()

    # Average of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping: give up after 25 epochs without a new best.
    if epochs_without_new_lowest >= 25:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6373014385635788 0.49071684905460905
Epoch:  1


0.3345387098756996 0.18887025117874146
Epoch:  2


0.1852054438880972 0.15857602655887604
Epoch:  3


0.1688899381740673 0.1461585270506995
Epoch:  4


0.1635621190071106 0.14490250923803874
Epoch:  5


0.16251366887543653 0.15926831002746308
Epoch:  6


0.16159597441956802 0.14593054141317094
Epoch:  7


0.1593966927077319 0.13643005170992442
Epoch:  8


0.15822428102428848 0.13382041135004588
Epoch:  9


0.1581193055655505 0.13391374264444625
Epoch:  10


0.1562866757850389 0.12960432789155416
Epoch:  11


0.1554195812425098 0.1291420619402613
Epoch:  12


0.1550005517295889 0.13029428784336364
Epoch:  13


0.15329692009333018 0.12717296608856746
Epoch:  14


0.15334669883186752 0.1298155209847859
Epoch:  15


0.15269082502738848 0.13090561756065913
Epoch:  16


0.15368358428413803 0.12801404297351837
Epoch:  17


0.15175593342330004 0.1277843096426555
Epoch:  18


0.15231392592997164 0.12631618550845555
Epoch:  19


0.15197360394774256 0.12750656583479472
Epoch:  20


0.15087344034298047 0.132224118070943
Epoch:  21


0.1520870327949524 0.12921540119818278
Epoch:  22


0.15221361252101692 0.12601355676140105
Epoch:  23


0.151268566379676 0.12612358587128775
Epoch:  24


0.1519476018241934 0.1274547278881073
Epoch:  25


0.1500865853316075 0.12733598904950277
Epoch:  26


0.14990434574114309 0.12728199256317957
Epoch:  27


0.14898047978813583 0.13247881936175482
Epoch:  28


0.14939761806178736 0.12508990509169443
Epoch:  29


0.15012352651840932 0.12747044648442948
Epoch:  30


0.14950944080546097 0.1276578509381839
Epoch:  31


0.1487700141764976 0.12557534128427505
Epoch:  32


0.1479740541529011 0.13188617357185908
Epoch:  33


0.14929396276538437 0.12538460535662516
Epoch:  34


0.1488280461446659 0.12900666573217937
Epoch    34: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  35


0.14607214081931758 0.12245195997612816
Epoch:  36


0.14533415034010605 0.12269200703927449
Epoch:  37


0.14504598524119403 0.12306686277900424
Epoch:  38


0.1449079122897741 0.12310016580990382
Epoch:  39


0.1448539128980121 0.12278772464820317
Epoch:  40


0.14396456286713882 0.12290836125612259
Epoch:  41


0.14289926435496356 0.12237306364945003
Epoch:  42


0.1436841648978156 0.12223181767123086
Epoch:  43


0.14392991525095863 0.12250764561550957
Epoch:  44


0.14221428737447067 0.12178916696991239
Epoch:  45


0.1423653936063921 0.12248554932219642
Epoch:  46


0.14314538441799782 0.12209035988364901
Epoch:  47


0.1440331005566829 0.12201612549168724
Epoch:  48


0.14281830473526105 0.1217179000377655
Epoch:  49


0.14247493647240303 0.12294306925364903
Epoch:  50


0.14353531317130938 0.12213381060532161
Epoch:  51


0.14243994331037677 0.12240912339517049
Epoch:  52


0.14401772175286268 0.12249046670539039
Epoch:  53


0.14240630736222137 0.12234181059258324
Epoch:  54


0.14278192818164825 0.12201846923146929
Epoch    54: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  55


0.14204339844149513 0.12221408528941018
Epoch:  56


0.14211979105665878 0.12216420152357646
Epoch:  57


0.14172210685304693 0.1221958217876298
Epoch:  58


0.14198762822795558 0.1220286392739841
Epoch:  59


0.14146534212537715 0.12197998485394887
Epoch:  60


0.14313187953588125 0.12209759439740862
Epoch    60: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  61


0.14112947518761093 0.12227157822677068
Epoch:  62


0.14210208646349004 0.12205055994646889
Epoch:  63


0.1417858069007461 0.12208271239485059
Epoch:  64


0.14116593027437055 0.1220698601433209
Epoch:  65


0.1418781671169642 0.12217356158154351
Epoch:  66


0.14286506498182142 0.12196181608097893
Epoch    66: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  67


0.1414867041481508 0.12185877242258616
Epoch:  68


0.14112787552782008 0.12234737298318318
Epoch:  69


0.14123592183396622 0.122178815305233
Epoch:  70


0.14154454783813372 0.12186811970812934
Epoch:  71


0.14181807878855113 0.12193276413849422
Epoch:  72


0.1408909676042763 0.12203981514487948
Epoch    72: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  73
