In [1]:
# Parameters
# Index into `mv2.features` used by Task5Model: child modules whose index is
# greater than `until_x` are re-initialised with `weight_reset`, the others keep
# the weights they were loaded with. The value is also embedded in the name of
# the checkpoint file written by the training loop.
until_x = 3


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 n_annotators=3, n_frames=635):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column (i.e. the index is named ``audio_filename``).
        Every file needs at least ``n_annotators`` rows, and the rows of each
        annotation set must cover the same files, otherwise the per-set slices
        cannot be stacked.
    unknown_to_known : dict
        Maps the name of an "unknown" column to the list of "known" column
        names whose targets are masked out (mask = 0) on rows where the unknown
        column is greater than 0.5. The unknown columns themselves are removed
        from the targets.
    data_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_annotators : int, optional
        Number of annotation sets (rows numbered 1..n_annotators per file)
        stacked along the last axis of ``y`` and ``y_mask``.
    n_frames : int, optional
        Maximum number of rows kept from each transposed array.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, frames, features)`` where each stored array is
        transposed and cropped to its first ``n_frames`` rows.
    y : numpy.ndarray
        Shape ``(n_files, n_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target counts towards the loss,
        0 where it is masked out.

    Files are ordered by ``audio_filename`` (sorted index) in all three arrays.
    """
    unknown_columns = list(unknown_to_known.keys())
    annotator_ids = range(1, n_annotators + 1)

    # Number the annotation rows of every file 1, 2, 3, ... in order of appearance.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns from the actual targets.
    df_unknown = df.loc[:, unknown_columns]
    df = df.drop(columns=unknown_columns)

    # Start from an all-ones mask and zero the known labels that an
    # "unknown" annotation makes ambiguous.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so that each annotation set is one
    # contiguous, filename-sorted slice.
    df = df.swaplevel(i=1, j=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0).sort_index()

    y = np.concatenate(
        [df.loc[[k], :].values[..., np.newaxis] for k in annotator_ids],
        axis=2)
    y_mask = np.concatenate(
        [y_mask.loc[[k], :].values[..., np.newaxis] for k in annotator_ids],
        axis=2)

    # File order of the first annotation set defines the order of X.
    filenames = df.loc[[1], :].reset_index(level=1)['audio_filename'].tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :],
            axis=0)
        for name in filenames])
    X = np.expand_dims(X, axis=1)  # add the singleton channel axis

    return X, y, y_mask


# Shared RandomErasing instance (project-local module, default arguments).
# AudioDataset.__getitem__ calls it on every sample that goes through the
# augmentation branch.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of input tensors with their targets and loss weights.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis; each item is expected to be a
        3-D tensor (the augmentation branch slices it as ``[:, i:, :]``).
    y : torch.Tensor
        Targets, indexed along the first axis.
    weights : torch.Tensor
        Per-target loss weights / masks, indexed along the first axis.
    transform : callable or None
        Augmentation called as ``transform(image=array)`` and expected to
        return a mapping with an ``'image'`` entry. When falsy, samples are
        returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weights)`` for index ``idx``.

        With a transform configured the sample is min-max scaled, cyclically
        shifted by a random offset along axis 1, augmented, passed through the
        module-level ``random_erasing`` and finally scaled back to its
        original value range.
        """
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has a zero range; dividing by it would turn the
            # whole sample into NaN. Scale by 1 instead, so the sample becomes
            # all zeros and is mapped back to `this_min` at the end.
            scale = value_range if value_range > 0 else 1.0
            sample = (sample - this_min) / scale

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on NumPy images)
            # NOTE(review): the ToPILImage round trip presumably quantises the
            # scaled values to 8 bit - confirm this is intended.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the leading channel axis and swap the two trailing axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load the pickled metadata bundle and derive the label bookkeeping.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Taxonomy rows are split on fine_id: 'X' marks the "unknown" fine labels.
taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# For every unknown fine label, list the known fine labels that share its
# coarse class (the merge suffixes the two `fine` columns with _x / _y).
unknown_known_pairs = unknown_rows.merge(known_rows, on='coarse', how='inner')
unknown_to_known = (
    unknown_known_pairs
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict()
)
known_labels = known_rows['fine'].tolist()

# Coarse and fine annotations side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# Manual correction for one data point: every row whose values sum to 37 is
# reset to all zeros, in both splits.
for _frame in (train_df, valid_df):
    _rows_to_clear = _frame.sum(axis=1) == 37
    _frame.loc[_rows_to_clear, :] = 0

In [6]:
# Build inputs, targets and loss masks for both splits.
# X has a singleton channel axis at position 1; y and y_mask carry one slice
# per annotation set along their last axis (see prepare_data).
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel-wise standardisation: statistics are taken per position of the last
# axis (size 128) over the training split only, then applied to both splits.
# NOTE: train_X / valid_X are overwritten, so re-running this cell normalises
# the already-normalised data again.
_train_rows = train_X.reshape(-1, 128)
channel_means = _train_rows.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = _train_rows.std(axis=0).reshape(1, 1, 1, -1)
train_X, valid_X = [
    (_split - channel_means) / channel_stds
    for _split in (train_X, valid_X)
]

In [8]:
# Define the data augmentation transformations
# The pipeline is handed to the training AudioDataset only (the validation
# dataset is created with transform=None). AudioDataset calls it as
# transform(image=...) and reads the 'image' entry of the result.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Wrap the arrays in datasets: the training split is augmented, the
# validation split is served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

# Two independently shuffled loaders over the same training data: the training
# loop zips them to obtain the pairs of batches that mixup blends.
val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is actually usable and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise `layer` in place when it offers `reset_parameters()`.

    Objects without that method are left untouched, which makes the function
    usable as the callback of `Module.apply`.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Single-channel input -> 3-channel adapter -> MobileNetV2 features -> MLP head.

    The later MobileNetV2 feature blocks (index greater than the module-level
    `until_x`) are re-initialised with `weight_reset`; earlier blocks keep the
    weights loaded by `mobilenet_v2(pretrained=True)`.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1x1 convolutions that expand the single input channel to the three
        # channels fed into the MobileNetV2 feature extractor.
        adapter_layers = [
            nn.BatchNorm2d(num_features=1),
            nn.Conv2d(in_channels=1, out_channels=10, kernel_size=1, padding=0),
            nn.ReLU(),
            nn.Conv2d(in_channels=10, out_channels=3, kernel_size=1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*adapter_layers)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classification head applied to the 1280 pooled feature channels.
        head_layers = [
            nn.Linear(in_features=1280, out_features=512),
            nn.ReLU(),
            nn.BatchNorm1d(num_features=512),
            nn.Linear(in_features=512, out_features=num_classes),
        ]
        self.final = nn.Sequential(*head_layers)

        # Reset every feature block that comes after index `until_x`.
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch of single-channel inputs."""
        three_channel = self.bw2col(x)
        feature_maps = self.mv2.features(three_channel)
        # Global max pooling: reduce the last axis, then the (new) last axis.
        pooled, _ = feature_maps.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 is the number of output logits; it has to equal the number of label
# columns in train_y / valid_y, because the training loop compares `outputs`
# with labels[:, :, k] element-wise.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The training loop feeds the epoch's validation loss to scheduler.step().
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the masks before summing and normalising.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
# Training: mixup between two shuffled passes over the training data, masked
# BCE loss over all annotation sets, checkpoint on the best validation loss,
# early stopping, and learning-rate reduction on plateau.
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
alpha = 1  # mixup coefficients are drawn from Beta(alpha, alpha)
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss averaged over all annotation sets.

    `labels` and `masks` hold one slice per annotation set along their last
    axis; each slice is scored against the same `outputs`. The element-wise
    losses are weighted by the masks, summed, and divided by the mask total.
    """
    weighted_sum = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return weighted_sum / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training phase ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # one blending coefficient per sample, shared by inputs, labels, masks
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation phase (no gradients, no optimizer interaction) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # average of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # keep only the checkpoint with the lowest validation loss
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # log before the early-stopping check so the last epoch is reported too
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6079860208807765 0.3974588853972299
Epoch:  1


0.28049931453691945 0.1787582572017397
Epoch:  2


0.18540103089165044 0.26914979943207334
Epoch:  3


0.17606590647955198 0.16395006435258047
Epoch:  4


0.17161610279534315 0.15469904669693538
Epoch:  5


0.16922556024950905 0.1590760690825326
Epoch:  6


0.16935511215313062 0.16840537105287826
Epoch:  7


0.16604404836087613 0.17036013730934688
Epoch:  8


0.1651944417405773 0.15440184942313603
Epoch:  9


0.16500746076171463 0.15016974721636092
Epoch:  10


0.16459413033884926 0.15323335570948465
Epoch:  11


0.1642231308930629 0.14347317389079503
Epoch:  12


0.163059936181919 0.13935453976903642
Epoch:  13


0.16039632341346224 0.14283620885440282
Epoch:  14


0.1603105865620278 0.14172843098640442
Epoch:  15


0.15918689482920878 0.13991499905075347
Epoch:  16


0.16050053488563848 0.1411431027310235
Epoch:  17


0.15913513745810534 0.1356180885008403
Epoch:  18


0.1598504999199429 0.1357439745749746
Epoch:  19


0.1588918014958098 0.1363957513655935
Epoch:  20


0.15958258590182742 0.13520828102316176
Epoch:  21


0.1600286118887566 0.1341691900576864
Epoch:  22


0.15721229806139664 0.13439455521958216
Epoch:  23


0.15721348008593997 0.13717664033174515
Epoch:  24


0.1572700634195998 0.13444960755961283
Epoch:  25


0.1557754757436546 0.13194515236786433
Epoch:  26


0.15837550566003128 0.13411745322602137
Epoch:  27


0.15573151409626007 0.13207098096609116
Epoch:  28


0.1550535406615283 0.1318163275718689
Epoch:  29


0.15475958263551867 0.13070221990346909
Epoch:  30


0.15527450112072197 0.13381705007382802
Epoch:  31


0.15656674472061363 0.1302682427423341
Epoch:  32


0.1551254540681839 0.1316832293357168
Epoch:  33


0.15417795648445953 0.13584432112319128
Epoch:  34


0.1537849971571484 0.13186708412000112
Epoch:  35


0.15383795830043587 0.12921244012457983
Epoch:  36


0.15387385600322 0.13174596215997422
Epoch:  37


0.15362820673633265 0.13045983016490936
Epoch:  38


0.153841560756838 0.12970068731478282
Epoch:  39


0.1522091554628836 0.1289467971239771
Epoch:  40


0.15222554633746277 0.12899237658296311
Epoch:  41


0.15366514510399587 0.12708613808665956
Epoch:  42


0.151932254836366 0.13297976979187556
Epoch:  43


0.15232300838908633 0.1277202559368951
Epoch:  44


0.15257768171864586 0.12895449783120835
Epoch:  45


0.15188860007234523 0.12819443749529974
Epoch:  46


0.1505741754899154 0.1336866712995938
Epoch:  47


0.1504493762512465 0.13037078508308955
Epoch    47: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  48


0.1500622211275874 0.12541378076587403
Epoch:  49


0.148262708573728 0.12515027821063995
Epoch:  50


0.148613537082801 0.12469853780099324
Epoch:  51


0.14878128146803057 0.1250289242182459
Epoch:  52


0.14809635281562805 0.12486822477408818
Epoch:  53


0.14834328438784625 0.12473150768450328
Epoch:  54


0.1484020775234377 0.12461886022772108
Epoch:  55


0.1472369037769936 0.12394674973828453
Epoch:  56


0.1484771181602736 0.12508037367037364
Epoch:  57


0.1473886153182468 0.12555500864982605
Epoch:  58


0.1456558253314044 0.12447207527501243
Epoch:  59


0.14810485614312663 0.1248091853090695
Epoch:  60


0.1489119324329737 0.12471507702554975
Epoch:  61


0.14751775441942988 0.12431413893188749
Epoch    61: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  62


0.14773803224434723 0.12417290253298623
Epoch:  63


0.14954754871291084 0.1244748905301094
Epoch:  64


0.14733224221178004 0.12450018099376134
Epoch:  65


0.1473437624203192 0.12427402181284768
Epoch:  66


0.14829480809134407 0.12457917098488126
Epoch:  67


0.14779967270992897 0.1243463424699647
Epoch    67: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  68


0.14864820644662186 0.12435500004461833
Epoch:  69


0.14679276540472702 0.12435418580259595
Epoch:  70


0.14767468338077133 0.124551922082901
Epoch:  71


0.14812258288667007 0.12432365970952171
Epoch:  72


0.14679045572474198 0.1242375448346138
Epoch:  73


0.14766614622360952 0.1244243276970727
Epoch    73: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  74


0.14714063744287234 0.12416813522577286
Epoch:  75


0.14712558686733246 0.12444456347397395
Epoch:  76


0.1480220886501106 0.12430876067706517
Epoch:  77


0.1471021364669542 0.12451744611774172
Epoch:  78


0.14861342633092725 0.12430470968995776
Epoch:  79


0.14721560921217944 0.1246108774627958
Epoch    79: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  80
