In [1]:
# Parameters
# Cut-off index into `mv2.features` used by Task5Model: child modules at
# positions greater than `until_x` have their weights re-initialised, the
# others keep the weights loaded by `mobilenet_v2(pretrained=True)`.
# The value is also embedded in the checkpoint file name written by the
# training loop.
until_x = 5


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec',
                 n_frames=635,
                 n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (clip, annotation). After ``reset_index()`` the frame must
        expose an ``audio_filename`` column (i.e. the file name is normally the
        index). All remaining columns are label columns. The caller's frame is
        not modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be ignored whenever that unknown label is set (> 0.5).
        The unknown columns themselves are removed from the targets.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per clip.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotations : int, optional
        Number of annotations per clip stacked on the last axis of ``y`` and
        ``y_mask``. Every clip must provide annotations ``1..n_annotations``.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_clips, 1, frames, bins)``; each clip is the transposed
        stored array cropped to ``n_frames`` rows.
    y : numpy.ndarray
        Shape ``(n_clips, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label takes part in the loss, 0 where
        it is masked out because a related unknown label was annotated.

    Clips are ordered by sorted ``audio_filename`` in all three outputs.
    """
    df = df.reset_index()
    # Number the annotations of every clip 1, 2, 3, ... in order of appearance.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and switch off the known labels that become
    # ambiguous when the corresponding unknown label was chosen.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that `.loc[[k]]` selects the k-th
    # annotation of every clip, with clips in sorted file-name order.
    df = df.swaplevel(i=1, j=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in slots], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in slots], axis=2)

    # File order must match the row order of the first annotation slot.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    # Add the singleton channel axis expected by the convolutional model.
    X = X[:, np.newaxis]

    return X, y, y_mask


# Single RandomErasing instance (constructed with its default arguments) that
# AudioDataset.__getitem__ applies as the last augmentation step.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with their targets and per-label weights.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis; each item is expected to be a
        ``(1, frames, bins)`` tensor (the augmentation path rolls along axis 1
        and converts the item to a single-channel image).
    y : torch.Tensor
        Targets, indexed along the first axis in step with ``X``.
    weights : torch.Tensor
        Per-item weights returned untouched alongside the targets (the
        notebook passes the loss masks here).
    transform : callable, optional
        Albumentations-style callable invoked as ``transform(image=array)``
        and returning a dict whose ``'image'`` entry is a tensor. When falsy,
        items are returned without any augmentation.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Used to turn the scaled tensor into an image for albumentations.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of items (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weights)`` for ``idx``.

        With a transform configured the sample is min-max scaled, rolled by a
        random offset along axis 1, passed through the transform and the
        module-level ``random_erasing``, and finally scaled back to its
        original value range.
        """
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole item into NaN. Scaling by 1 keeps such an item unchanged
            # after the round trip and leaves every other sample untouched.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # The transform output has its two axes swapped relative to the
            # input; restore the leading channel axis and the original order.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load executes arbitrary code embedded in the file; only
# use it on a metadata.pkl that was produced by a trusted pipeline.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map every "unknown" fine label (fine_id == 'X') to the list of known fine
# labels (fine_id != 'X') that share its coarse category. The self-merge on
# `coarse` yields the suffixed columns `fine_x` (unknown) and `fine_y` (known).
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# Names of all fine labels that are not of the 'X' (unknown) kind.
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Put the coarse and fine annotation columns side by side for each split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Every row whose label columns sum to exactly 37 has all of its labels set
# to 0, in both splits.
# NOTE(review): 37 presumably equals the total number of label columns, i.e.
# the row has every class switched on — confirm against the taxonomy.
train_df.loc[(train_df.sum(axis=1) == 37).copy(), :] = 0
valid_df.loc[(valid_df.sum(axis=1) == 37).copy(), :] = 0

In [6]:
# Build the arrays for both splits. prepare_data returns, per split:
#   X      - (n_clips, 1, frames, bins) spectrograms loaded from disk
#   y      - (n_clips, n_labels, 3) targets, one slice per annotation
#   y_mask - same shape as y; 0 where a label is excluded from the loss
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# The statistics are computed per entry of the last axis, over the training
# set only, and then applied to both splits. The channel count is read from
# the data instead of being hard-coded, so other spectrogram sizes also work.
channel_means = train_X.reshape(-1, train_X.shape[-1]).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, train_X.shape[-1]).std(axis=0).reshape(1, 1, 1, -1)
# A channel that is constant over the training set has zero spread; divide by
# 1 instead so that it maps to 0 rather than inf/NaN.
channel_stds = np.where(channel_stds == 0, 1.0, channel_stds)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# This pipeline is handed to the training AudioDataset only (the validation
# dataset is built with transform=None). AudioDataset calls it as
# `transform(image=...)` and reads the tensor from the returned 'image' entry.
albumentations_transform = Compose([
    # Small random shift / rescale / rotation of the spectrogram image.
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # Random grid-based warping with the library defaults.
    GridDistortion(),
    # Convert the augmented image into a tensor.
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation items are returned as is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the same training data: the training
# loop zips them to obtain the pairs of batches that are blended by mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when PyTorch can see one and fall back to the CPU
# otherwise, so the notebook also runs on machines without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` through its ``reset_parameters`` method.

    Objects that do not expose ``reset_parameters`` are left untouched, which
    makes the function safe to use with ``Module.apply`` on a whole sub-tree.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        # Nothing to reset on this object.
        return
    reset()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    The network consists of three stages:

    * ``bw2col``: batch norm plus two 1x1 convolutions that map the single
      input channel to the three channels fed to MobileNetV2;
    * ``mv2``: a pretrained torchvision MobileNetV2, of which only the
      ``features`` part is used in ``forward``;
    * ``final``: a two-layer classifier producing ``num_classes`` logits.

    Child modules of ``mv2.features`` located at positions greater than the
    notebook-level ``until_x`` are re-initialised with ``weight_reset``.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channel adapter made of pointwise convolutions.
        channel_adapter = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*channel_adapter)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classifier applied to the 1280 pooled feature channels.
        classifier_head = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*classifier_head)

        # Discard the pretrained weights of every feature block that sits
        # after position `until_x`.
        for position, block in enumerate(self.mv2.features.children()):
            if position > until_x:
                block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for the batch ``x``."""
        feature_maps = self.mv2.features(self.bw2col(x))
        # Global max pooling: reduce the last axis, then the new last axis.
        pooled_last, _ = feature_maps.max(dim=-1)
        pooled, _ = pooled_last.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# One output logit per label column of the targets. The loss compares the
# model output with `labels[:, :, k]`, so the output width has to equal
# `train_y.shape[1]`; deriving it from the data keeps the two in sync.
model = Task5Model(train_y.shape[1]).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss at the end of each epoch
# and announces every learning-rate reduction (verbose=True).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop
# can multiply them with the label masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
# Stop once this many consecutive epochs brought no new best validation loss.
early_stopping_patience = 25
# Parameter of the Beta(alpha, alpha) distribution the mixup weights are drawn from.
alpha = 1
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def _masked_bce_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss averaged over all annotation slots.

    ``outputs`` holds the logits with shape (batch, n_labels); ``labels`` and
    ``masks`` have shape (batch, n_labels, n_annotations). The element-wise
    loss of every annotation slot is multiplied by its mask, everything is
    summed, and the total is divided by the sum of the masks.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass -------------------------------------------------
    model.train()
    this_epoch_train_loss = 0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):
        inputs_a, labels_a, masks_a = batch_a
        inputs_b, labels_b, masks_b = batch_b

        # mixup: blend the two batches item by item with random weights;
        # inputs, labels and masks all use the same weight per item.
        mixup_vals = np.random.beta(alpha, alpha, inputs_a.shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * inputs_a) + ((1 - lam) * inputs_b)

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * labels_a) + ((1 - lam) * labels_b)
        masks = (lam * masks_a) + ((1 - lam) * masks_b)

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = _masked_bce_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass -----------------------------------------------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Keep only the weights of the best epoch seen so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping; note that the epoch that triggers it is not printed.
    if epochs_without_new_lowest >= early_stopping_patience:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6102917178257091 0.46238259758268085
Epoch:  1


0.2809746970196028 0.18349517456122807
Epoch:  2


0.18132363179245511 0.169491069657462
Epoch:  3


0.172931180612461 0.15612810211522238
Epoch:  4


0.17029246808709325 0.16596191482884542
Epoch:  5


0.16899711457458702 0.15465885400772095
Epoch:  6


0.16612198183665405 0.15300078051430838
Epoch:  7


0.16433322107469714 0.1460318054471697
Epoch:  8


0.16361392671997482 0.14018865142549788
Epoch:  9


0.16140185497902534 0.13782386694635665
Epoch:  10


0.16171302787355474 0.14178315656525747
Epoch:  11


0.16012333091851827 0.1334767628993307
Epoch:  12


0.15916565785536896 0.13705982800040925
Epoch:  13


0.15921280955946124 0.1331633808357375
Epoch:  14


0.15818354326325493 0.14359510902847564
Epoch:  15


0.15644951446636304 0.13528573513031006
Epoch:  16


0.15709804683118253 0.13367121453796113
Epoch:  17


0.15618518880895665 0.13177256392581121
Epoch:  18


0.15601958858000264 0.14319401340825216
Epoch:  19


0.15650031010846835 0.1344446571809905
Epoch:  20


0.15581773181219358 0.1315493243081229
Epoch:  21


0.1535455793947787 0.13024540777717317
Epoch:  22


0.1557973619248416 0.13599575417382376
Epoch:  23


0.15311195479856954 0.13210334309509822
Epoch:  24


0.15399368187865695 0.12832312818084443
Epoch:  25


0.15350271600323753 0.13239140489271709
Epoch:  26


0.1538075107980419 0.12976512951510294
Epoch:  27


0.15332101527098063 0.12995370371001108
Epoch:  28


0.1526705256185016 0.12688638802085603
Epoch:  29


0.15431396945102796 0.12826669641903468
Epoch:  30


0.15228888070261157 0.12888020277023315
Epoch:  31


0.15211702399962657 0.1301894560456276
Epoch:  32


0.15177731215953827 0.12927016402993882
Epoch:  33


0.15233740170259732 0.13109092627252852
Epoch:  34


0.15223570127744931 0.13005255162715912
Epoch    34: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  35


0.15060125170527278 0.1252517370241029
Epoch:  36


0.14897668160296776 0.12498084455728531
Epoch:  37


0.14832173529508952 0.12492408071245466
Epoch:  38


0.14889863777805018 0.12450192549398967
Epoch:  39


0.14911597403320107 0.12452575777258192
Epoch:  40


0.14950019362810496 0.12538915872573853
Epoch:  41


0.1495020619115314 0.12442437665803092
Epoch:  42


0.14764446060399752 0.12475552622761045
Epoch:  43


0.14874837487130552 0.12451933537210737
Epoch:  44


0.1483441482524614 0.12409751755850655
Epoch:  45


0.14891903787045865 0.12467593061072486
Epoch:  46


0.14909046566164172 0.124485616173063
Epoch:  47


0.14844612213405403 0.12432009833199638
Epoch:  48


0.14857678598648794 0.12516444389309203
Epoch:  49


0.1478618608938681 0.12495561476264681
Epoch:  50


0.14666966610663645 0.12541874817439488
Epoch    50: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  51


0.14792321864012126 0.12420928478240967
Epoch:  52


0.14772468563672658 0.12424498796463013
Epoch:  53


0.1482811207706864 0.12477521917649678
Epoch:  54


0.14789444610879227 0.1244155541062355
Epoch:  55


0.14782095841459325 0.12442262790032796
Epoch:  56


0.14686629981608004 0.12467477257762637
Epoch    56: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  57


0.14778655525800344 0.125335870044572
Epoch:  58


0.14848605080230817 0.12463947066238948
Epoch:  59


0.14772912658549645 0.12420824276549476
Epoch:  60


0.14712679668052778 0.12465868783848626
Epoch:  61


0.14769890380872264 0.12452389406306404
Epoch:  62


0.14834833950609774 0.12476567817585808
Epoch    62: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  63


0.14728297414006414 0.12463632332427162
Epoch:  64


0.14838629316639257 0.12436646116631371
Epoch:  65


0.14701981157869906 0.1243826225399971
Epoch:  66


0.14834429162579613 0.12465033254453114
Epoch:  67


0.14734622473652298 0.1243817103760583
Epoch:  68


0.1469493903018333 0.12463925565992083
Epoch    68: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  69
