In [1]:
# Parameters
# Index threshold into the MobileNetV2 feature blocks: Task5Model re-initialises
# every block whose position in `mv2.features` is greater than this value.
# The value is also embedded in the checkpoint filename written during training.
until_x = 17


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 data_dir='../../data/logmelspec', n_annotations=3, n_frames=635):
    """Turn an annotation table into model-ready numpy arrays.

    Rows are numbered 1, 2, 3, ... within each ``audio_filename`` in their
    original order, and the rows numbered ``1..n_annotations`` are stacked
    along the last axis of the label and mask arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table. After ``reset_index()`` it must expose an
        ``audio_filename`` column; all other columns are treated as labels,
        including the columns named by the keys of ``unknown_to_known``.
    unknown_to_known : dict
        Maps an "unknown" label column to the list of known label columns it
        invalidates. Wherever the unknown column is ``> 0.5`` the mask of those
        known columns is set to 0. The unknown columns themselves are removed
        from the returned labels.
    data_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_annotations : int, optional
        Number of per-recording rows to stack. A recording with fewer rows
        makes the stacking fail; rows beyond this count are ignored.
    n_frames : int, optional
        Number of leading rows kept from each transposed feature array.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_recordings, 1, n_frames, n_bins)`` where ``n_bins`` is the
        first-axis length of the stored arrays.
    y : numpy.ndarray
        Shape ``(n_recordings, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label is usable, 0 where it is masked.
    """
    df = df.reset_index()
    # Running 1-based row number inside each recording
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns away from the labels to predict
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero out the known labels that an
    # "unknown" annotation makes unreliable
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the row number first so one `.loc[[k]]` selects the k-th row of
    # every recording, with recordings in sorted filename order
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slnos = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in slnos], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in slnos], axis=2)

    # Load the features in the same recording order as the label rows
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level instance of the project-local RandomErasing class, built with
# its default arguments; AudioDataset.__getitem__ applies it to every
# augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (features, labels, weights) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Features; the first axis indexes samples. Each ``X[idx]`` is expected
        to be 3-D because augmentation concatenates along its axis 1.
    y : torch.Tensor
        Labels, indexed along the first axis like ``X``.
    weights : torch.Tensor
        Per-label weights/masks, indexed along the first axis like ``X``.
    transform : callable or None
        Called as ``transform(image=array)`` and expected to return a mapping
        with an ``'image'`` entry. When falsy, samples are returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converter used to hand the sample to the image-based transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (length of the first axis of X)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample if a transform is set."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole sample into NaN, so treat the range as 1 instead.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the leading axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation with the same scale used above
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only open metadata files you trust.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into "unknown" entries (fine_id == 'X') and known entries
taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# Map every unknown fine label to the known fine labels sharing its coarse label.
# The merge suffixes the two `fine` columns as fine_x (unknown) / fine_y (known).
unknown_to_known = (
    unknown_rows.merge(known_rows, on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_rows.fine.tolist()

# Coarse and fine annotations side by side, one frame per split
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point:
# any row whose label values add up to exactly 37 is zeroed out entirely
train_df.loc[train_df.sum(axis=1).eq(37), :] = 0
valid_df.loc[valid_df.sum(axis=1).eq(37), :] = 0

In [6]:
# Build the (features, labels, label masks) arrays for each split; the same
# unknown -> known label mapping drives the masking in both.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# One mean/std per position on the last axis, estimated on the training set
# only and then applied to both splits. The channel count is read from the
# data instead of being hard-coded.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
# NOTE: this cell overwrites train_X / valid_X; re-running it without
# re-running the preparation cell normalises already-normalised data.
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# AudioDataset calls this pipeline as `transform(image=...)` on each training
# sample and reads the `'image'` entry of the result.
albumentations_transform = Compose([
    # random shift / scale / rotation, each bounded by the given limit
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library's default settings
    GridDistortion(),
    # final conversion step; the dataset treats the output as a torch tensor
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the same training set: the training
# loop zips them to obtain the pairs of batches that mixup blends.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook still runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` if it exposes a ``reset_parameters`` method.

    Objects without that attribute are left untouched. Intended to be passed
    to ``Module.apply`` so it visits every sub-module.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel input.

    The input passes through a small 1x1-convolution stem that expands it to
    3 channels, then through the MobileNetV2 feature blocks, a global maximum
    over the last two axes, and a two-layer head producing ``num_classes``
    logits. Feature blocks positioned after the global ``until_x`` index are
    re-initialised instead of keeping their pretrained weights.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(num_features=1),
            nn.Conv2d(in_channels=1, out_channels=10, kernel_size=1, padding=0),
            nn.ReLU(),
            nn.Conv2d(in_channels=10, out_channels=3, kernel_size=1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Head operating on the 1280 pooled feature channels
        self.final = nn.Sequential(
            nn.Linear(in_features=1280, out_features=512),
            nn.ReLU(),
            nn.BatchNorm1d(num_features=512),
            nn.Linear(in_features=512, out_features=num_classes),
        )

        # Reset after ith layer of mv2: blocks up to and including `until_x`
        # keep their pretrained weights, later ones start from scratch
        for block_idx, block in enumerate(self.mv2.features.children()):
            if block_idx <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch ``x``."""
        features = self.mv2.features(self.bw2col(x))
        # Global max pooling: reduce the last axis, then the new last axis
        pooled, _ = features.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 output logits per sample; parameters are placed on the selected device
model = Task5Model(num_classes=31)
model = model.to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch and lowers
# the learning rate after 5 epochs without improvement
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    patience=5,
    verbose=True,
)
# Element-wise loss (no reduction) so it can be weighted by the label masks
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss before stopping
mixup_alpha = 1  # both parameters of the Beta distribution the mixup weights are drawn from
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked loss summed over every annotation set and normalised by the mask total.

    ``outputs`` holds one logit per (sample, label); ``labels`` and ``masks``
    carry an extra trailing axis with one slice per annotation set. The
    element-wise ``criterion`` is evaluated against each slice, weighted by the
    matching mask slice, summed, and divided by ``masks.sum()``.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # one blending weight per sample, shared by inputs, labels and masks
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
            this_epoch_train_loss += loss.detach().cpu().numpy()

    # ---- validation pass (no gradients, no augmentation, no mixup) ----
    model.eval()
    this_epoch_valid_loss = 0
    for inputs, labels, masks in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        with torch.set_grad_enabled(False):
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.detach().cpu().numpy()

    # Epoch losses are means of the per-batch losses, so a smaller final batch
    # counts as much as a full one.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch's losses are
    # printed too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the best validation loss so far
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6076412144544963 0.4714796713420323
Epoch:  1


0.2743238146240647 0.16382795998028346
Epoch:  2


0.16795018436135473 0.14430895554167883
Epoch:  3


0.1590895237954887 0.13384577738387243
Epoch:  4


0.1560718461468413 0.13060480994837625
Epoch:  5


0.15563282008106644 0.1308929036770548
Epoch:  6


0.15401438200795972 0.13645166052239283
Epoch:  7


0.15438606006068153 0.1319257840514183
Epoch:  8


0.15256294769209786 0.12763578018971852
Epoch:  9


0.15136760151064074 0.1306618175336293
Epoch:  10


0.15032990236540097 0.14062738418579102
Epoch:  11


0.15101668923287778 0.1323635961328234
Epoch:  12


0.15057984155577583 0.12780460183109557
Epoch:  13


0.15112238800203479 0.12862231050218856
Epoch:  14


0.14919673711866946 0.1287700640303748
Epoch    14: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  15


0.14742341234877304 0.12124109374625343
Epoch:  16


0.14583343549354658 0.1213820076414517
Epoch:  17


0.1462720503678193 0.12116619944572449
Epoch:  18


0.14549539097257563 0.1210342411484037
Epoch:  19


0.1458371438690134 0.12095180473157338
Epoch:  20


0.1459024713651554 0.12043645871537072
Epoch:  21


0.14487395214067922 0.12099742037909371
Epoch:  22


0.1453270710803367 0.12089078234774726
Epoch:  23


0.14499822097855644 0.12078793666192464
Epoch:  24


0.1447691905337411 0.12130483133452279
Epoch:  25


0.14451283297023257 0.12063233980110713
Epoch:  26


0.14447693446198026 0.12080545510564532
Epoch    26: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  27


0.14384125052271662 0.12073715882641929
Epoch:  28


0.14407974260078893 0.12055812350341252
Epoch:  29


0.14354932912298152 0.12035950273275375
Epoch:  30


0.14365319746571617 0.1205685990197318
Epoch:  31


0.1429650702992001 0.12046928703784943
Epoch:  32


0.14495872726311554 0.12049266908849988
Epoch:  33


0.14422626108736605 0.12020385478224073
Epoch:  34


0.14380934230379155 0.12024505862167903
Epoch:  35


0.14399178326129913 0.12027797422238759
Epoch:  36


0.14323630002704826 0.12039563485554286
Epoch:  37


0.1436770904708553 0.12024274681295667
Epoch:  38


0.14318670735165878 0.12039638949292046
Epoch:  39


0.1440849066586108 0.12009573834283012
Epoch:  40


0.143022451851819 0.12019202751772744
Epoch:  41


0.14386409198915637 0.12029097761426653
Epoch:  42


0.14339152582593867 0.12033652833529881
Epoch:  43


0.14449795234847712 0.12012830270188195
Epoch:  44


0.14389896070634997 0.12043600955179759
Epoch:  45


0.14375794054688634 0.12037848255464009
Epoch    45: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  46


0.14347122891529188 0.12064650335482188
Epoch:  47


0.1433878238942172 0.12020100333860942
Epoch:  48


0.14344371311567924 0.12042026541062764
Epoch:  49


0.14429314998356071 0.12047842783587319
Epoch:  50


0.14345460083033587 0.12050221860408783
Epoch:  51


0.1433354821559545 0.12018072711569923
Epoch    51: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  52


0.14265267067664378 0.12039869917290551
Epoch:  53


0.14264861031158552 0.12040778036628451
Epoch:  54


0.14273445489438805 0.12021841215235847
Epoch:  55


0.14274235432212418 0.12017717425312315
Epoch:  56


0.14287525938974843 0.12016954592296056
Epoch:  57


0.14372498280293233 0.12048519083431788
Epoch    57: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  58


0.14290540927165263 0.12029906575168882
Epoch:  59


0.14330260455608368 0.12030636519193649
Epoch:  60


0.1433340512417458 0.12027077802589961
Epoch:  61


0.1438276795922099 0.12044585921934672
Epoch:  62


0.1437943458959863 0.12033182382583618
Epoch:  63


0.14317145258993716 0.12051046426807131
Epoch:  64
