In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec',
                 n_frames=635,
                 n_annotations=3):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. ``reset_index()`` must yield an
        ``audio_filename`` column (i.e. the index is named
        ``audio_filename``). Every file is expected to contribute at least
        ``n_annotations`` rows; only the first ``n_annotations`` (in row
        order) are used.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        it makes unreliable. The unknown columns are removed from the
        targets; wherever an unknown column exceeds 0.5, the mask of the
        associated known columns is set to 0.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` feature file per clip.
    n_frames : int, optional
        Number of leading time frames kept from every (transposed) feature
        array.
    n_annotations : int, optional
        Number of annotation sets stacked along the last axis of ``y`` and
        ``y_mask``.

    Returns
    -------
    X : numpy.ndarray, shape (n_files, 1, n_frames, n_features)
    y : numpy.ndarray, shape (n_files, n_labels, n_annotations)
    y_mask : numpy.ndarray, same shape as ``y`` (1 = use, 0 = ignore)

    Files are ordered by sorted ``audio_filename`` in all three outputs.
    The caller's ``df`` is not modified.
    """
    import os

    df = df.reset_index()
    # Number the annotations of each file 1, 2, 3, ... in their row order.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and switch off the known labels that an
    # active "unknown" label makes ambiguous.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so that selecting one slno yields one
    # row per file, in sorted filename order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in annotation_ids],
                      axis=2)

    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load(os.path.join(logmelspec_dir, '{}.npy'.format(name))).T[:n_frames, :]
        for name in filenames])
    # Add the singleton channel axis expected by the 2-D convolutions.
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (constructed with its default arguments) that
# AudioDataset.__getitem__ applies to samples on the augmentation path.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with per-sample targets and loss weights.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis. The augmentation path assumes
        each sample is a single-channel ``(1, time, freq)`` tensor.
    y : torch.Tensor
        Targets, indexed along the first axis like ``X``.
    weights : torch.Tensor
        Loss masks/weights, indexed along the first axis like ``X``.
    transform : callable, optional
        Albumentations-style callable invoked as ``transform(image=array)``
        and returning a dict with an ``'image'`` tensor. When falsy, samples
        are returned unchanged.
    erasing : callable, optional
        Callable applied to the augmented tensor. When ``None`` the
        notebook-level ``random_erasing`` instance is used.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weights)`` for index ``idx``.

        With a transform set, the sample is min-max scaled, cyclically
        shifted along axis 1 by a random offset, passed through the
        transform and the eraser, and finally mapped back to its original
        value range.
        """
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            this_range = this_max - this_min
            if this_range == 0:
                # A constant sample would otherwise divide by zero and turn
                # the whole sample into NaN; a unit range maps it to zeros
                # and the revert step below restores the constant.
                this_range = torch.ones_like(this_range)
            sample = (sample - this_min) / this_range

            # randomly cycle the file
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the leading channel axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            eraser = self.erasing if self.erasing is not None else random_erasing
            sample = eraser(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * this_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load the pickled metadata bundle and derive the label bookkeeping from it.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only open a metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map each "unknown" fine label (taxonomy rows with fine_id == 'X') to the list
# of known fine labels that share its coarse category:
#     {unknown_fine_label: [known_fine_label, ...]}
# Both sides of the merge carry a 'fine' column, so after merging on 'coarse'
# they appear as fine_x (unknown side) and fine_y (known side).
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# Fine label names of every taxonomy row whose fine_id is not 'X'.
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Place the coarse and fine annotation columns side by side for each split.
# The validation frame is built from the 'coarse_test' / 'fine_test' entries.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# Manual correction: zero out every annotation row whose label columns sum to
# exactly 37 (applied to both the training and the validation frame).
train_df.loc[train_df.sum(axis=1).eq(37), :] = 0
valid_df.loc[valid_df.sum(axis=1).eq(37), :] = 0

In [5]:
# Build the input arrays (X), stacked per-annotation targets (y) and the
# matching loss masks (y_mask) for the training and validation splits.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization: standardise every position along the last axis
# using statistics computed on the training set only, then apply the same
# statistics to the validation set.
# NOTE: this cell overwrites train_X / valid_X, so running it twice would
# normalise already-normalised data; re-run the prepare_data cell first.
n_channels = train_X.shape[-1]  # derived from the data instead of hard-coding 128
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
# A channel with zero variance would otherwise produce inf/NaN on division.
channel_stds = np.where(channel_stds == 0, 1.0, channel_stds)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations applied to training samples
# (AudioDataset calls this pipeline as `transform(image=...)` and reads the
# 'image' entry of the result).
albumentations_transform = Compose([
    # small random shift / scale / rotation
    # NOTE(review): rotate_limit is presumably in degrees — confirm against the
    # installed albumentations version.
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library's default settings
    GridDistortion(),
    # convert the augmented array to a torch tensor
    ToTensor()
])

In [8]:
# Create the datasets: only the training set receives the augmentation
# pipeline, the validation set is served untouched.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

# Create the dataloaders. Two independently shuffled loaders over the same
# training set are iterated in lock-step by the training loop, which blends
# their batches (mixup).
val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used: the first CUDA GPU when one is available,
# otherwise the CPU (so the notebook also runs on machines without a GPU).
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it offers ``reset_parameters()``.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function suitable for use with ``module.apply``.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        return
    reset()

In [11]:
# Default for the notebook parameter `until_x`; the "Parameters" cell that
# follows overrides it. Task5Model reads it to decide how many of the leading
# mobilenet feature blocks get their weights re-initialised.
until_x = None

In [12]:
# Parameters
# Index (inclusive) of the last child of `mv2.features` whose weights
# Task5Model re-initialises instead of keeping the pretrained values.
until_x = 4


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2-based multi-label classifier for single-channel inputs.

    The network consists of three stages:

    * ``bw2col``: batch norm plus two 1x1 convolutions that map the single
      input channel to the three channels consumed by MobileNetV2;
    * ``mv2.features``: the pretrained MobileNetV2 feature extractor, whose
      leading blocks can optionally be re-initialised;
    * ``final``: a two-layer head producing ``num_classes`` logits.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_until : int, optional
        Index (inclusive) of the last child of ``mv2.features`` whose
        parameters are re-initialised. When omitted, the notebook-level
        ``until_x`` is used; if that is ``None`` as well, no block is reset
        (previously this raised a ``TypeError`` on ``i <= None``).
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Fall back to the notebook parameter when no explicit value is given.
        if reset_until is None:
            reset_until = until_x

        # Reset until ith layer of mv2 (inclusive); None keeps every block.
        if reset_until is not None:
            for i, block in enumerate(self.mv2.features.children()):
                if i <= reset_until:
                    block.apply(weight_reset)

    def forward(self, x):
        """Return ``num_classes`` logits per sample for a batch ``x``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling over the two trailing (spatial) axes.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model with 31 output logits and move it to the selected device
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The training loop steps this scheduler with the epoch's validation loss; the
# learning rate is reduced once that loss has plateaued for `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them by the label masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
# Training configuration
epochs = 100
alpha = 1                      # Beta(alpha, alpha) parameter of the mixup coefficients
early_stopping_patience = 25   # epochs without a new best validation loss before stopping
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE loss averaged over all annotation sets.

    ``outputs`` holds one logit per (sample, class); ``labels`` and ``masks``
    carry an additional trailing axis with one slice per annotation set. The
    element-wise losses of every annotation set (computed with the
    notebook-level ``criterion``, which must use ``reduction='none'``) are
    weighted by the matching mask slice, summed, and divided by the total
    mask weight.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0.0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup: blend the two independently shuffled batches sample by
        # sample with coefficients drawn from Beta(alpha, alpha)
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        # coefficient broadcast over the (channel, time, freq) axes of the inputs
        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        # coefficient broadcast over the (class, annotation) axes
        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no mixup, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Average of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the final epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6408519487123232 0.51581871509552
Epoch:  1


0.33217674413242854 0.19733040460518428
Epoch:  2


0.18641500859647184 0.16503086260386876
Epoch:  3


0.17457533245151108 0.16648546074117934
Epoch:  4


0.1703131811038868 0.14623441014971053
Epoch:  5


0.16826429721471425 0.14887359951223647
Epoch:  6


0.165820342463416 0.140341440481799
Epoch:  7


0.16478254062098427 0.14626832519258773
Epoch:  8


0.163558183086885 0.14581795249666488
Epoch:  9


0.16360683054537387 0.13736784671034133
Epoch:  10


0.16104186145034996 0.14281695868287766
Epoch:  11


0.1614085802355328 0.13768428883382253
Epoch:  12


0.16054302897002246 0.1373929093991007
Epoch:  13


0.16079953957248377 0.13548690293516433
Epoch:  14


0.15902840889788963 0.14106295151369913
Epoch:  15


0.15740868207570669 0.13889009079762868
Epoch:  16


0.15777962836059364 0.14301430327551706
Epoch:  17


0.1580564009176718 0.13312737111534392
Epoch:  18


0.1581323517335428 0.13664968737534114
Epoch:  19


0.15714547078351718 0.13774718237774713
Epoch:  20


0.1562445771855277 0.1324371216552598
Epoch:  21


0.15551875733040474 0.13069308123418263
Epoch:  22


0.15684311615454183 0.13351772086960928
Epoch:  23


0.15427537222166318 0.13169747591018677
Epoch:  24


0.15686652748971372 0.13315227734191076
Epoch:  25


0.1555463498508608 0.13424735303436006
Epoch:  26


0.15511560560883703 0.13482172255005156
Epoch:  27


0.15530722205703323 0.13002547515290125
Epoch:  28


0.15388373547309153 0.13055264417614257
Epoch:  29


0.15504953305463534 0.13046546493257796
Epoch:  30


0.15454589635939212 0.13126346894672938
Epoch:  31


0.15189254001991168 0.1330120446426528
Epoch:  32


0.15305104972542943 0.13325320503541402
Epoch:  33


0.15251827723271139 0.13026950934103557
Epoch    33: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  34


0.15203575024733673 0.12574490691934312
Epoch:  35


0.14933985914732958 0.12509394543511526
Epoch:  36


0.15038522997417966 0.12449783299650465
Epoch:  37


0.14967518239407926 0.12510464021137782
Epoch:  38


0.14788492587772575 0.1243574321269989
Epoch:  39


0.14898374676704407 0.12427964274372373
Epoch:  40


0.1487376830867819 0.12362434715032578
Epoch:  41


0.14907393624653686 0.12395521679094859
Epoch:  42


0.14842608973786636 0.1245691989149366
Epoch:  43


0.1484538417410206 0.12383440136909485
Epoch:  44


0.14753348803198016 0.12431407826287406
Epoch:  45


0.149546760965038 0.12462938683373588
Epoch:  46


0.1475714823684177 0.12444133843694415
Epoch    46: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  47


0.14719443989766612 0.12392913443701607
Epoch:  48


0.14801795780658722 0.12421662360429764
Epoch:  49


0.14619922396299 0.12387833957161222
Epoch:  50


0.14781763626111522 0.1240990161895752
Epoch:  51


0.14899101410363172 0.12401434353419713
Epoch:  52


0.146618986451948 0.12399079544203621
Epoch    52: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  53


0.1475933117640985 0.12450529315641948
Epoch:  54


0.1476953625679016 0.12402166639055524
Epoch:  55


0.14725587657980016 0.12396198723997388
Epoch:  56


0.14773730128198057 0.12397636473178864
Epoch:  57


0.14717962089422587 0.12401283532381058
Epoch:  58


0.1467461940404531 0.12398653477430344
Epoch    58: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  59


0.1471084420745437 0.12404880140508924
Epoch:  60


0.14756422026737318 0.12393420402492795
Epoch:  61


0.14810721576213837 0.12390649850879397
Epoch:  62


0.14718682862616875 0.12381202088935035
Epoch:  63


0.14754055319605647 0.12384714824812752
Epoch:  64


0.147295183426625 0.12424905278853007
Epoch    64: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  65
