In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (clip, annotation). After ``reset_index()`` it must expose an
        ``audio_filename`` column; every other column is treated as a label.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns it
        makes unreliable. The unknown columns are removed from the targets.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per clip.
    n_frames : int, optional
        Number of leading time frames kept from each (transposed) spectrogram.
    n_annotations : int, optional
        Number of annotations per clip stacked along the last axis of ``y`` and
        ``y_mask``. Annotations numbered above this value are ignored.

    Returns
    -------
    X : numpy.ndarray, shape (n_clips, 1, n_frames, n_bands)
    y : numpy.ndarray, shape (n_clips, n_known_labels, n_annotations)
    y_mask : numpy.ndarray, same shape as ``y``; 0 where the annotation marked
        the corresponding unknown column (> 0.5), 1 elsewhere.

    Clips are ordered by ``audio_filename`` in all three arrays.
    """
    df = df.reset_index()
    # Number the annotations of every clip 1, 2, 3, ... in row order.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols]
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels wherever the
    # annotation flagged the matching unknown column.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so each annotation can be sliced as one
    # block whose rows are sorted by audio_filename.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in annotation_ids], axis=2)

    # Load spectrograms in the same clip order as the first annotation block.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (class imported from the parent directory, built
# with its default arguments). AudioDataset.__getitem__ applies it to each
# augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (spectrogram, labels, mask) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis; each item is expected to be a
        3-D tensor whose axis 1 is cyclically shifted during augmentation.
    y : torch.Tensor
        Targets, indexed along the first axis.
    weights : torch.Tensor
        Per-target weights / masks, indexed along the first axis.
    transform : callable or None
        Albumentations-style callable invoked as ``transform(image=array)`` and
        returning a dict with an ``'image'`` entry. When falsy, samples are
        returned unmodified.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample if a
        transform was supplied."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            value_range = sample.max() - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole sample into NaN, so fall back to a unit scale.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: start at a random offset along axis 1
            # and wrap the leading part around to the end
            shift = int(np.random.randint(sample.shape[1]))
            sample = torch.roll(sample, shifts=-shift, dims=1)

            # apply albumentations transforms (they operate on numpy images)
            # NOTE(review): ToPILImage presumably quantises the scaled values
            # to 8 bits -- confirm this loss of precision is intended.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore a leading channel axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level RandomErasing instance)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: pickle.load executes arbitrary code from the file -- only open a
# metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into "unknown" fine labels (fine_id == 'X') and known ones.
taxonomy_df = metadata['taxonomy_df']
taxonomy_cols = ['fine', 'coarse']
unknown_rows = taxonomy_df.loc[taxonomy_df.fine_id == 'X', taxonomy_cols]
known_rows = taxonomy_df.loc[taxonomy_df.fine_id != 'X', taxonomy_cols]

# For every unknown fine label, list the known fine labels that share its
# coarse category (the merge suffixes the two 'fine' columns with _x / _y).
unknown_known_pairs = unknown_rows.merge(known_rows, on='coarse', how='inner')
unknown_to_known = (
    unknown_known_pairs
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_rows.fine.tolist()

# Coarse and fine label tables side by side, one frame per split.
train_df = pd.concat(
    [metadata[key] for key in ('coarse_train', 'fine_train')], axis=1, sort=True)
valid_df = pd.concat(
    [metadata[key] for key in ('coarse_test', 'fine_test')], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Any row whose values sum to exactly 37 is zeroed out in place
# (presumably an annotation with every label set -- confirm against the data).
for split_df in (train_df, valid_df):
    suspicious_rows = split_df.sum(axis=1) == 37
    split_df.loc[suspicious_rows, :] = 0

In [5]:
# Turn each label table into (spectrogram inputs, stacked annotation targets,
# loss masks); both splits share the same unknown -> known label mapping.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics are computed per position of the last axis, on the training split
# only, and then applied to both splits so no validation data leaks in.
n_channels = train_X.shape[-1]  # was hard-coded to 128
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
# A channel with zero variance would otherwise produce NaN/inf after division.
channel_stds[channel_stds == 0] = 1
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Applied in order by AudioDataset when a transform is supplied: a small random
# shift/scale/rotate, a grid distortion, then conversion to a torch tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
batch_size = 64

# Only the training set is augmented; validation samples are served untouched.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# Two independently shuffled views of the training set; the training loop zips
# them to obtain the pairs of batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when PyTorch can see one; otherwise fall back to the CPU so
# the notebook still runs (slowly) on a machine without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` through its ``reset_parameters`` method, if it has one.

    Intended for ``Module.apply``: objects without a ``reset_parameters``
    attribute are left untouched. Returns ``None``.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        return
    reset()

In [11]:
# Default for until_x; the "# Parameters" cell that follows overrides it.
# Task5Model compares layer indices against this value, so it must be a number
# by the time the model is built.
until_x = None

In [12]:
# Parameters
# Task5Model re-initialises every child of mv2.features whose index is
# <= until_x; the value is also embedded in the checkpoint filename.
until_x = 4


In [13]:
class Task5Model(nn.Module):
    """Pretrained MobileNetV2 feature extractor adapted to one-channel inputs.

    A small stack of 1x1 convolutions maps the single input channel to the
    three channels MobileNetV2 expects; the feature maps are max-pooled over
    both spatial axes and passed to a two-layer head that emits ``num_classes``
    logits.

    The constructor reads the module-level ``until_x``: every child of
    ``mv2.features`` with index <= ``until_x`` has its parameters re-initialised
    with ``weight_reset``.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions.
        to_three_channels = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*to_three_channels)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Head applied to the 1280-dimensional pooled feature vector.
        head_layers = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head_layers)

        # Discard the pretrained weights of the first blocks (index <= until_x).
        for index, block in enumerate(self.mv2.features.children()):
            if index <= until_x:
                block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch of single-channel inputs."""
        three_channel = self.bw2col(x)
        feature_maps = self.mv2.features(three_channel)
        # Global max pooling: reduce the last axis, then the new last axis.
        pooled, _ = feature_maps.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [14]:
# Instantiate the model
# 31 output logits -- presumably one per label column left in train_y; TODO confirm.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over every parameter of the model.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Lowers the learning rate once the value given to scheduler.step() -- the
# epoch's validation loss in the training loop -- has stopped improving
# (patience=5); verbose=True logs each reduction.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# weight them by the annotation masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def masked_annotation_loss(outputs, labels, masks, loss_fn):
    """Masked loss averaged over all annotations of a batch.

    ``outputs`` holds one logit per (sample, class); ``labels`` and ``masks``
    carry an extra trailing annotation axis. The element-wise ``loss_fn`` is
    evaluated against each annotation in turn, weighted by the matching mask
    slice, summed, and divided by the total mask weight.
    """
    total = sum(
        (loss_fn(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1               # Beta(alpha, alpha) parameter of the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass -------------------------------------------------
    model.train()
    this_epoch_train_loss = 0.0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches with one random
        # weight per sample, applied to inputs, labels and masks alike
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, i1[0].shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.view(-1, 1, 1, 1)
        lam_targets = lam.view(-1, 1, 1)

        inputs = ((lam_inputs * i1[0]) + ((1 - lam_inputs) * i2[0])).to(device)
        labels = ((lam_targets * i1[1]) + ((1 - lam_targets) * i2[1])).to(device)
        masks = ((lam_targets * i1[2]) + ((1 - lam_targets) * i2[2])).to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks, criterion)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no gradients) ---------------
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks, criterion)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the best validation loss so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6306213333800033 0.4379451743194035
Epoch:  1


0.31829193838544795 0.18414016706602915
Epoch:  2


0.18795447132071932 0.19309353828430176
Epoch:  3


0.17615866822165413 0.1713311161313738
Epoch:  4


0.17152311068934364 0.15923167765140533
Epoch:  5


0.16927814523915988 0.15429132112434932
Epoch:  6


0.16809273450761228 0.14986143154757364
Epoch:  7


0.16608277285421216 0.14959565443652018
Epoch:  8


0.16326524679725235 0.1578683384827205
Epoch:  9


0.16303480114485766 0.145409066762243
Epoch:  10


0.1616145839562287 0.1387814570750509
Epoch:  11


0.16080067165800044 0.1390202556337629
Epoch:  12


0.16122459157093152 0.13719998512949264
Epoch:  13


0.15915006882435567 0.14352321837629592
Epoch:  14


0.15917598556827856 0.13673576606171473
Epoch:  15


0.15826292134620049 0.13731584272214345
Epoch:  16


0.15773294382804148 0.13728194577353342
Epoch:  17


0.15683876984828227 0.1305730545095035
Epoch:  18


0.156825821544673 0.1360327441777502
Epoch:  19


0.15679054364964767 0.13399767769234522
Epoch:  20


0.15510539146693977 0.1336251816579274
Epoch:  21


0.15532452029150887 0.13416609913110733
Epoch:  22


0.15690496604184848 0.13859543949365616
Epoch:  23


0.15601584999947934 0.13263809042317526
Epoch    23: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  24


0.15329698130891128 0.127637152160917
Epoch:  25


0.15268733533653053 0.12754160059349878
Epoch:  26


0.1507593707458393 0.12657940494162695
Epoch:  27


0.15230347135582487 0.12668807485273906
Epoch:  28


0.1510942219882398 0.12612986351762498
Epoch:  29


0.15104505621098183 0.12601429117577417
Epoch:  30


0.15018931876968694 0.12611645140818187
Epoch:  31


0.15124518081948563 0.12582943269184657
Epoch:  32


0.15111935178975802 0.12619621625968389
Epoch:  33


0.149666708466169 0.12534146862370626
Epoch:  34


0.15108241503303116 0.12560386529990605
Epoch:  35


0.15014564628536636 0.1252815148660115
Epoch:  36


0.15038210959047885 0.1252174898982048
Epoch:  37


0.15027287199690537 0.1255964892251151
Epoch:  38


0.15102522800097595 0.12492403494460243
Epoch:  39


0.1506147376588873 0.12458285370043345
Epoch:  40


0.14997090721452558 0.1254958552973611
Epoch:  41


0.14869167796663335 0.12494593645845141
Epoch:  42


0.15015486204946363 0.12480648926326207
Epoch:  43


0.14934972532697627 0.12528132647275925
Epoch:  44


0.1489978029921248 0.1248063359941755
Epoch:  45


0.14906326902879252 0.1250043287873268
Epoch    45: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  46


0.14898614424305992 0.1246831693819591
Epoch:  47


0.1489876907419514 0.12454433739185333
Epoch:  48


0.14972481453740918 0.12440072957958494
Epoch:  49


0.14961865463772336 0.12438219253505979
Epoch:  50


0.14891805842116074 0.12448811743940626
Epoch:  51


0.14885219488594983 0.12429028962339674
Epoch:  52


0.1483859709791235 0.12457883251564843
Epoch:  53


0.14862245522640846 0.1244128867983818
Epoch:  54


0.1483317190730894 0.12433258018323354
Epoch:  55


0.14899724520541527 0.12458723996366773
Epoch:  56


0.1494231413345079 0.12449389802558082
Epoch:  57


0.15000930347958127 0.12435626132147652
Epoch    57: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  58


0.14777597261441722 0.12426003175122398
Epoch:  59


0.14794897026306875 0.12411344689982277
Epoch:  60


0.1505345311519262 0.12443490326404572
Epoch:  61


0.14738281393373334 0.12454489086355482
Epoch:  62


0.14700535906327739 0.1242504215666226
Epoch:  63


0.149458782092945 0.12424581285033907
Epoch:  64


0.1480271171879124 0.12435028382710048
Epoch:  65


0.14836084319127574 0.12422683302845274
Epoch    65: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  66


0.14872074086923856 0.12434116750955582
Epoch:  67


0.1483242145261249 0.12447866797447205
Epoch:  68


0.15000461605755058 0.12427957143102374
Epoch:  69


0.14933279519145554 0.12487199689660754
Epoch:  70


0.14999107211022764 0.12458680250814982
Epoch:  71


0.14909149706363678 0.12440221118075508
Epoch    71: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  72


0.14837352609312213 0.12462420761585236
Epoch:  73


0.1479467310615488 0.12453716354710716
Epoch:  74


0.14847505012074033 0.1241665855050087
Epoch:  75


0.14887845234290972 0.12438414778028216
Epoch:  76


0.14893769452700745 0.12419122989688601
Epoch:  77


0.14932335671540853 0.12446481095893043
Epoch:  78


0.1487222998528867 0.12468583988291877
Epoch:  79


0.14863549495065534 0.12416033659662519
Epoch:  80


0.14933521844245293 0.12435372705970492
Epoch:  81


0.14822964853531606 0.1246045742716108
Epoch:  82


0.14764706105799288 0.12444375455379486
Epoch:  83


0.14878878963960185 0.1246179821235793
Epoch:  84
