In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build spectrogram inputs, per-annotation targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column (i.e. the index is named ``audio_filename``),
        and every file must have rows for annotations ``1..n_annotations``.
        All remaining columns are treated as label columns.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        whose targets cannot be trusted when that unknown label is set.
        The unknown columns are removed from the targets.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per file.
    n_frames : int, optional
        Number of leading time frames kept from every (transposed) spectrogram.
    n_annotations : int, optional
        Number of annotations per file stacked along the last target axis.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``, files in sorted filename order.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the annotation set an unknown label
        (> 0.5) that maps to that known label, 1 elsewhere.

    Raises
    ------
    ValueError
        If the per-annotation slices (or the loaded spectrograms) do not all
        have the same shape and therefore cannot be stacked.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the annotations of each file 1, 2, 3, ... in row order and make
    # (audio_filename, slno) the index.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns away from the real targets.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that an
    # annotation's unknown flag makes unreliable.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so that selecting one slno yields the
    # files in the same sorted order for every annotation.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    slnos = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in slnos], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in slnos], axis=2)

    # Stored spectrograms are transposed before truncation, so the kept axis
    # is the first axis after the transpose.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = X[:, np.newaxis]

    return X, y, y_mask


# Module-level RandomErasing instance built with its default constructor
# arguments. AudioDataset.__getitem__ looks this global up by name when a
# transform is set, so this line must run before augmented samples are drawn.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrograms with per-annotation targets and loss weights.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis; the augmentation path slices each
        item as ``sample[:, i:, :]`` and therefore needs 3-D items.
    y : torch.Tensor
        Targets, indexed along the first axis like ``X``.
    weights : torch.Tensor
        Loss weights / masks, indexed along the first axis like ``X``.
    transform : callable, optional
        Called as ``transform(image=array)`` and expected to return a mapping
        whose ``'image'`` entry is a 2-D tensor. When falsy, items are returned
        without augmentation.
    erasing : callable, optional
        Applied to the augmented tensor. Defaults to the module-level
        ``random_erasing`` object, which keeps the original behaviour.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of items (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for item ``idx``.

        With a transform set, the sample is min-max scaled, cyclically shifted
        by a random offset along axis 1, augmented, passed through the erasing
        callable and finally mapped back to its original value range.
        """
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole sample into NaN, so divide by one instead in that case.
            if value_range > 0:
                safe_range = value_range
            else:
                safe_range = torch.ones_like(value_range)
            sample = (sample - this_min) / safe_range

            # randomly cycle the file
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            # NOTE(review): the PIL round trip presumably quantises the scaled
            # values to 8 bits - confirm this is intended.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # re-add the leading axis and swap the two image axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level default unless overridden)
            eraser = self.erasing if self.erasing is not None else random_erasing
            sample = eraser(sample.clone().detach())

            # revert min-max transformation with the true range, so a constant
            # sample comes back as the same constant
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load the pickled metadata bundle and derive the label bookkeeping from it.
# NOTE(review): pickle.load can execute arbitrary code - only point this at a
# trusted metadata.pkl.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

taxonomy = metadata['taxonomy_df']
taxonomy_cols = ['fine', 'coarse']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', taxonomy_cols]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', taxonomy_cols]

# Pair every fine label whose fine_id is 'X' with the non-'X' fine labels that
# share its coarse class, then collect those partners per 'X' label.
unknown_known_pairs = (
    pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = {
    unknown_label: list(partners)
    for unknown_label, partners in unknown_known_pairs.groupby('fine_x')['fine_y']
}
known_labels = known_rows.fine.tolist()

# Coarse and fine annotation tables side by side, one frame per split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# Manual correction for one data point: any annotation row whose label values
# sum to 37 is reset to all zeros, in both splits.
# NOTE(review): 37 presumably equals the total number of label columns (a row
# with every label set) - confirm.
for annotations_df in (train_df, valid_df):
    row_sums_to_37 = annotations_df.sum(axis=1) == 37
    annotations_df.loc[row_sums_to_37, :] = 0

In [5]:
# Turn each annotation table into numpy arrays: spectrogram inputs X with a
# singleton channel axis, targets y with one slice per annotation along the
# last axis, and a mask of the same shape as y. prepare_data reads the
# spectrogram .npy files from disk, so this cell needs the data directory.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization: standardise every bin of the last axis with the
# mean / std computed over the training set only, and apply the same
# statistics to the validation set.
# The bin count is read from the data instead of being hard-coded, so a
# different spectrogram height cannot be silently reshaped into wrong
# statistics.
# NOTE: this cell overwrites train_X / valid_X; re-running it without
# re-running the prepare_data cell normalises the data a second time.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations.
# The pipeline is called by AudioDataset.__getitem__ as transform(image=array)
# and its 'image' entry is read back.
albumentations_transform = Compose([
    # small random shift / scale / rotation
    # NOTE(review): rotate_limit is presumably in degrees, i.e. at most half a
    # degree here - confirm against the albumentations docs.
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library's default settings
    GridDistortion(),
    # convert the augmented array to a torch tensor
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders.
# Only the training set is augmented; the validation set gets no transform.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to pair up batches for mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used: the first CUDA device when one is available,
# otherwise the CPU, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()  # set to False to force CPU
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` via its own ``reset_parameters`` method.

    Objects that do not expose ``reset_parameters`` are left untouched, which
    makes the function safe to hand to ``module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Index of the last MobileNetV2 feature block to re-initialise; read as a
# global by Task5Model.__init__, which tests `i <= until_x`.
# None is only a placeholder: that comparison raises TypeError for None, so a
# later cell has to assign an integer before the model is built.
until_x = None

In [12]:
# Parameters
# NOTE(review): this looks like a parameters cell injected by a notebook
# runner (e.g. papermill) overriding the default above - confirm.
# With 10, feature blocks 0..10 of the pretrained backbone are re-initialised.
until_x = 10


In [13]:
class Task5Model(nn.Module):
    """Single-channel front end + MobileNetV2 features + dense classifier head.

    The constructor reads the module-level ``until_x`` and the ``weight_reset``
    helper: every MobileNetV2 feature block whose position is ``<= until_x`` is
    re-initialised after the pretrained weights have been loaded.
    """

    def __init__(self, num_classes):
        """Build the network with ``num_classes`` output logits."""
        super().__init__()

        # 1x1 convolutions that map the single input channel to the three
        # channels the backbone consumes
        channel_adapter = [
            nn.BatchNorm2d(num_features=1),
            nn.Conv2d(in_channels=1, out_channels=10, kernel_size=1, padding=0),
            nn.ReLU(),
            nn.Conv2d(in_channels=10, out_channels=3, kernel_size=1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*channel_adapter)

        # pretrained backbone; only its `features` part is used in forward
        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # classifier head on the 1280 pooled feature channels
        head = [
            nn.Linear(in_features=1280, out_features=512),
            nn.ReLU(),
            nn.BatchNorm1d(num_features=512),
            nn.Linear(in_features=512, out_features=num_classes),
        ]
        self.final = nn.Sequential(*head)

        # Reset until ith layer of mv2
        for block_idx, block in enumerate(self.mv2.features.children()):
            if block_idx <= until_x:
                block.apply(weight_reset)

    def forward(self, x):
        """Return one logit per class for a batch of single-channel inputs."""
        rgb_like = self.bw2col(x)
        feature_maps = self.mv2.features(rgb_like)
        # global max pooling: reduce the last axis twice, leaving (batch, channels)
        pooled = feature_maps.max(dim=-1).values.max(dim=-1).values
        return self.final(pooled)

In [14]:
# Instantiate the model with 31 output logits and move it to the chosen device.
# NOTE(review): 31 presumably equals the number of label columns that
# prepare_data keeps, i.e. train_y.shape[1] - confirm.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped with the validation loss once per epoch by the training loop; the
# logged reductions below (1e-3 -> 1e-4 -> ...) are by a factor of ten.
# NOTE(review): newer PyTorch releases deprecate the scheduler `verbose`
# argument - confirm against the installed version.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the per-element losses so the training loop can
# multiply them by the masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def _masked_bce_loss(criterion, outputs, labels, masks):
    """Masked loss of one prediction against every annotation slice.

    ``labels`` and ``masks`` carry one slice per annotation along their last
    axis. The element-wise loss of ``outputs`` against each slice is multiplied
    by the matching mask slice; the result is the sum over all slices divided
    by the sum of the mask.
    """
    n_annotations = labels.shape[2]
    masked_total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(n_annotations))
    return masked_total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1               # Beta(alpha, alpha) parameter of the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently, so zipping them
    # pairs every batch with another random batch for mixup.
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):
        inputs_a, labels_a, masks_a = batch_a
        inputs_b, labels_b, masks_b = batch_b

        # mixup the inputs ---------
        # one mixing weight per sample, shared by inputs, labels and masks
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, inputs_a.shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * inputs_a) + ((1 - lam) * inputs_b)

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * labels_a) + ((1 - lam) * labels_b)
        masks = (lam * masks_a) + ((1 - lam) * masks_b)
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = _masked_bce_loss(criterion, outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no gradients, no augmentation) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_bce_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # mean of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # checkpoint on every new best validation loss
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # early stopping
    if epochs_without_new_lowest >= early_stopping_patience:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6363523699141838 0.48276255811963764
Epoch:  1


0.33248429685025604 0.22394523663180216
Epoch:  2


0.19114712968065933 0.1840169472353799
Epoch:  3


0.18015379317708918 0.17394252760069712
Epoch:  4


0.1768884707141567 0.16786809691361018
Epoch:  5


0.17552624199841474 0.16199050630841935
Epoch:  6


0.17517782385284836 0.15578722740922654
Epoch:  7


0.1752438770758139 0.16175833770206996
Epoch:  8


0.17130299879086985 0.16018071557794297
Epoch:  9


0.1710887112327524 0.164110466837883
Epoch:  10


0.17099681053612684 0.1597567264522825
Epoch:  11


0.1700706300703255 0.16350706347397395
Epoch:  12


0.1701731279089644 0.15698352668966567
Epoch    12: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  13


0.16746558490637187 0.1468275221330779
Epoch:  14


0.16605065239442363 0.14530190719025476
Epoch:  15


0.16451723793068448 0.14734671264886856
Epoch:  16


0.16469460363323624 0.14432588326079504
Epoch:  17


0.16583055499437693 0.14497999846935272
Epoch:  18


0.1652922976661373 0.1452203488775662
Epoch:  19


0.16401169026220166 0.1441505891936166
Epoch:  20


0.16425786549980576 0.14249650601829802
Epoch:  21


0.16406865417957306 0.14325928581612452
Epoch:  22


0.1628100545825185 0.1417234657066209
Epoch:  23


0.16297408173213135 0.14281079173088074
Epoch:  24


0.16268546758471308 0.1422309194292341
Epoch:  25


0.16334996634238474 0.14021983104092733
Epoch:  26


0.1631477084514257 0.14161493735654013
Epoch:  27


0.16211697459220886 0.14077560710055487
Epoch:  28


0.16377955071023992 0.1391186128769602
Epoch:  29


0.16110137467448776 0.13835746901375906
Epoch:  30


0.16222638818057808 0.14068852471453802
Epoch:  31


0.16212847144217105 0.13782372432095663
Epoch:  32


0.16228157765156515 0.14066805158342635
Epoch:  33


0.1630104264697513 0.1376303189567157
Epoch:  34


0.16147366689669118 0.13661925068923406
Epoch:  35


0.16101053477944555 0.136888430586883
Epoch:  36


0.1608217370671195 0.13650369005543844
Epoch:  37


0.16068102016642288 0.1377862904753004
Epoch:  38


0.15938114153372274 0.13603250043732779
Epoch:  39


0.16098107719743573 0.1365752848131316
Epoch:  40


0.16023463051061373 0.1371260349239622
Epoch:  41


0.1599928277569848 0.1373080611228943
Epoch:  42


0.16058675743438103 0.13667299172707967
Epoch:  43


0.1596274967934634 0.1354874638574464
Epoch:  44


0.1595306070269765 0.13626266483749663
Epoch:  45


0.1611038079938373 0.1349798430289541
Epoch:  46


0.15892116121343663 0.13457313286406652
Epoch:  47


0.16037397167167147 0.13566120181764876
Epoch:  48


0.1591760520194028 0.1346964208143098
Epoch:  49


0.1598911535095524 0.13527400365897588
Epoch:  50


0.1589074255646886 0.13531772792339325
Epoch:  51


0.1594500034242063 0.13428357137101038
Epoch:  52


0.15907854002875252 0.1334587186574936
Epoch:  53


0.15928032108255336 0.1357341366154807
Epoch:  54


0.1583387610074636 0.13459912474666322
Epoch:  55


0.1584993355177544 0.1341696030327252
Epoch:  56


0.1575218229680448 0.13326075885977065
Epoch:  57


0.15776847383460482 0.13286867631333216
Epoch:  58


0.15658637478544907 0.1319893736924444
Epoch:  59


0.1575877336231438 0.13385136957679475
Epoch:  60


0.15702476614230387 0.13237447291612625
Epoch:  61


0.1582872384303325 0.1329554170370102
Epoch:  62


0.15810286153007197 0.13298554186310088
Epoch:  63


0.15636700150128957 0.13261273396866663
Epoch:  64


0.15692974385377523 0.13208333935056413
Epoch    64: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  65


0.15730601186687881 0.131320806486266
Epoch:  66


0.15674682887824806 0.1312659393463816
Epoch:  67


0.1562845779431833 0.13115900435617991
Epoch:  68


0.15500068261816696 0.13115063309669495
Epoch:  69


0.1558240587646897 0.13161083736589976
Epoch:  70


0.15468342441159325 0.13138494853462493
Epoch:  71


0.1566287029433895 0.13089238426515035
Epoch:  72


0.15553090258224592 0.13099435503993714
Epoch:  73


0.15600066249435013 0.13101768493652344
Epoch:  74


0.1564011924170159 0.1307847308261054
Epoch:  75


0.15533499218322136 0.1309760000024523
Epoch:  76


0.155321460317921 0.13109122855322702
Epoch:  77


0.1563301118644508 0.13064435763018473
Epoch:  78


0.15424702054745443 0.13042387685605458
Epoch:  79


0.15532360970973969 0.13103512887443816
Epoch:  80


0.1563996825669263 0.13093767847333634
Epoch:  81


0.15338922513497844 0.1307395984019552
Epoch:  82


0.1551198661327362 0.13073398172855377
Epoch:  83


0.15594052744878306 0.13116599193641118
Epoch:  84


0.15647775337502762 0.13072227154459273
Epoch    84: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  85


0.15497999497362086 0.130761556327343
Epoch:  86


0.1552665072518426 0.13048031287533896
Epoch:  87


0.15642762546603745 0.1308385825582913
Epoch:  88


0.15537906337428736 0.13021618553570338
Epoch:  89


0.15581373910646182 0.13055276232106344
Epoch:  90


0.15598917369906967 0.13060566570077622
Epoch:  91


0.15390480249314695 0.1306116208434105
Epoch:  92


0.15534504523148407 0.13042856859309332
Epoch:  93


0.15609063933024536 0.13059796073607036
Epoch:  94


0.15573795542523666 0.13061347071613585
Epoch    94: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  95


0.1552560619405798 0.13080737633364542
Epoch:  96


0.15402541490825447 0.1305763710822378
Epoch:  97


0.1552710146517367 0.13068701326847076
Epoch:  98


0.1559733108088777 0.13067838230303355
Epoch:  99


0.15388634156536413 0.13057458187852586
