In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, labels and label masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation, indexed by ``audio_filename``. Rows of the same
        clip are numbered 1, 2, 3, ... in order of appearance, and the first
        ``n_annotators`` of them are used. Columns hold the label values,
        including the "unknown" columns named by the keys of
        ``unknown_to_known``. The caller's frame is not modified.
    unknown_to_known : dict
        Maps each "unknown" column name to the list of known-label columns
        that are masked out for an annotation whose unknown value is > 0.5.
    data_dir : str, optional
        Directory that contains one ``<audio_filename>.npy`` array per clip.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotations per clip stacked along the last axis.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_clips, 1, frames, bins)`` with ``frames <= n_frames``.
    y : numpy.ndarray
        Shape ``(n_clips, n_known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the label is masked out, 1 elsewhere.

    Raises
    ------
    ValueError
        From numpy when clips do not all provide ``n_annotators`` annotations
        or the loaded arrays do not share one shape.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the annotations of every clip and index rows by (clip, number).
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns from the known-label columns.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that an
    # annotation's "unknown" flag makes unreliable.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so each annotator is a contiguous,
    # clip-sorted slice; the clip order is then identical across slices.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotators = range(1, n_annotators + 1)
    y = np.concatenate(
        [df.loc[[k], :].values[..., np.newaxis] for k in annotators],
        axis=2)
    y_mask = np.concatenate(
        [y_mask.loc[[k], :].values[..., np.newaxis] for k in annotators],
        axis=2)

    # Load the arrays in the same clip order as the label rows.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)  # add the channel axis

    return X, y, y_mask


# Shared RandomErasing instance built with its default arguments; it is applied
# by AudioDataset.__getitem__ as the last augmentation step when a transform
# is configured.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples.

    Without a ``transform`` the stored sample is returned unchanged. With a
    ``transform`` each sample is augmented on the fly: it is min-max scaled,
    cyclically shifted by a random offset along axis 1, passed through the
    albumentations-style ``transform``, processed by the module-level
    ``random_erasing`` callable, and finally mapped back to its original
    value range.
    """

    def __init__(self, X, y, weights, transform=None):
        """Store the tensors and the optional augmentation pipeline.

        ``X``, ``y`` and ``weights`` are indexed along their first axis.
        ``transform`` is called as ``transform(image=array)`` and must return
        a mapping with an ``'image'`` entry.
        """
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the (optionally augmented) sample with its target and weight."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole sample into NaNs, so use a unit range in that case.
            if value_range == 0:
                value_range = 1.0
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            # NOTE(review): the PIL round-trip presumably quantises the scaled
            # values to 8 bits — confirm this precision loss is acceptable.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the leading axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code, so only open a trusted file.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy = metadata['taxonomy_df']
is_unknown = taxonomy.fine_id == 'X'
is_known = taxonomy.fine_id != 'X'
label_cols = ['fine', 'coarse']

# Pair every fine label whose fine_id is 'X' with the other fine labels of the
# same coarse category, giving {unknown_fine: [known_fine, ...]}.
unknown_known_pairs = pd.merge(
    taxonomy.loc[is_unknown, label_cols],
    taxonomy.loc[is_known, label_cols],
    on='coarse',
    how='inner',
).drop(columns='coarse')
unknown_to_known = (
    unknown_known_pairs
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict()
)
known_labels = taxonomy.loc[is_known].fine.tolist()

# Coarse and fine annotations side by side, one table per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [4]:
# manual correction for one data point
# Every row whose values sum to exactly 37 is zeroed out in both splits.
# NOTE(review): 37 presumably means "every label ticked" — confirm.
for annotations in (train_df, valid_df):
    row_total = annotations.sum(axis=1)
    annotations.loc[row_total == 37, :] = 0

In [5]:
# Build the arrays for each split. prepare_data returns (X, y, y_mask) and
# loads one .npy array per clip from disk, so this cell does file I/O.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Mean and standard deviation are computed per entry of the last axis, from
# the training split only, and then applied to both splits.
# NOTE: this cell overwrites train_X / valid_X, so re-running it on its own
# normalises already-normalised data; re-run the prepare_data cell first.
n_channels = train_X.shape[-1]  # derived from the data instead of a literal 128
flat_train = train_X.reshape(-1, n_channels)
channel_means = flat_train.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = flat_train.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Applied in order: a small random shift/scale/rotation, a grid distortion,
# then conversion of the augmented image to a tensor.
albumentations_transform = Compose(
    [
        ShiftScaleRotate(
            shift_limit=0.1,
            scale_limit=0.1,
            rotate_limit=0.5,
        ),
        GridDistortion(),
        ToTensor(),
    ]
)

In [8]:
# Create the datasets and the dataloaders
# Only the training set gets the augmentation pipeline; validation samples
# are returned unchanged.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

# Two independently shuffled loaders over the same training set provide the
# pairs of batches that the training loop blends together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)

In [9]:
# Define the device to be used
# `cuda` is the user's request; the GPU is only selected when one is actually
# available, so the notebook still runs on a CPU-only machine.
cuda = True  # set to False to force CPU
device = torch.device('cuda:0' if cuda and torch.cuda.is_available() else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it exposes ``reset_parameters``.

    Objects without that attribute are left untouched, so the function can be
    handed to ``apply`` and run over a tree of mixed layer types.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default value of the layer-reset cutoff that Task5Model.__init__ reads; it is
# overridden by the "Parameters" cell that follows.
# NOTE(review): presumably a papermill-style parameter default — confirm.
until_x = None

In [12]:
# Parameters
# Index of the last child of mv2.features that Task5Model re-initialises
# (children 0..until_x inclusive); also embedded in the checkpoint file name.
until_x = 13


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2-based multi-label classifier for single-channel input.

    A small 1x1-convolution stem maps the single input channel to three
    channels, the torchvision MobileNetV2 feature extractor (loaded with
    pretrained weights) processes them, the resulting feature map is
    max-pooled over its last two axes, and a two-layer head produces one
    logit per class.
    """

    def __init__(self, num_classes, reset_until=None):
        """Build the network and optionally re-initialise early backbone layers.

        Parameters
        ----------
        num_classes : int
            Number of output logits.
        reset_until : int or None, optional
            Index of the last child of ``mv2.features`` whose weights are
            re-initialised (children ``0..reset_until`` inclusive). When
            omitted, the module-level ``until_x`` is used. If the resulting
            value is ``None`` no layer is reset and all pretrained weights are
            kept; previously a ``None`` cutoff raised ``TypeError``.
        """
        super().__init__()

        # Stem: normalise the input, then map 1 channel -> 10 -> 3 channels
        # with 1x1 convolutions so the backbone receives a 3-channel input.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Head: the backbone features have 1280 channels.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2
        if reset_until is None:
            reset_until = until_x
        if reset_until is not None:
            for i, x in enumerate(self.mv2.features.children()):
                if i <= reset_until:
                    x.apply(weight_reset)

    def forward(self, x):
        """Return the class logits for a batch of single-channel inputs."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling over the last two axes of the feature map.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits, moved to the selected device.
model = Task5Model(num_classes=31)
model = model.to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    amsgrad=True,
)
# The scheduler is stepped with the validation loss once per epoch and lowers
# the learning rate after `patience` epochs without improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=5,
    verbose=True,
)
# reduction='none' keeps the per-element losses so they can be weighted by
# the label masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def _masked_bce_loss(outputs, labels, masks):
    """Return the mask-weighted mean of the per-element criterion.

    ``outputs`` holds one logit per class. ``labels`` and ``masks`` carry an
    extra trailing axis with one slice per set of annotations. The loss of
    each slice is weighted element-wise by its mask, the slices are summed,
    and the total is divided by the sum of all mask weights.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
alpha = 1  # Beta(alpha, alpha) parameter for the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One blending weight per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_x = lam.reshape(-1, 1, 1, 1)  # broadcasts over the input axes
        lam_y = lam.reshape(-1, 1, 1)     # broadcasts over the label axes

        inputs = (lam_x * i1[0]) + ((1 - lam_x) * i2[0])
        labels = (lam_y * i1[1]) + ((1 - lam_y) * i2[1])
        masks = (lam_y * i1[2]) + ((1 - lam_y) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = _masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Average of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping after 25 epochs without a new best validation loss.
    if epochs_without_new_lowest >= 25:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6413206441982372 0.4677909527506147
Epoch:  1


0.3443456872089489 0.2127846053668431
Epoch:  2


0.19651167255801125 0.1827286865030016
Epoch:  3


0.18407089645798141 0.17843293717929296
Epoch:  4


0.1809015906340367 0.17267641638006484
Epoch:  5


0.1794677888219421 0.17211681817259109
Epoch:  6


0.17819573548999992 0.16695932405335562
Epoch:  7


0.17811021128216306 0.1621890174491065
Epoch:  8


0.17614564541223887 0.16930530539580754
Epoch:  9


0.17722528488249392 0.17310409247875214
Epoch:  10


0.17381035717757973 0.16038001435143606
Epoch:  11


0.17490897549165263 0.1526161198105131
Epoch:  12


0.17133437056799192 0.15397632122039795
Epoch:  13


0.17142006070227236 0.15769426311765397
Epoch:  14


0.17220806068665273 0.15765988188130514
Epoch:  15


0.17118352571049253 0.15421046316623688
Epoch:  16


0.16980994311538902 0.15686624816485814
Epoch:  17


0.1695589374851536 0.15471612768513815
Epoch    17: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  18


0.16719583764269547 0.14500360297305243
Epoch:  19


0.1656970748224774 0.144641817680427
Epoch:  20


0.16521399045312726 0.14344004967382976
Epoch:  21


0.16519813803402153 0.1434665471315384
Epoch:  22


0.16512139544293686 0.14237034427268164
Epoch:  23


0.1660682256962802 0.141918383538723
Epoch:  24


0.16547950175968376 0.14201121777296066
Epoch:  25


0.16516112314688192 0.1423633215682847
Epoch:  26


0.16442219429724925 0.14176236412354878
Epoch:  27


0.16523736030668826 0.14234703247036254
Epoch:  28


0.16571235213730787 0.14128669670649938
Epoch:  29


0.16438047225410873 0.1418536018048014
Epoch:  30


0.1643794581696794 0.1411102701510702
Epoch:  31


0.16398929059505463 0.1416218184999057
Epoch:  32


0.16363846087777936 0.14071465390069143
Epoch:  33


0.1633346382830594 0.1404060912983758
Epoch:  34


0.16427237560620178 0.1386858161006655
Epoch:  35


0.1629759765960075 0.14073897898197174
Epoch:  36


0.16324647495875488 0.13887072460991995
Epoch:  37


0.1625783225974521 0.13835655046360834
Epoch:  38


0.16281584428774343 0.13919990190437861
Epoch:  39


0.16250373906380422 0.13822070509195328
Epoch:  40


0.16256539644421758 0.13863255189997808
Epoch:  41


0.16379993992882805 0.139527861561094
Epoch:  42


0.16330807273452347 0.13717958969729288
Epoch:  43


0.16106844592738795 0.13768270718199865
Epoch:  44


0.16183716944746068 0.13805795354502542
Epoch:  45


0.1632462059323852 0.13833982603890554
Epoch:  46


0.16053745956034274 0.13838937133550644
Epoch:  47


0.16152426560182828 0.1375074918781008
Epoch:  48


0.16192154586315155 0.13741526114089148
Epoch    48: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  49


0.160245138648394 0.13713322473423822
Epoch:  50


0.1620759255177266 0.13699092928852355
Epoch:  51


0.1606103214057716 0.13726293508495604
Epoch:  52


0.16123418429413358 0.13708391253437316
Epoch:  53


0.16205930105737737 0.1371030339172908
Epoch:  54


0.16124691270493172 0.13710967557770865
Epoch:  55


0.15922785248305346 0.13665596076420375
Epoch:  56


0.15973813791532773 0.13692800487790788
Epoch:  57


0.16062601232850873 0.13668981939554214
Epoch:  58


0.1613631494141914 0.13676808029413223
Epoch:  59


0.16147296976398778 0.1365622111729213
Epoch:  60


0.16143818401001594 0.13650484063795634
Epoch:  61


0.16130745773379868 0.13676132368189947
Epoch:  62


0.1600402678186829 0.13690509221383504
Epoch:  63


0.16133651056805173 0.13659971207380295
Epoch:  64


0.1612161659711116 0.13653110606329782
Epoch:  65


0.16102725146590052 0.136487321129867
Epoch:  66


0.16108202088523554 0.13615424079554422
Epoch:  67


0.15920199010823224 0.1365690359047481
Epoch:  68


0.16053297551902565 0.13627623660223825
Epoch:  69


0.1602900608165844 0.13655089480536325
Epoch:  70


0.15960567102238937 0.13636244620595658
Epoch:  71


0.1616357721992441 0.13656125643423625
Epoch:  72


0.16034149358401428 0.13639897746699198
Epoch    72: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  73


0.16132198958783536 0.13626594841480255
Epoch:  74


0.16032991457629847 0.1364840641617775
Epoch:  75


0.16068704224921562 0.1365269688623292
Epoch:  76


0.16040261573082693 0.13624623204980577
Epoch:  77


0.158772204776068 0.13615614495107106
Epoch:  78


0.1601815477416322 0.1362932345696858
Epoch    78: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  79


0.1601386122606896 0.13625208607741765
Epoch:  80


0.16115716462199753 0.13650496942656382
Epoch:  81


0.16029933054704923 0.13590948390109198
Epoch:  82


0.16042439760388555 0.13626588881015778
Epoch:  83


0.16117237306929924 0.13640663559947694
Epoch:  84


0.158850106033119 0.13629606259720667
Epoch:  85


0.16106963036833583 0.1363782063126564
Epoch:  86


0.1602388994919287 0.1364349520632199
Epoch:  87


0.16066102764090975 0.13656597584486008
Epoch    87: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  88


0.16048641663950844 0.13655155897140503
Epoch:  89


0.16079513043970675 0.136522898716586
Epoch:  90


0.1618111826278068 0.13621281406709126
Epoch:  91


0.15971765606789975 0.1363348364830017
Epoch:  92


0.16219867645083247 0.1362001331789153
Epoch:  93


0.1603697260489335 0.1361996488911765
Epoch:  94


0.1599215893326579 0.13601453070129668
Epoch:  95


0.16158830595983042 0.13649416927780425
Epoch:  96


0.16004245466477163 0.13659106940031052
Epoch:  97


0.16072559074775591 0.13640482085091726
Epoch:  98


0.16067777936523026 0.13636456536395208
Epoch:  99


0.1607717332002279 0.13636651848043715
