In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 num_frames=635, num_annotations=3):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (recording, annotation). The index must be named
        ``audio_filename``; every column is treated as a label column.
        Each recording must appear exactly ``num_annotations`` times,
        otherwise the per-annotation slices differ in length and stacking
        fails. The caller's frame is not modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be masked out when the unknown label is set (> 0.5).
        The unknown columns themselves are removed from the targets.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    num_frames : int, optional
        Number of leading rows kept from each transposed spectrogram.
        Files with fewer rows make the final stacking fail.
    num_annotations : int, optional
        Number of annotations per recording (size of the last axis of
        ``y`` and ``y_mask``).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_recordings, 1, num_frames, n_bands)``.
    y : numpy.ndarray
        Shape ``(n_recordings, n_known_labels, num_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where a label is masked out, 1 elsewhere.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the annotations of every recording 1, 2, 3, ... in row order and
    # index the table by (recording, annotation number).
    df = df.reset_index()
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns off from the target columns.
    unknown_flags = df.loc[:, unknown_cols]
    df = df.drop(columns=unknown_cols)

    # Start with every label counted, then switch off the known labels that an
    # annotator covered with the corresponding "unknown" label.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[unknown_flags[unknown] > 0.5, known] = 0

    # Put the annotation number first so each annotation can be sliced out with
    # the recordings in the same (sorted) order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, num_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in annotation_ids], axis=2)

    # Load the spectrograms in the same recording order as the targets.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:num_frames, :]
        for name in filenames])
    X = X[:, np.newaxis]  # add the single input-channel axis

    return X, y, y_mask


# Shared RandomErasing instance (project-local class, built with its default
# arguments); AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with optional on-the-fly augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs, indexed along the first axis. The augmentation path cycles
        along axis 1 of a single sample, so samples are expected to be 3-D
        (assumed ``(1, frames, bands)`` — TODO confirm against the caller).
    y : torch.Tensor
        Targets, indexed along the first axis.
    weights : torch.Tensor
        Per-target weights / masks, indexed along the first axis.
    transform : callable or None
        Albumentations-style callable used as ``transform(image=array)`` and
        returning a dict with an ``'image'`` tensor. When falsy, samples are
        returned untouched.

    Each item is the tuple ``(sample, y[idx], weights[idx])``.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    @staticmethod
    def _minmax_scale(sample):
        """Scale ``sample`` to [0, 1]; return ``(scaled, minimum, value_range)``.

        A constant sample has a zero range; it is replaced by 1 so the result
        is all zeros instead of NaN, and ``scaled * value_range + minimum``
        still reproduces the input.
        """
        minimum = sample.min()
        value_range = sample.max() - minimum
        if value_range == 0:
            value_range = torch.ones_like(value_range)
        return (sample - minimum) / value_range, minimum, value_range

    def __getitem__(self, idx):
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation (undone at the end of the pipeline)
            sample, minimum, value_range = self._minmax_scale(sample)

            # randomly cycle the file: start at a random position along axis 1
            # and wrap the beginning around to the end
            start = np.random.randint(sample.shape[1])
            sample = torch.roll(sample, shifts=-start, dims=1)

            # apply albumentations transforms on an image array
            # NOTE(review): the PIL round-trip presumably quantises the values
            # to 8 bits — confirm this is acceptable.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)['image']
            # restore the leading channel axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level `random_erasing` instance)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + minimum

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
is_unknown = taxonomy_df.fine_id == 'X'
unknown_fine = taxonomy_df.loc[is_unknown, ['fine', 'coarse']]
known_fine = taxonomy_df.loc[~is_unknown, ['fine', 'coarse']]

# For every fine label whose fine_id is 'X', collect the fine labels with any
# other fine_id that share its coarse value.
unknown_known_pairs = unknown_fine.merge(known_fine, on='coarse', how='inner')
unknown_to_known = (
    unknown_known_pairs
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_fine.fine.tolist()

# Coarse and fine annotation tables side by side, one frame per split.
train_df = pd.concat(
    [metadata[key] for key in ('coarse_train', 'fine_train')], axis=1, sort=True)
valid_df = pd.concat(
    [metadata[key] for key in ('coarse_test', 'fine_test')], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Rows whose values sum to exactly 37 are zeroed out in both splits.
for annotations in (train_df, valid_df):
    rows_to_clear = annotations.sum(axis=1) == 37
    annotations.loc[rows_to_clear, :] = 0

In [5]:
# Build inputs, per-annotation targets and loss masks for each split.
train_X, train_y, train_y_mask = prepare_data(
    df=train_df, unknown_to_known=unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(
    df=valid_df, unknown_to_known=unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics are taken per position of the last axis, over the training split
# only, and then applied to both splits. The size of that axis is read from the
# data instead of being hard-coded.
# NOTE: this cell overwrites train_X / valid_X, so running it twice normalises
# already-normalised data; re-run the data preparation cell first.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Applied in list order; ToTensor comes last so the pipeline returns a tensor,
# which AudioDataset.__getitem__ then reshapes.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
batch_size = 64

# Only the training set is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches it mixes together.
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [9]:
# Define the device to be used
cuda = True  # set to False to force the CPU
# Use the first GPU only when it was requested AND is actually available, so
# the notebook still runs (slowly) on a machine without CUDA.
device = torch.device('cuda:0' if cuda and torch.cuda.is_available() else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place when it provides ``reset_parameters``.

    Objects without that method are left untouched, which makes the function
    safe to hand to ``nn.Module.apply`` for a whole sub-tree of modules.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default value of `until_x`, which Task5Model.__init__ compares against the
# index of each child of the MobileNetV2 feature extractor. A later cell
# overrides it.
# NOTE(review): looks like a papermill-style parameter cell — confirm.
until_x = None

In [12]:
# Parameters
# Children of the MobileNetV2 feature extractor with index <= until_x get their
# weights re-initialised in Task5Model.__init__; the value is also part of the
# checkpoint file name written by the training loop.
until_x = 14


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2-based multi-label classifier for single-channel inputs.

    The network is made of three stages:

    * ``bw2col`` - batch norm plus two 1x1 convolutions mapping the single
      input channel to the 3 channels fed to MobileNetV2;
    * ``mv2.features`` - the feature extractor of an ImageNet-pretrained
      MobileNetV2 (its classifier head is not used);
    * ``final`` - a two-layer head producing ``num_classes`` logits.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_until : int or None, optional
        Children of ``mv2.features`` with index ``<= reset_until`` are
        re-initialised (their pretrained weights are discarded). When omitted,
        the module-level ``until_x`` is used, as before. If that is ``None``
        as well, nothing is reset (previously ``i <= None`` raised TypeError).
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Fall back to the notebook-level parameter for existing callers.
        if reset_until is None:
            reset_until = until_x

        # Reset the feature-extractor children up to and including
        # `reset_until`; deeper children keep their pretrained weights.
        if reset_until is not None:
            for index, block in enumerate(self.mv2.features.children()):
                if index <= reset_until:
                    block.apply(weight_reset)

    def forward(self, x):
        """Return the logits of shape ``(batch, num_classes)`` for ``x``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # global max pooling over the last two axes of the feature map
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
num_classes = 31
model = Task5Model(num_classes)
model = model.to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# apply the label masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1               # mixup coefficients are drawn from Beta(alpha, alpha)
checkpoint_path = './model_system1_until_{}'.format(until_x)

train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked binary cross-entropy averaged over all sets of annotations.

    ``labels`` and ``masks`` carry one set of annotations per index of their
    last axis. The same ``outputs`` are scored against every set with the
    module-level ``criterion`` (element-wise BCE-with-logits); each element is
    weighted by its mask and the total is divided by the sum of the masks.
    The divisor is clamped so a batch in which every label is masked yields a
    loss of 0 instead of NaN (a NaN would propagate into the weights).
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum().clamp(min=1e-8)


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently drawn batches with one coefficient
        # per sample; inputs, labels and masks share the same coefficient
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.view(-1, 1, 1, 1)
        lam_targets = lam.view(-1, 1, 1)

        inputs = (lam_inputs * batch_a[0]) + ((1 - lam_inputs) * batch_b[0])
        labels = (lam_targets * batch_a[1]) + ((1 - lam_targets) * batch_b[1])
        masks = (lam_targets * batch_a[2]) + ((1 - lam_targets) * batch_b[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation (no augmentation, no mixup, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Printed before the early-stopping check so the last epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the weights of the best epoch so far on disk.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), checkpoint_path)
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6359614520459562 0.4810963145324162
Epoch:  1


0.3327409476041794 0.200448676943779
Epoch:  2


0.19433722262446945 0.3686089132513319
Epoch:  3


0.18101273799264753 0.1714055005993162
Epoch:  4


0.17909393721335642 0.169865078159741
Epoch:  5


0.17634164642643285 0.1675066841500146
Epoch:  6


0.17743996389814326 0.16762338791574752
Epoch:  7


0.17397732952156583 0.16056728150163377
Epoch:  8


0.17466509704654282 0.1683943612234933
Epoch:  9


0.17287330571058634 0.15778187130178725
Epoch:  10


0.17147992833240613 0.15393650106021337
Epoch:  11


0.1714782239617528 0.17133353863443648
Epoch:  12


0.16976637171732412 0.15534078436238424
Epoch:  13


0.16954318494410128 0.15479938685894012
Epoch:  14


0.1690757737771885 0.16028085350990295
Epoch:  15


0.16991512517671328 0.16649787766592844
Epoch:  16


0.16730618637961311 0.15323868181024278
Epoch:  17


0.1673515100736876 0.15305675991943904
Epoch:  18


0.16641543080677856 0.15270074137619563
Epoch:  19


0.16580953670514598 0.14542569432939803
Epoch:  20


0.16504050066342224 0.15181694711957658
Epoch:  21


0.16452692811553543 0.15510013273784093
Epoch:  22


0.16305469741692413 0.1439571369971548
Epoch:  23


0.16279314820830887 0.1397775677697999
Epoch:  24


0.16122608974173264 0.1456806425537382
Epoch:  25


0.16170901865572543 0.1431033057825906
Epoch:  26


0.16195039894129779 0.14557609600680216
Epoch:  27


0.16093748525993243 0.13585157692432404
Epoch:  28


0.16001057906730756 0.13407392161233084
Epoch:  29


0.16018878326222702 0.13815237049545562
Epoch:  30


0.15910398396285805 0.13464730020080293
Epoch:  31


0.16038927033140854 0.1333667550768171
Epoch:  32


0.16093854404784538 0.13526055003915513
Epoch:  33


0.15990337366993362 0.13466625022036688
Epoch:  34


0.15837888298807917 0.13390391107116426
Epoch:  35


0.15743568660439672 0.1352319781269346
Epoch:  36


0.15820998879703316 0.13461606417383468
Epoch:  37


0.1586711865824622 0.13427759387663432
Epoch    37: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  38


0.15738816357947685 0.13061727157660893
Epoch:  39


0.15461974651426882 0.1300805796469961
Epoch:  40


0.15527511206833092 0.12980748393705913
Epoch:  41


0.15447689391471245 0.13576060320649827
Epoch:  42


0.15527877936492096 0.13218749633857182
Epoch:  43


0.15566045488860156 0.1297508744256837
Epoch:  44


0.15569995585325602 0.12990673631429672
Epoch:  45


0.15419377064382708 0.12952217885426112
Epoch:  46


0.15486172002715035 0.129470200410911
Epoch:  47


0.15521280346689997 0.1293382186974798
Epoch:  48


0.15484698841700684 0.12891179961817606
Epoch:  49


0.15479674049325892 0.12895952378000533
Epoch:  50


0.15510091950764526 0.12842712977102824
Epoch:  51


0.15529939854467237 0.12860208004713058
Epoch:  52


0.15285087477516485 0.12936373906476156
Epoch:  53


0.15361948069688436 0.1298278048634529
Epoch:  54


0.15351377669218425 0.12909517543656485
Epoch:  55


0.15382627740099625 0.12865313781159265
Epoch:  56


0.15375349690785278 0.1293508665902274
Epoch    56: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  57


0.15429317266554446 0.12878150812217168
Epoch:  58


0.15359899603031776 0.12887122056313924
Epoch:  59


0.15443237488334244 0.1288133231656892
Epoch:  60


0.15475436159082362 0.12848095489399775
Epoch:  61


0.1540834702349998 0.12850746725286757
Epoch:  62


0.15348706817304766 0.1286358035036496
Epoch    62: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  63


0.15210329358642166 0.1280343575137002
Epoch:  64


0.15275096651670095 0.1287159664290292
Epoch:  65


0.153481755707715 0.12874474270003183
Epoch:  66


0.15347668286916372 0.12807919617210115
Epoch:  67


0.15280652448937698 0.12841303114380156
Epoch:  68


0.1522550562749038 0.1286154208438737
Epoch:  69


0.1546097784428983 0.1283481770328113
Epoch    69: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  70


0.15358093300381223 0.12809803869043077
Epoch:  71


0.15282448038861557 0.1283485676561083
Epoch:  72


0.15259371456262227 0.1284186989068985
Epoch:  73


0.15419968922395963 0.12885688564607076
Epoch:  74


0.15178939498759605 0.12850191657032287
Epoch:  75


0.15291870123631246 0.12881083360740117
Epoch    75: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  76


0.15440277513620015 0.12856138284717286
Epoch:  77


0.15344574846125938 0.12876953716788972
Epoch:  78


0.15402686193182663 0.12888548310313905
Epoch:  79


0.15420244030050329 0.12858408583062036
Epoch:  80


0.1518741459459872 0.1285203375986644
Epoch:  81


0.15367352197299133 0.12844177122626985
Epoch:  82


0.15376431958095446 0.1284512441073145
Epoch:  83


0.1543749880951804 0.12858122267893382
Epoch:  84


0.1539821725439381 0.12863255185740335
Epoch:  85


0.15343865792493563 0.12802579679659434
Epoch:  86


0.15226508717279177 0.12852556897061213
Epoch:  87


0.15375829387355494 0.12847149159227098
Epoch:  88


0.15345414264782056 0.1285639586193221
Epoch:  89


0.15263103754133792 0.12871223581688745
Epoch:  90


0.154131242149585 0.12832739097731455
Epoch:  91


0.1545547847006772 0.1286160009247916
Epoch:  92


0.15442249541347092 0.1284482404589653
Epoch:  93


0.15310081195186925 0.12848942833287375
Epoch:  94


0.15327395337658958 0.1282659758414541
Epoch:  95


0.15417034843483488 0.12806704533951624
Epoch:  96


0.15376198493145607 0.1286615410021373
Epoch:  97


0.15395586877255826 0.12832886832101004
Epoch:  98


0.15338328762634382 0.1285556639943804
Epoch:  99


0.1539145369787474 0.12821771949529648
