In [1]:
# Parameters
# `until_x` is the index of the last block of the MobileNetV2 feature extractor
# that keeps its pretrained weights: Task5Model re-initialises every block whose
# index is greater than this value. It is also embedded in the checkpoint name.
until_x = 2


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build spectrogram inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column; all other columns are label columns.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known label
        columns that must be ignored (mask = 0) for every annotation in which
        that unknown column is greater than 0.5. The unknown columns themselves
        are dropped from the targets.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotations : int, optional
        Number of annotations used per recording. Annotations are numbered in
        row order; any beyond ``n_annotations`` are ignored.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(files, 1, n_frames, array_rows)`` where ``array_rows`` is the
        first dimension of the stored arrays (presumably mel bins — the stored
        arrays are transposed before truncation).
    y : numpy.ndarray
        Shape ``(files, labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label contributes to the loss, else 0.

    Raises
    ------
    ValueError
        If a recording has fewer than ``n_annotations`` annotations (the
        original code failed later with an opaque concatenate shape error).
    """
    df = df.reset_index()
    # Number the annotations of each recording 1, 2, 3, ... in row order.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1

    annotations_per_file = df.groupby('audio_filename').size()
    if (annotations_per_file < n_annotations).any():
        raise ValueError(
            'every audio file needs at least {} annotations'.format(n_annotations))

    df = df.set_index(['audio_filename', 'slno'])

    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from "everything counts" and switch off the known labels wherever
    # the corresponding unknown label was selected by the annotator.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) and sort so that every annotation slice
    # lists the recordings in the same order.
    df = df.swaplevel(0, 1).sort_index()
    y_mask = y_mask.swaplevel(0, 1).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.concatenate(
        [df.loc[[k], :].values[..., np.newaxis] for k in annotation_ids],
        axis=2)
    y_mask = np.concatenate(
        [y_mask.loc[[k], :].values[..., np.newaxis] for k in annotation_ids],
        axis=2)

    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :],
            axis=0)
        for name in filenames])
    # Add the single image channel expected by the convolutional model.
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing augmentation instance (built with the helper's default
# arguments); AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with their targets and loss weights.

    Parameters
    ----------
    X : torch.Tensor
        Samples stacked along the first axis; ``X[idx]`` is one sample. The
        augmentation path indexes it as ``(channel, time, frequency)`` —
        presumably ``(1, frames, mel bins)``; confirm against the caller.
    y : torch.Tensor
        Targets, indexed along the first axis like ``X``.
    weights : torch.Tensor
        Per-target loss weights/masks, indexed along the first axis like ``X``.
    transform : callable, optional
        Albumentations-style pipeline called as ``transform(image=array)`` and
        returning a dict with an ``'image'`` tensor. When falsy, samples are
        returned untouched.

    Notes
    -----
    The augmentation path also uses the module-level ``random_erasing`` object.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for ``idx``, augmenting if configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation to [0, 1] so the sample can be handled as
            # an image by the PIL / albumentations steps below
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would otherwise become 0 / 0 = NaN.
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: circular shift along axis 1 so that the
            # element at a random offset becomes the first one
            offset = np.random.randint(sample.shape[1])
            sample = torch.roll(sample, shifts=-offset, dims=1)

            # apply albumentations transforms
            image = np.array(self.pil(sample))
            sample = self.transform(image=image)['image']
            # the transform output has its two axes swapped relative to the
            # input; add the channel axis back and restore the axis order
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: unpickling can execute arbitrary code; only load a trusted metadata file.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

taxonomy_df = metadata['taxonomy_df']
is_unknown_class = taxonomy_df.fine_id == 'X'
unknown_fine = taxonomy_df.loc[is_unknown_class, ['fine', 'coarse']]
known_fine = taxonomy_df.loc[~is_unknown_class, ['fine', 'coarse']]

# Pair every fine label whose fine_id is 'X' with the other fine labels of the
# same coarse class, then collect them as {unknown label: [known labels]}.
fine_pairs = pd.merge(unknown_fine, known_fine, on='coarse', how='inner')
fine_pairs = fine_pairs.drop(columns='coarse')
unknown_to_known = fine_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_fine.fine.tolist()

# Put the coarse and fine annotation tables side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose label values sum to 37 is zeroed out, in both splits
# (presumably a row with every label column set — TODO confirm).
for annotations in (train_df, valid_df):
    row_is_suspect = annotations.sum(axis=1) == 37
    annotations.loc[row_is_suspect, :] = 0

In [6]:
# Build the (inputs, targets, loss mask) arrays for both splits. prepare_data
# returns X with a singleton channel axis in position 1, and y / mask with the
# three annotations stacked along the last axis.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed per entry of the last axis on the training set only,
# then applied to both splits so validation data never influences them.
# The channel count is read from the data instead of being hard-coded to 128:
# with any other size, reshape(-1, 128) would silently mix channels.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
# A channel with zero variance would turn into NaN/inf; leave it centred instead.
channel_stds = np.where(channel_stds == 0, 1.0, channel_stds)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
augmentation_steps = [
    # random shift / scale / rotation with the limits given below
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library's default settings
    GridDistortion(),
    # convert the augmented image to a torch tensor
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
train_tensors = [torch.Tensor(array) for array in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(array) for array in (valid_X, valid_y, valid_y_mask)]

# Only the training set is augmented; validation samples are served as stored.
train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place when it offers ``reset_parameters``.

    Objects without that method are left untouched, which makes the function
    suitable for ``module.apply(weight_reset)``.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        return
    reset()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel inputs.

    The network chains three parts:

    * ``bw2col`` — batch norm plus two 1x1 convolutions that turn the
      1-channel input into the 3 channels fed to MobileNetV2;
    * ``mv2.features`` — the feature extractor of a pretrained MobileNetV2,
      whose blocks with an index greater than the module-level ``until_x``
      are re-initialised;
    * ``final`` — a two-layer head producing ``num_classes`` logits from the
      1280 pooled features.
    """

    def __init__(self, num_classes):

        super().__init__()

        to_three_channels = [
            nn.BatchNorm2d(1),
            nn.Conv2d(in_channels=1, out_channels=10, kernel_size=1, padding=0),
            nn.ReLU(),
            nn.Conv2d(in_channels=10, out_channels=3, kernel_size=1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*to_three_channels)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        classifier_head = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*classifier_head)

        # Keep the pretrained weights of feature blocks 0..until_x and
        # re-initialise all later blocks.
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch ``x``."""
        features = self.mv2.features(self.bw2col(x))
        # Global max pooling: reduce the last axis, then the new last axis.
        pooled, _ = features.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 is the size of the output layer (num_classes); presumably the number of
# label columns left after prepare_data drops the unknown ones — TODO confirm
# against train_y.shape[1].
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Lowers the learning rate when the value passed to scheduler.step() (the
# validation loss in the training loop) has not improved for `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them by the label masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def masked_bce_loss(criterion, outputs, labels, masks):
    """Average the masked element-wise loss over all annotation sets.

    ``labels`` and ``masks`` carry one set of annotations per entry of their
    last axis. For each set, the element-wise ``criterion`` between ``outputs``
    and that set's labels is weighted by the matching mask; the weighted losses
    are summed and divided by the total mask weight.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


epochs = 100
# Stop once the validation loss has not reached a new minimum for this many epochs.
early_stopping_patience = 25
# Both parameters of the Beta distribution the mixup coefficients are drawn from.
mixup_alpha = 1

train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model = model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One coefficient per sample, shared by the inputs, labels and masks of
        # the two independently shuffled batches.
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(criterion, outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.detach().cpu().numpy()

    # ---- validation pass (no augmentation, no gradients) ----
    model = model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.detach().cpu().numpy()

    # Report the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint only when the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6113220557973191 0.40519783752305166
Epoch:  1


0.2822694101849118 0.1842621522290366
Epoch:  2


0.18466054748844457 0.17012631041663034
Epoch:  3


0.1780504233933784 0.17041921189853124
Epoch:  4


0.17490116164490982 0.1586468368768692
Epoch:  5


0.1747409701347351 0.16431637746947153
Epoch:  6


0.17123096456398834 0.15653841836111887
Epoch:  7


0.17027266444386663 0.1549873799085617
Epoch:  8


0.16981081986749494 0.15224752042974746
Epoch:  9


0.16846896466371175 0.15335882348673685
Epoch:  10


0.1668598724378122 0.14761360841138021
Epoch:  11


0.16644368743574298 0.14916092370237624
Epoch:  12


0.16757358248169357 0.1480794825724193
Epoch:  13


0.16576893426276543 0.14699398619788034
Epoch:  14


0.16331307992741867 0.1496121542794364
Epoch:  15


0.16373742230840632 0.14056907807077682
Epoch:  16


0.16315616788090886 0.1434141376188823
Epoch:  17


0.16202280489174095 0.1395622736641339
Epoch:  18


0.16144793541044802 0.13499860146215983
Epoch:  19


0.16159061523708138 0.136724591255188
Epoch:  20


0.15996832058236404 0.13652539678982326
Epoch:  21


0.16141114283252406 0.1354948260954448
Epoch:  22


0.16052794738395795 0.1375276425055095
Epoch:  23


0.1590188121473467 0.13357856337513244
Epoch:  24


0.15821062833876223 0.13268190835203444
Epoch:  25


0.15905060558705716 0.13257317457880294
Epoch:  26


0.15871090341258692 0.13892160675355367
Epoch:  27


0.15682292669206052 0.13461054967982428
Epoch:  28


0.15658179730982394 0.1394715000476156
Epoch:  29


0.15515476948506124 0.13151196177516664
Epoch:  30


0.15671814615662033 0.1357157900929451
Epoch:  31


0.15638022608048208 0.1381305754184723
Epoch:  32


0.15632195327733014 0.13243969423430307
Epoch:  33


0.1544435785428898 0.12988601412091935
Epoch:  34


0.15625174061672106 0.13073255441018514
Epoch:  35


0.15397051258667097 0.13334931752511434
Epoch:  36


0.15567289051171895 0.13061805814504623
Epoch:  37


0.1545776922155071 0.1338369186435427
Epoch:  38


0.15481309995457931 0.1305459248168128
Epoch:  39


0.1544441548553673 0.1291811242699623
Epoch:  40


0.1533444499647295 0.1302838602236339
Epoch:  41


0.15388772858155741 0.13398363015481404
Epoch:  42


0.15439262663995898 0.12816468519823893
Epoch:  43


0.15272885117981885 0.12660735419818334
Epoch:  44


0.15329292658213023 0.1313649437257222
Epoch:  45


0.15363722395252538 0.12852132959025248
Epoch:  46


0.15351618705569087 0.12969881296157837
Epoch:  47


0.1521506373946731 0.12851461235966002
Epoch:  48


0.15216380760476395 0.13229734876326152
Epoch:  49


0.1514452156182882 0.12662160503012793
Epoch    49: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  50


0.1511142205547642 0.12547644866364344
Epoch:  51


0.15079415649981112 0.12578330508300237
Epoch:  52


0.1494952758421769 0.1254678304706301
Epoch:  53


0.15080363524926677 0.12591080686875752
Epoch:  54


0.14871504741746025 0.12558263753141677
Epoch:  55


0.1500913436348374 0.1256459898182324
Epoch:  56


0.14919336099882383 0.12530344192470824
Epoch:  57


0.15010832532032117 0.12576389632054738
Epoch:  58


0.14968308645325737 0.1262426535998072
Epoch:  59


0.14904952008981961 0.12581403872796468
Epoch:  60


0.1490760791946102 0.12556701579264232
Epoch:  61


0.1483465062605368 0.12547169838632857
Epoch:  62


0.1477652389455486 0.12566120603254863
Epoch    62: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  63


0.14952159693112244 0.12540599490915025
Epoch:  64


0.148437196741233 0.1252765240413802
Epoch:  65


0.14969253499765653 0.12564622717244284
Epoch:  66


0.14921964664716977 0.12577824401003973
Epoch:  67


0.14820254896138166 0.12546763462679728
Epoch:  68


0.14787548699894468 0.12518845924309321
Epoch:  69


0.14925063582690987 0.12536049847091948
Epoch:  70


0.14884744866474256 0.12526361963578633
Epoch:  71


0.148714649516183 0.12519552452223642
Epoch:  72


0.1474098317526482 0.12564301490783691
Epoch:  73


0.1480121817943212 0.12529028526374272
Epoch:  74


0.14815419872064847 0.12511393214975083
Epoch:  75


0.14710906027136622 0.12518238489116942
Epoch:  76


0.1478763548909007 0.12501143983432225
Epoch:  77


0.14830901695264354 0.12502065088067735
Epoch:  78


0.14857616094318596 0.12517885438033513
Epoch:  79


0.14828494553630417 0.1251827957374709
Epoch:  80


0.1498099162771895 0.12493695744446345
Epoch:  81


0.14994344397171125 0.12572214539561952
Epoch:  82


0.14804521887689023 0.12566640440906798
Epoch:  83


0.14919673872960582 0.12530869990587234
Epoch:  84


0.14772347662899946 0.12517651809113367
Epoch:  85


0.14770694842209686 0.1252824142575264
Epoch:  86


0.14885219730235436 0.12508186910833632
Epoch    86: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  87


0.147251639414478 0.1252341206584658
Epoch:  88


0.14892320616825208 0.12528354674577713
Epoch:  89


0.14852585663666595 0.12504116765090398
Epoch:  90


0.14713418081000046 0.12516202564750398
Epoch:  91


0.14808719907258008 0.12555498204060964
Epoch:  92


0.1475784806786357 0.12529961977686202
Epoch    92: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  93


0.14802700764424093 0.1253440050142152
Epoch:  94


0.14880899400324435 0.12514122256210872
Epoch:  95


0.148465488810797 0.12515441115413392
Epoch:  96


0.14739433776687932 0.1254909804889134
Epoch:  97


0.14752949572898247 0.12544676874365127
Epoch:  98


0.14788903577907667 0.12524579678262984
Epoch    98: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  99


0.14804054595328667 0.12510433473757335
