In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, per-annotation targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table whose index is named ``audio_filename``. Each file
        must appear in at least ``n_annotations`` rows (one row per
        annotation); otherwise the per-annotation slices have different
        lengths and stacking fails.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be ignored (mask 0) whenever that unknown column is > 0.5.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed spectrogram.
    n_annotations : int, optional
        Number of annotation rows per file stacked into the last axis.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target counts towards the loss, 0
        where an unknown label made it unreliable.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the annotation rows of every file 1, 2, 3, ... in order of appearance.
    df = df.reset_index()
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" columns off; only the remaining columns become targets.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start with every target enabled, then switch off the known labels that
    # are covered by a positively annotated unknown label.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that `.loc[[k], :]` selects the k-th
    # annotation of every file, with files in sorted (hence identical) order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slnos = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in slnos], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in slnos], axis=2)

    # One spectrogram per file, loaded in the same order as the rows of `y`.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (default constructor arguments), looked up as a
# module-level name by AudioDataset.__getitem__ at the end of its augmentation.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples.

    Without a ``transform`` the stored sample is returned untouched. With a
    ``transform`` the sample is augmented on the fly; see ``__getitem__``.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        """
        Parameters
        ----------
        X, y, weights :
            Containers indexed along their first axis. When ``transform`` is
            set, every ``X[idx]`` must be a 3-D torch tensor because it is
            sliced and concatenated along dim 1.
        transform : callable, optional
            Called as ``transform(image=array)``; must return a mapping whose
            ``'image'`` entry is a 2-D tensor.
        erasing : callable, optional
            Final augmentation applied to the transformed tensor. When
            omitted, the module-level ``random_erasing`` object is used.
        """
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting if configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation; a constant sample has zero range, so
            # fall back to a unit range instead of producing NaNs via 0 / 0.
            this_min = sample.min()
            value_range = sample.max() - this_min
            if value_range == 0:
                value_range = 1.0
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along dim 1 by a random offset
            offset = np.random.randint(sample.shape[1])
            sample = torch.cat(
                [sample[:, offset:, :], sample[:, :offset, :]],
                dim=1)

            # apply albumentations transforms on an image array
            # NOTE(review): ToPILImage presumably quantises the scaled values
            # to 8 bits here - confirm this loss of precision is acceptable.
            image = np.array(self.pil(sample))
            sample = self.transform(image=image)['image']
            # restore the leading channel axis and swap the two image axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (injected callable or the module default)
            erase = self.erasing if self.erasing is not None else random_erasing
            sample = erase(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: unpickling can execute arbitrary code; only load metadata files you trust.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# For every "unknown" fine label (fine_id == 'X'), collect the known fine
# labels that share its coarse class.
unknown_to_known = (
    metadata['taxonomy_df']
    .loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']]
    .merge(
        metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
        on='coarse',
        how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())

# Fine labels that are not the "unknown" placeholder of their coarse class.
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', 'fine'].tolist()

# Coarse and fine annotations side by side, columns sorted, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [4]:
# manual correction for one data point: any row whose columns sum to exactly
# 37 is zeroed out, in both splits
for _split_df in (train_df, valid_df):
    _split_df.loc[_split_df.sum(axis=1) == 37, :] = 0

In [5]:
# Inputs, per-annotation targets and loss masks for the two splits.
train_X, train_y, train_y_mask = prepare_data(
    df=train_df, unknown_to_known=unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(
    df=valid_df, unknown_to_known=unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics are taken per bin of the last axis, on the training split only,
# and re-used unchanged for the validation split.
n_channels = train_X.shape[-1]  # derived from the data instead of a hard-coded 128
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
# A constant channel has zero spread; divide by 1 there instead of by 0.
channel_stds = np.where(channel_stds == 0, 1.0, channel_stds)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Small random shift / scale / rotation, then a grid distortion, then
# conversion of the image array to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the same training set; the training loop
# pairs their batches for mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` when it exposes ``reset_parameters``.

    Objects without that attribute are left untouched, which makes the
    function safe to pass to ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default for the index of the last `mv2.features` child that Task5Model
# re-initialises; overridden by the "Parameters" cell that follows.
# NOTE(review): looks like a default for an externally injected parameter - confirm.
until_x = None

In [12]:
# Parameters
# Task5Model re-initialises the children of `mv2.features` with index <= until_x;
# the value also ends up in the checkpoint file name.
until_x = 6


In [13]:
class Task5Model(nn.Module):
    """Classifier for single-channel inputs built on MobileNetV2 features.

    A small 1x1-convolution stem maps the single input channel to the three
    channels the MobileNetV2 feature extractor takes, the extracted features
    are max-pooled over both spatial axes, and a two-layer head produces one
    logit per class.
    """

    def __init__(self, num_classes, reset_until=None):
        """
        Parameters
        ----------
        num_classes : int
            Number of output logits.
        reset_until : int, optional
            Children of ``mv2.features`` with index ``<= reset_until`` are
            re-initialised; later ones keep their pretrained weights. When
            omitted, the notebook-level ``until_x`` is used. If that is also
            ``None``, nothing is reset.
        """
        super().__init__()

        # 1 -> 10 -> 3 channels with 1x1 convolutions
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # The head consumes the pooled feature map, so it expects 1280 channels
        # from `self.mv2.features`.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2
        if reset_until is None:
            reset_until = until_x
        # Comparing an index with None would raise TypeError, so None is
        # treated as "keep every pretrained layer".
        if reset_until is not None:
            for i, block in enumerate(self.mv2.features.children()):
                if i <= reset_until:
                    block.apply(weight_reset)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # global max pooling over the last two (spatial) axes
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits, one per target label; the parameters are moved to `device`.
model = Task5Model(num_classes=31)
model = model.to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# AMSGrad variant of Adam over all model parameters.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)
# Element-wise losses are kept so they can be masked before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def _masked_annotation_loss(outputs, labels, masks):
    """Average the element-wise criterion over all unmasked targets.

    ``outputs`` is compared against every slice ``labels[:, :, k]`` along the
    last axis, each element-wise loss is weighted by ``masks[:, :, k]``, and
    the total is divided by the sum of all mask weights.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1               # Beta(alpha, alpha) parameter of the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass -------------------------------------------------
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches sample by sample
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, i1[0].shape[0])
        lam = torch.Tensor(mixup_vals)

        lam_inputs = lam[:, None, None, None]   # broadcast over the 4-D inputs
        inputs = (lam_inputs * i1[0]) + ((1 - lam_inputs) * i2[0])

        lam_targets = lam[:, None, None]        # broadcast over the 3-D targets
        labels = (lam_targets * i1[1]) + ((1 - lam_targets) * i2[1])
        masks = (lam_targets * i1[2]) + ((1 - lam_targets) * i2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = _masked_annotation_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no gradients, no parameter updates) ----------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # per-batch averages
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # keep only the checkpoint with the best validation loss so far
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # early stopping
    if epochs_without_new_lowest >= early_stopping_patience:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6369645579441173 0.5489735007286072
Epoch:  1


0.3332349666067072 0.20159438252449036
Epoch:  2


0.19068845260787654 0.1778473321880613
Epoch:  3


0.17868115128697576 0.1660502233675548
Epoch:  4


0.17565145363678802 0.15966816459383285
Epoch:  5


0.1721261328942067 0.1575888672045299
Epoch:  6


0.17118749143304052 0.15489810279437474
Epoch:  7


0.1694080277874663 0.1642081673656191
Epoch:  8


0.16773839979558378 0.1499563870685441
Epoch:  9


0.16712870670331492 0.16277857550552913
Epoch:  10


0.16639761344806567 0.1524467979158674
Epoch:  11


0.16445223423274788 0.14640703797340393
Epoch:  12


0.16455155369397756 0.14121056454522268
Epoch:  13


0.1629601338425198 0.14533367220844542
Epoch:  14


0.16170917974935994 0.14311320334672928
Epoch:  15


0.16252071068093582 0.14531687647104263
Epoch:  16


0.16352998686803355 0.13957878415073668
Epoch:  17


0.16144468937371229 0.1411047875881195
Epoch:  18


0.1614588724600302 0.1428723122392382
Epoch:  19


0.16003905115900813 0.13738141102450235
Epoch:  20


0.16062064267493584 0.14002848523003714
Epoch:  21


0.159556493968577 0.143354204084192
Epoch:  22


0.15892187002542857 0.13761644810438156
Epoch:  23


0.15885501495889714 0.1364936019693102
Epoch:  24


0.15735595854553017 0.13872223666736058
Epoch:  25


0.15738303677455798 0.13660643888371332
Epoch:  26


0.15593969217828801 0.13594206316130503
Epoch:  27


0.1566325297226777 0.1363282214318003
Epoch:  28


0.15598811692482717 0.13516214809247426
Epoch:  29


0.15617179669238426 0.13236346734421595
Epoch:  30


0.15639268506217646 0.13130491439785277
Epoch:  31


0.15478379259238373 0.13113528277192796
Epoch:  32


0.1548742591529279 0.12943445891141891
Epoch:  33


0.1538358673050597 0.13318402426583426
Epoch:  34


0.15506087606017654 0.1299190361584936
Epoch:  35


0.1536425311823149 0.1309027565377099
Epoch:  36


0.15406497346388326 0.13321733048983983
Epoch:  37


0.154055104465098 0.1316078496830804
Epoch:  38


0.15301917896077438 0.12785817789179937
Epoch:  39


0.15221152998305656 0.13147033751010895
Epoch:  40


0.15382672483856613 0.1296012220638139
Epoch:  41


0.15312906334529053 0.1298025580389159
Epoch:  42


0.15236471069825663 0.1282859668135643
Epoch:  43


0.1517193611409213 0.13603551472936357
Epoch:  44


0.15254294872283936 0.1329174701656614
Epoch    44: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  45


0.14974007292373762 0.1268495268055371
Epoch:  46


0.1500022302608232 0.12553965832505906
Epoch:  47


0.14993304819674105 0.12559632637671062
Epoch:  48


0.14983402675873525 0.1250165445463998
Epoch:  49


0.1493918585616189 0.12484992082629885
Epoch:  50


0.146921824764561 0.12488591883863721
Epoch:  51


0.1489488035440445 0.124518911753382
Epoch:  52


0.14716025502295108 0.12463180401495524
Epoch:  53


0.14791322036369428 0.12502780663115637
Epoch:  54


0.14836400465385333 0.12497038926397051
Epoch:  55


0.1485290849531019 0.12430437122072492
Epoch:  56


0.1473930039115854 0.12430388373988015
Epoch:  57


0.14845431535630613 0.12397976858275277
Epoch:  58


0.14828180219676043 0.12430416154009956
Epoch:  59


0.14735195403163498 0.12367568271500724
Epoch:  60


0.14690495181728053 0.12420463987759181
Epoch:  61


0.1481019415565439 0.12373524478503636
Epoch:  62


0.14752592871317993 0.12424512420381818
Epoch:  63


0.1468169979147009 0.12371439273868289
Epoch:  64


0.14721419481006828 0.12409331010920661
Epoch:  65


0.14801949987540375 0.12478124563183103
Epoch    65: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  66


0.14577556381354462 0.1244945153594017
Epoch:  67


0.14647536140841408 0.12482878140040807
Epoch:  68


0.1473061651796908 0.1236391184585435
Epoch:  69


0.14737110001009865 0.12423313409090042
Epoch:  70


0.14637758522420316 0.12416826507874898
Epoch:  71


0.1467383809991785 0.12394565237419945
Epoch:  72


0.1473740721876557 0.12355467357805797
Epoch:  73


0.14727434435406248 0.1237797981926373
Epoch:  74


0.1462046262380239 0.12391329556703568
Epoch:  75


0.1467580964436402 0.12407165872199195
Epoch:  76


0.1477417079983531 0.12355579222951617
Epoch:  77


0.1475616959301201 0.12395133929593223
Epoch:  78


0.14709646838742332 0.12362341795648847
Epoch    78: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  79


0.148119288521844 0.12337345417056765
Epoch:  80


0.14609428676399025 0.12376074705805097
Epoch:  81


0.14593518021944407 0.12434877561671394
Epoch:  82


0.14647387854150823 0.12346079839127404
Epoch:  83


0.14661105178497932 0.12342618725129537
Epoch:  84


0.1469882154786909 0.12358954229525157
Epoch:  85


0.146459316079681 0.12369812173502785
Epoch    85: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  86


0.1465433554069416 0.12364028181348528
Epoch:  87


0.14723031142273466 0.12365905301911491
Epoch:  88


0.14652198069804423 0.12347981333732605
Epoch:  89


0.14690276215205322 0.12413464805909566
Epoch:  90


0.14600622251227097 0.12406544387340546
Epoch:  91


0.14643482098708283 0.12354179578168052
Epoch    91: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  92


0.1464974731206894 0.1235012337565422
Epoch:  93


0.1469734190283595 0.12363579762833458
Epoch:  94


0.1462239743889989 0.12337046010153634
Epoch:  95


0.14640189908646248 0.12372328979628426
Epoch:  96


0.147314363637486 0.12386146932840347
Epoch:  97


0.14733505289296847 0.12383986796651568
Epoch:  98


0.14686102601321968 0.12410171968596322
Epoch:  99


0.14552255135935707 0.12380567938089371
