In [1]:
# Parameters
# until_x: index into the MobileNetV2 feature stack. Task5Model re-initialises
# every child of `mv2.features` whose index is greater than this value, and the
# value is also embedded in the file name of the saved checkpoint.
until_x = 0


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column; all remaining columns are label columns.
        Rows of the same file are numbered 1, 2, ... in order of appearance.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known label
        columns it makes uncertain. The unknown columns are removed from the
        targets; wherever an unknown column exceeds 0.5, the mask of the
        associated known columns is set to 0 for that annotation row.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Maximum number of rows kept from each transposed array.
    n_annotations : int, optional
        Number of annotation rows per file stacked along the last axis.
        Annotation rows numbered above this value are ignored.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, frames, bins)`` where ``frames <= n_frames``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target takes part in the loss,
        0 where it was masked out by an "unknown" annotation.
    """
    df = df.reset_index()
    # Running count within each file -> annotation slot number 1, 2, 3, ...
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask, then zero the known labels that an
    # "unknown" annotation makes unreliable.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the slot number first so that `.loc[[k]]` selects one annotation
    # slot for every file, with files in sorted (hence aligned) order.
    df = df.swaplevel(0, 1).sort_index()
    y_mask = y_mask.swaplevel(0, 1).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in slots], axis=2)

    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames], axis=0)
    # Insert the singleton axis at position 1.
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (class imported from the local
# RandomErasing module); AudioDataset.__getitem__ applies it to each
# augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis; each item is expected to be
        laid out as ``(1, rows, cols)`` when ``transform`` is given.
    y : torch.Tensor
        Targets, indexed along the first axis in step with ``X``.
    weights : torch.Tensor
        Per-target weights/masks, indexed along the first axis in step with ``X``.
    transform : callable, optional
        Called as ``transform(image=array)`` and must return a mapping with an
        ``'image'`` entry. When falsy, samples are returned untouched.
    erasing : callable, optional
        Applied to the augmented sample. Defaults to the module-level
        ``random_erasing`` instance.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation; a constant sample has zero range, so
            # divide by 1 instead to avoid filling the sample with NaNs
            this_min = sample.min()
            span = sample.max() - this_min
            if span == 0:
                span = torch.ones_like(span)
            sample = (sample - this_min) / span

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (these operate on a numpy image)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the leading singleton axis and swap the two image axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            eraser = random_erasing if self.erasing is None else self.erasing
            sample = eraser(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * span) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code - only open a metadata file
# that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the "unknown" entries (fine_id == 'X') and the rest.
taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# Pair every unknown label with the known labels sharing its coarse class;
# the merge suffixes the two `fine` columns as fine_x (unknown) / fine_y (known).
unknown_known_pairs = (
    pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = (
    unknown_known_pairs
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_rows.fine.tolist()

# Place the coarse and fine annotation tables side by side for each split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point: any row whose values sum to 37 is
# zeroed out (presumably a row with every label switched on - TODO confirm)
for split_df in (train_df, valid_df):
    row_sums_to_37 = split_df.sum(axis=1) == 37
    split_df.loc[row_sums_to_37, :] = 0

In [6]:
# Build inputs (X), targets (y) and loss masks (y_mask) for both splits.
# prepare_data returns X with a singleton axis at position 1, and y / y_mask
# with one slice per annotation row along the last axis.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics come from the training split only; the last axis (length 128) is
# treated as the channel axis and every other axis is pooled together.
train_flat = train_X.reshape(-1, 128)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X, valid_X = [
    (split - channel_means) / channel_stds
    for split in (train_X, valid_X)
]

In [8]:
# Define the data augmentation transformations
# The steps run in list order; ToTensor comes last so the dataset receives
# a tensor under the 'image' key.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain the pairs of batches that it mixes together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# `cuda` expresses the preference for a GPU; the notebook falls back to the
# CPU when no CUDA device is available instead of failing on `.to(device)`.
cuda = True
device = torch.device('cuda:0' if cuda and torch.cuda.is_available() else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place when it offers ``reset_parameters``.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function safe to pass to ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    Pipeline: a small 1x1-convolution stem turns the 1-channel input into
    3 channels, the MobileNetV2 ``features`` stack processes it, the result is
    max-pooled over its last two axes, and a two-layer head emits
    ``num_classes`` logits.

    The pretrained MobileNetV2 weights are kept only for the children of
    ``mv2.features`` with index <= the module-level ``until_x``; every later
    child is re-initialised through ``weight_reset``.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Head on top of the pooled features (1280 inputs)
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Reset after ith layer of mv2
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits produced for the batch ``x``."""
        three_channel = self.bw2col(x)
        feature_maps = self.mv2.features(three_channel)
        # Max over the last axis, then over the (new) last axis
        pooled, _ = feature_maps.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 is passed as num_classes, i.e. the width of the final linear layer;
# presumably it equals the number of label columns in train_y - TODO confirm.
model = Task5Model(31).to(device)

Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/checkpoints/mobilenet_v2-b0353104.pth
  0%|                                                                                                                                                                       | 0/14212972 [00:00<?, ?it/s]

 46%|███████████████████████████████████████████████████████████████████▎                                                                               | 6504448/14212972 [00:00<00:00, 64994954.77it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14212972/14212972 [00:00<00:00, 84308733.75it/s]




In [14]:
# Define optimizer, scheduler and loss criteria
learning_rate = 0.001
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# weight them by the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
alpha = 1                     # Beta(alpha, alpha) parameter of the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked loss of ``outputs`` against every annotation slice.

    ``labels`` and ``masks`` carry one slice per annotation along their last
    axis. Each slice is scored against the same ``outputs`` with the
    module-level ``criterion``, weighted element-wise by its mask, and the
    total is divided by the sum of the masks.
    """
    per_annotation = [
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2])]
    return sum(per_annotation) / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches, sample by sample
        mixup_vals = np.random.beta(alpha, alpha, batch_a[0].shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.view(-1, 1, 1, 1)   # broadcasts over the 4-D inputs
        lam_targets = lam.view(-1, 1, 1)     # broadcasts over the 3-D labels / masks

        inputs = (lam_inputs * batch_a[0]) + ((1 - lam_inputs) * batch_b[0])
        labels = (lam_targets * batch_a[1]) + ((1 - lam_targets) * batch_b[1])
        masks = (lam_targets * batch_a[2]) + ((1 - lam_targets) * batch_b[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Average of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Log before the early-stopping check so the last epoch is reported too
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6068785416113364 0.4106622040271759
Epoch:  1


0.28042326182932464 0.18924077706677572
Epoch:  2


0.18605029260790026 0.18917905858584813
Epoch:  3


0.1777831456951193 0.22019881648676737
Epoch:  4


0.1757424719430305 0.15960666750158584
Epoch:  5


0.17465515797202652 0.1665288656949997
Epoch:  6


0.17261046694742666 0.16159206203051976
Epoch:  7


0.17086449547393903 0.16077972097056253
Epoch:  8


0.16991206641132767 0.15539732575416565
Epoch:  9


0.16917432321084513 0.15073248956884658
Epoch:  10


0.16937075555324554 0.15373154623167856
Epoch:  11


0.1691878925304155 0.14840158820152283
Epoch:  12


0.16665148694772977 0.14791730897767202
Epoch:  13


0.16616988906989227 0.1453898942896298
Epoch:  14


0.16482246324822708 0.14240839332342148
Epoch:  15


0.16445216657342138 0.14169700337307795
Epoch:  16


0.16191294144939733 0.14229160866567067
Epoch:  17


0.16373467767560804 0.15552519687584468
Epoch:  18


0.16156411775060603 0.1431957706809044
Epoch:  19


0.1614591817598085 0.13618447099413192
Epoch:  20


0.16133742638536402 0.13482354155608586
Epoch:  21


0.16051957333410108 0.14452699891158513
Epoch:  22


0.15899797589392275 0.13855460286140442
Epoch:  23


0.15882968056846308 0.13912309919084823
Epoch:  24


0.16081915554162618 0.13945661591632025
Epoch:  25


0.15884777581369555 0.13397858611174993
Epoch:  26


0.1582360356240659 0.1338944552200181
Epoch:  27


0.15890443647229993 0.13405227767569677
Epoch:  28


0.15726148357262482 0.13491994994027273
Epoch:  29


0.15846026708950867 0.133539973625115
Epoch:  30


0.1568909569366558 0.13437716662883759
Epoch:  31


0.1578788008238818 0.13797335220234735
Epoch:  32


0.1581224090344197 0.133870324918202
Epoch:  33


0.15697721895333883 0.1319593893630164
Epoch:  34


0.1566070225593206 0.14081070678574698
Epoch:  35


0.15682951542171272 0.13795207121542521
Epoch:  36


0.15607100322439865 0.1332480779715947
Epoch:  37


0.15447277555594574 0.1338635374392782
Epoch:  38


0.155643614160048 0.14060976249831064
Epoch:  39


0.15515704090530807 0.1342684988464628
Epoch    39: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  40


0.1532492762481844 0.12839189916849136
Epoch:  41


0.15368114331284086 0.12804108645234788
Epoch:  42


0.1537936619810156 0.1279735043644905
Epoch:  43


0.15275507239071098 0.12761649276529038
Epoch:  44


0.1524254309164511 0.12772519034998758
Epoch:  45


0.1519045894210403 0.12728846392461232
Epoch:  46


0.15413515591943586 0.12714898266962596
Epoch:  47


0.15275646021237244 0.1269096146736826
Epoch:  48


0.15269401266768173 0.12715701971735274
Epoch:  49


0.1530488717394906 0.12694587877818517
Epoch:  50


0.15247260759005676 0.12751875711338861
Epoch:  51


0.15057519360168561 0.12679938226938248
Epoch:  52


0.15275537162213712 0.1276485781584467
Epoch:  53


0.1515356934553868 0.12671335467270442
Epoch:  54


0.15084049186191043 0.1265467990721975
Epoch:  55


0.15163883446036158 0.12608340169702256
Epoch:  56


0.15109641205620122 0.1265276319214276
Epoch:  57


0.15190429824429588 0.12696152925491333
Epoch:  58


0.15108629255681424 0.12635413770164763
Epoch:  59


0.15201875728529854 0.126999098275389
Epoch:  60


0.15177891101386096 0.12598193011113576
Epoch:  61


0.15148121642099843 0.12637151458433696
Epoch:  62


0.15199849009513855 0.12634611981255667
Epoch:  63


0.1505967374588992 0.12630522038255418
Epoch:  64


0.15127031263467428 0.12655315548181534
Epoch:  65


0.1506772858870996 0.12614005591188157
Epoch:  66


0.15010692702757344 0.12677003656114852
Epoch    66: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  67


0.15066554618848338 0.1260727196931839
Epoch:  68


0.14945935518354983 0.12615086989743368
Epoch:  69


0.15145592391490936 0.12599780623401916
Epoch:  70


0.1512575205918905 0.126115620136261
Epoch:  71


0.15102560093273987 0.12613508531025477
Epoch:  72


0.14989448197790095 0.12578943903957093
Epoch:  73


0.15014877915382385 0.12610266357660294
Epoch:  74


0.15083529780039917 0.12579891404935292
Epoch:  75


0.151457998398188 0.12574969977140427
Epoch:  76


0.15041847889487808 0.1257934101990291
Epoch:  77


0.15161282549033295 0.125681534409523
Epoch:  78


0.15030578584284396 0.1257589097533907
Epoch:  79


0.14927825090047475 0.1257915741630963
Epoch:  80


0.14979677103661201 0.1257692841546876
Epoch:  81


0.15022470741658597 0.12568924469607218
Epoch:  82


0.1501361570648245 0.12581774592399597
Epoch:  83


0.15023625984385208 0.1260246740920203
Epoch    83: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  84


0.1508361354872987 0.1258196798818452
Epoch:  85


0.15019425508138295 0.1257961562701634
Epoch:  86


0.15137112704483238 0.12552259223801748
Epoch:  87


0.15199312849624738 0.12556242304188864
Epoch:  88


0.150021591299289 0.1260209626385144
Epoch:  89


0.1494925324175809 0.12554845533200673
Epoch:  90


0.1504043968948158 0.12581992149353027
Epoch:  91


0.15050276832000628 0.12596705981663295
Epoch:  92


0.15054334982021436 0.12545958587101527
Epoch:  93


0.1498316051186742 0.12543963215180806
Epoch:  94


0.1515339242445456 0.12572367382901056
Epoch:  95


0.14959256189900474 0.12555976850645884
Epoch:  96


0.15017330606241483 0.12568575463124684
Epoch:  97


0.15045360375095057 0.12568922340869904
Epoch:  98


0.1506335469516548 0.12580612621137074
Epoch:  99


0.1504133118165506 0.12554635213954107
Epoch    99: reducing learning rate of group 0 to 1.0000e-07.
