In [1]:
# Parameters
# Position threshold inside the MobileNetV2 feature stack: Task5Model re-initialises
# every child of `mv2.features` whose position is greater than this value. The value
# is also embedded in the checkpoint filename written by the training loop.
until_x = 4


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation, indexed by ``audio_filename`` (the index is
        reset and grouped under that name). Columns are label columns; the
        keys of ``unknown_to_known`` must be among them.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be masked out whenever the unknown column exceeds 0.5.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed spectrogram.
    n_annotations : int, optional
        Number of annotation slots (1..n_annotations) stacked on the last
        axis of ``y`` and ``y_mask``. Extra annotations per file are ignored.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target counts, 0 where it is masked.

    Raises
    ------
    ValueError
        If any file has fewer than ``n_annotations`` annotation rows.
    """
    unknown_cols = list(unknown_to_known.keys())

    df = df.reset_index()

    # Every slot 1..n_annotations must exist for every file, otherwise the
    # per-slot arrays stacked below would not line up.
    rows_per_file = df.groupby('audio_filename').size()
    if (rows_per_file < n_annotations).any():
        raise ValueError(
            'every audio file needs at least {} annotation rows'.format(n_annotations))

    # Number the annotations of each file 1, 2, 3, ... in order of appearance.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split off the "unknown" columns; they only drive the mask.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start with everything counted, then zero the known labels wherever the
    # associated unknown label was annotated.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Sort by (slno, audio_filename) so each slot lists the files in the same order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    # Load the spectrograms in the same file order as the targets.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = X[:, np.newaxis, :, :]

    return X, y, y_mask


# Shared RandomErasing instance (default arguments); AudioDataset.__getitem__
# applies it to every sample that goes through the augmentation branch.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with per-sample targets and loss weights.

    Items are ``(sample, y[idx], weights[idx])``. When ``transform`` is set,
    the sample is augmented on the fly: min-max scaled, cyclically shifted
    along dim 1, passed through ``transform`` (called as
    ``transform(image=...)`` and read back from the ``'image'`` key), passed
    through the module-level ``random_erasing`` callable, and finally mapped
    back to its original value range. Without a transform the stored sample is
    returned unchanged.

    Parameters
    ----------
    X : torch.Tensor
        Samples, indexed on dim 0; each sample is sliced as a rank-3 tensor.
    y : torch.Tensor
        Targets, indexed on dim 0.
    weights : torch.Tensor
        Per-target weights / masks, indexed on dim 0.
    transform : callable, optional
        Augmentation pipeline; ``None`` disables all augmentation.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # The augmentation pipeline is fed a numpy image obtained via PIL.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of dim 0 of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weights)`` for index ``idx``."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would fill the
            # sample with NaNs, so scale by 1 in that case instead.
            if value_range > 0:
                safe_range = value_range
            else:
                safe_range = torch.ones_like(value_range)
            sample = (sample - this_min) / safe_range

            # randomly cycle the file: rotate dim 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the two trailing axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation (true range, so a constant sample
            # maps back to its constant value)
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# Split the taxonomy into the "unknown" fine labels (fine_id == 'X') and the rest.
taxonomy_df = metadata['taxonomy_df']
is_unknown_fine = taxonomy_df.fine_id == 'X'
unknown_fine = taxonomy_df.loc[is_unknown_fine, ['fine', 'coarse']]
known_fine = taxonomy_df.loc[~is_unknown_fine, ['fine', 'coarse']]

# Pair every unknown fine label with the known fine labels sharing its coarse
# class; after the merge the unknown label is `fine_x` and the known one `fine_y`.
fine_pairs = pd.merge(unknown_fine, known_fine, on='coarse', how='inner')
fine_pairs = fine_pairs.drop(columns='coarse')
unknown_to_known = fine_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_fine.fine.tolist()

# Coarse and fine annotations side by side, one column per label.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [5]:
# manual correction for one data point
# Every row whose label values add up to 37 is zeroed out, in place, in both splits.
# NOTE(review): 37 presumably equals the number of label columns (i.e. every
# class marked positive) — confirm.
for labels_df in (train_df, valid_df):
    rows_to_clear = labels_df.sum(axis=1) == 37
    labels_df.loc[rows_to_clear, :] = 0

In [6]:
# Build the inputs (X), per-annotation targets (y) and loss masks (y_mask)
# for the training and validation splits.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed per position of the last axis, on the training split
# only, and then applied to both splits. The bin count is read from the data
# instead of being hard-coded.
# NOTE: this cell overwrites train_X / valid_X, so re-running it without
# re-running the cell above would normalise the data twice.
n_channels = train_X.shape[-1]
train_X_flat = train_X.reshape(-1, n_channels)
channel_means = train_X_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# The pipeline is handed to the training AudioDataset, which calls it as
# `transform(image=...)` and reads the augmented sample from the 'image' key.
albumentations_transform = Compose([
    # Random shift / scale / rotation with small limits.
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # Grid distortion with the library's default settings.
    GridDistortion(),
    # NOTE(review): presumably converts the numpy image to a torch tensor
    # (AudioDataset calls .permute on the result) — confirm any rescaling it applies.
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
BATCH_SIZE = 64

# Only the training set is augmented; validation samples are served as stored.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook still runs end to end on a machine without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it exposes ``reset_parameters``.

    Objects without that attribute are left untouched, which makes the
    function safe to pass to ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Single-channel classifier built on a partially re-initialised MobileNetV2.

    The input goes through ``bw2col`` (batch norm plus two 1x1 convolutions
    mapping 1 channel to 3), then through the MobileNetV2 feature extractor,
    is max-pooled over the two trailing dimensions and classified by ``final``.
    Only ``mv2.features`` is used in ``forward``; the rest of ``mv2`` is not.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_after : int, optional
        Children of ``mv2.features`` at positions greater than this value have
        their weights re-initialised via ``weight_reset``; earlier ones keep
        the pretrained weights. Defaults to the notebook parameter ``until_x``.
    """

    def __init__(self, num_classes, reset_after=None):

        super().__init__()

        # Fall back to the notebook-level parameter so existing calls
        # (`Task5Model(n)`) behave exactly as before.
        if reset_after is None:
            reset_after = until_x

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset after the `reset_after`-th layer of mv2
        for position, block in enumerate(self.mv2.features.children()):
            if position > reset_after:
                block.apply(weight_reset)

    def forward(self, x):
        """Return the ``(batch, num_classes)`` logits for input ``x``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last dimension twice.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [13]:
# Instantiate the model
# 31 is num_classes, i.e. the width of the model's last linear layer.
# NOTE(review): presumably the number of label columns that remain after
# prepare_data drops the "unknown" ones — confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad option over every model parameter.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The training loop steps this scheduler once per epoch with the validation loss.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the per-element losses so the training loop can
# multiply them by the masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1  # both parameters of the Beta distribution used for mixup
checkpoint_path = './model_system1_until_{}'.format(until_x)

train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked loss averaged over every annotation set.

    ``labels`` and ``masks`` carry one slice per annotation set on their last
    axis. The same ``outputs`` are scored against each slice with the
    notebook's ``criterion`` (assumed to return unreduced, per-element
    losses), weighted by the matching mask slice, and the grand total is
    divided by the sum of the masks.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup: blend the two batches with one Beta-distributed weight per sample
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0])
        lam = torch.Tensor(mixup_vals)

        lam_inputs = lam.view(-1, 1, 1, 1)
        inputs = (lam_inputs * batch_a[0]) + ((1 - lam_inputs) * batch_b[0])

        lam_targets = lam.view(-1, 1, 1)
        labels = (lam_targets * batch_a[1]) + ((1 - lam_targets) * batch_b[1])
        masks = (lam_targets * batch_a[2]) + ((1 - lam_targets) * batch_b[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no gradients, no parameter updates) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the weights of the best epoch so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), checkpoint_path)
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6064241093558234 0.4498261639050075
Epoch:  1


0.2755916348180255 0.18060128390789032
Epoch:  2


0.1822117567062378 0.20071142060416086
Epoch:  3


0.17494908418204333 0.17326130611555918
Epoch:  4


0.1704105187106777 0.15719820771898543
Epoch:  5


0.16881059633718953 0.15119841269084386
Epoch:  6


0.16815641200220263 0.16665057837963104
Epoch:  7


0.16484387984146942 0.16214843094348907
Epoch:  8


0.16497918037143913 0.15557133087090083
Epoch:  9


0.1608624901320483 0.14564621235643113
Epoch:  10


0.1610048014569927 0.1364510634115764
Epoch:  11


0.1607063259627368 0.13380730471440724
Epoch:  12


0.15960766536158486 0.14983013698032924
Epoch:  13


0.15913351605067383 0.13487510489565985
Epoch:  14


0.15827651805168874 0.134804972580501
Epoch:  15


0.15888991992215853 0.1366033245410238
Epoch:  16


0.15718995638795802 0.13463036290236882
Epoch:  17


0.1558318540856645 0.13177909595625742
Epoch:  18


0.15640497207641602 0.1311456294996398
Epoch:  19


0.15576317423098796 0.14109917836529867
Epoch:  20


0.15615538809750532 0.1316317202789443
Epoch:  21


0.15457786458569603 0.13000855914184026
Epoch:  22


0.15405536382584958 0.13579128895487105
Epoch:  23


0.15462149075559667 0.13306941092014313
Epoch:  24


0.15445949862132202 0.13095791105713164
Epoch:  25


0.15494518505560384 0.13472335253443038
Epoch:  26


0.15288945951977292 0.13296001617397582
Epoch:  27


0.15280794936257439 0.13069297266857965
Epoch    27: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  28


0.15195299926641825 0.12617263410772597
Epoch:  29


0.1498444177008964 0.1267678982445172
Epoch:  30


0.1511898056880848 0.1261310577392578
Epoch:  31


0.1498082650674356 0.1257877296635083
Epoch:  32


0.15088391304016113 0.1268469180379595
Epoch:  33


0.14947513994332906 0.12565020791121892
Epoch:  34


0.15032666478608106 0.12612559007746832
Epoch:  35


0.14959221111761556 0.12613765788929804
Epoch:  36


0.15024064481258392 0.12506166419812612
Epoch:  37


0.1491115544293378 0.12578498039926803
Epoch:  38


0.15060987263112455 0.12495277396270207
Epoch:  39


0.1503639760855082 0.12609398258583887
Epoch:  40


0.14913362425726814 0.124894203884261
Epoch:  41


0.14906175555409612 0.12525712698698044
Epoch:  42


0.14985547073789546 0.12560831010341644
Epoch:  43


0.14957041474612984 0.12506210803985596
Epoch:  44


0.14839414849474625 0.12477425911596843
Epoch:  45


0.1488860636144071 0.12477683488811765
Epoch:  46


0.14983953736923836 0.12564163442168916
Epoch:  47


0.14895389410289558 0.12596861592360906
Epoch:  48


0.14843906542739352 0.12546460224049433
Epoch:  49


0.14961214927402702 0.12548739569527761
Epoch:  50


0.1497871569685034 0.1253118578876768
Epoch    50: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  51


0.1490439679977056 0.12520597342933928
Epoch:  52


0.1475970543719627 0.12532312955175126
Epoch:  53


0.1488217928925076 0.12491048978907722
Epoch:  54


0.14851443993078695 0.12476691497223717
Epoch:  55


0.14951316408208898 0.12593911588191986
Epoch:  56


0.14778030482498375 0.1248569084065301
Epoch    56: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  57


0.14838456865903493 0.12542629880564554
Epoch:  58


0.1495435459388269 0.12580951090369905
Epoch:  59


0.14789391611073469 0.12501399644783565
Epoch:  60


0.1491115657058922 0.12505427215780532
Epoch:  61


0.14981663267354708 0.12491568390812192
Epoch:  62


0.14848449665146904 0.1253200535263334
Epoch    62: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  63


0.1481757598954278 0.12521351341690337
Epoch:  64


0.14950870581575343 0.12486421742609569
Epoch:  65


0.14782900947171287 0.12510422723633902
Epoch:  66


0.148870489484555 0.12511827903134481
Epoch:  67


0.14846349849894241 0.12470490485429764
Epoch:  68


0.14821277961537643 0.12539978538240706
Epoch:  69


0.1474432381423744 0.12485862097569875
Epoch:  70


0.14832507275246284 0.12465177582842964
Epoch:  71


0.14859150793101336 0.12531172377722605
Epoch:  72


0.1476958696101163 0.12444927330527987
Epoch:  73


0.14892910944448934 0.1253848182303565
Epoch:  74


0.14828632933062477 0.12502515954630716
Epoch:  75


0.1484247325239955 0.12496335804462433
Epoch:  76


0.14939891003273628 0.12460937244551522
Epoch:  77


0.14889305628634789 0.12491275582994733
Epoch:  78


0.14847188140894915 0.12546455327953612
Epoch    78: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  79


0.14836189634091146 0.12520850130489894
Epoch:  80


0.148745811066112 0.12471361138990947
Epoch:  81


0.1490006297826767 0.12498550010578972
Epoch:  82


0.14778707357677254 0.1250627189874649
Epoch:  83


0.1481060015188681 0.12490805770669665
Epoch:  84


0.14831936923233238 0.12521275665078843
Epoch:  85


0.1483124615372838 0.12539815370525634
Epoch:  86


0.14909370968470703 0.12495744547673635
Epoch:  87


0.1478979837250065 0.12477401750428337
Epoch:  88


0.1477034768542728 0.12485429644584656
Epoch:  89


0.14840219028898188 0.12459751431431089
Epoch:  90


0.1485489784060298 0.1250940039753914
Epoch:  91


0.14740207549687978 0.12548673578671046
Epoch:  92


0.1488326067054594 0.12518317252397537
Epoch:  93


0.14802021150653427 0.12514833893094743
Epoch:  94


0.14819983413090576 0.12533408190522874
Epoch:  95


0.14784228278172984 0.12481487223080226
Epoch:  96


0.14842907319197784 0.1252098615680422
Epoch:  97
