In [1]:
# Parameters
# Position of the last `mobilenet_v2.features` child that keeps its pretrained
# weights: Task5Model re-initialises every child whose index is > until_x.
# The value is also embedded in the checkpoint filename written during training.
until_x = 2


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, per-annotator targets and loss masks from a label table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column; every remaining column is treated as a
        label column. Each file needs exactly the same number of rows for
        every annotation slot 1..``n_annotators`` (otherwise the per-slot
        arrays cannot be concatenated).
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of "known"
        label columns it makes ambiguous. The unknown columns are removed
        from the targets; wherever an unknown column is > 0.5 the listed
        known columns are masked out for that annotation.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotation slots (1..n_annotators) stacked on the last axis.

    Returns
    -------
    X : numpy.ndarray
        Stacked, transposed and cropped arrays with a singleton axis inserted
        at position 1.
    y : numpy.ndarray
        Label values, shape ``(n_files, n_label_columns, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label counts towards the loss and 0
        where it was masked because of an unknown label.
    """
    df = df.reset_index()
    # Number the annotations of each file 1, 2, 3, ... in order of appearance.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start with every label unmasked, then zero the known labels that an
    # active unknown label makes unreliable.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation slot first so that `.loc[[k]]` selects slot k for all
    # files, sorted by filename (which keeps the slots row-aligned).
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotators + 1)
    y = np.concatenate(
        [df.loc[[k], :].values[..., np.newaxis] for k in slots], axis=2)
    y_mask = np.concatenate(
        [y_mask.loc[[k], :].values[..., np.newaxis] for k in slots], axis=2)

    # Load the features in the same file order as the rows of y / y_mask.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :],
            axis=0)
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (built with its default arguments) that
# AudioDataset.__getitem__ applies after the albumentations transforms.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples with optional augmentation.

    Parameters
    ----------
    X : indexable with ``X[idx, ...]`` and a ``shape`` attribute
        Input samples; the first axis enumerates the items. When ``transform``
        is set, each ``X[idx]`` must be a torch tensor with three axes
        (it is cycled with ``torch.cat`` along axis 1).
    y, weights : indexable with ``[idx, ...]``
        Targets and loss weights returned unchanged for the same index.
    transform : callable or None
        Called as ``transform(image=array)`` and expected to return a mapping
        with an ``'image'`` tensor. ``None`` (or any falsy value) disables all
        augmentation and returns the stored sample as is.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converter used to hand samples to the image-based transforms.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of items, i.e. the length of the first axis of ``X``."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample if enabled."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would otherwise produce 0 / 0 = NaN below;
                # a unit range keeps it finite and makes the inverse transform
                # at the end a pure shift back by `this_min`.
                value_range = value_range + 1
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            # NOTE(review): ToPILImage presumably quantises the [0, 1] floats
            # to an 8-bit image here — confirm against the torchvision docs.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code; only open a metadata
# file that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# Pair every fine_id == 'X' label with the other fine labels that share its
# coarse class; after the join `fine_x` is the 'X' label, `fine_y` the others.
label_pairs = (
    pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = {
    unknown: list(known)
    for unknown, known in label_pairs.groupby('fine_x')['fine_y']}
known_labels = known_rows['fine'].tolist()

# Coarse and fine label tables side by side, one frame per split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Every row whose columns sum to exactly 37 is zeroed, in both splits.
# NOTE(review): 37 presumably equals the number of label columns, i.e. a row
# with every label set — confirm.
for split_df in (train_df, valid_df):
    split_df.loc[split_df.sum(axis=1) == 37, :] = 0

In [6]:
# Build inputs, per-annotator targets and loss masks for each split.
# X carries a singleton axis at position 1; y and y_mask stack the
# annotation slots of every file on their last axis.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are taken over the last axis (128 entries) of the training split
# only and then reused, unchanged, for the validation split.
channel_means = np.mean(
    train_X.reshape(-1, 128), axis=0)[np.newaxis, np.newaxis, np.newaxis, :]
channel_stds = np.std(
    train_X.reshape(-1, 128), axis=0)[np.newaxis, np.newaxis, np.newaxis, :]

train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# AudioDataset.__getitem__ calls this pipeline as transform(image=array) and
# reads the 'image' entry of the result, on which it then calls tensor methods
# (hence ToTensor as the last step).
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served as stored.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the same training set: the training loop
# zips them to obtain the pairs of batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when PyTorch can see one and fall back to the CPU
# otherwise, so the notebook does not fail at `.to(device)` on a CPU-only host.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` through its own ``reset_parameters`` method.

    Objects that do not expose a ``reset_parameters`` attribute are left
    untouched. Always returns ``None``; intended to be passed to
    ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Single-channel classifier built on the MobileNetV2 feature extractor.

    A small 1x1-convolution stem maps the one-channel input to three channels,
    the result goes through ``mobilenet_v2.features``, is max-pooled over its
    last two axes and finally classified by a two-layer head.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_after : int or None, optional
        Children of ``mobilenet_v2.features`` at positions greater than this
        value are re-initialised with ``weight_reset``; earlier ones keep
        their pretrained weights. ``None`` (the default) falls back to the
        notebook-level ``until_x`` parameter, which is the original behaviour.
    """

    def __init__(self, num_classes, reset_after=None):

        super().__init__()

        if reset_after is None:
            # Backward-compatible default: read the notebook parameter.
            reset_after = until_x

        # Learnable 1 -> 3 channel mapping in front of the RGB backbone.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Head applied to the pooled backbone output (1280 features).
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset after ith layer of mv2
        for i, block in enumerate(self.mv2.features.children()):
            if i > reset_after:
                block.apply(weight_reset)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for a one-channel 4-D input."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis, then the new last axis.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [13]:
# Instantiate the model
# 31 is passed as num_classes, i.e. the width of the final linear layer.
# NOTE(review): presumably the number of label columns left in train_y — confirm.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped once per epoch with the validation loss in the training loop.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that they can be
# multiplied by the label masks before being summed.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def masked_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over all unmasked (label, annotator) entries.

    ``criterion`` must return element-wise losses. ``labels`` and ``masks``
    carry the annotation slots on their last axis; the same ``outputs`` are
    scored against every slot, each slot's losses are weighted by its mask,
    and the grand total is divided by the sum of the masks.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


epochs = 100
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass -------------------------------------------------
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One Beta(alpha, alpha) coefficient per item blends the batch from
        # loader 1 with the independently shuffled batch from loader 2.
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_loss(criterion, outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no gradients, no optimizer interaction) -------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Average of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so that the final epoch's losses
    # are not silently dropped from the log.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping after 25 epochs without a new best validation loss.
    if epochs_without_new_lowest >= 25:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6065604735065151 0.4620026137147631
Epoch:  1


0.2803339104394655 0.21036741776125772
Epoch:  2


0.1868954216306274 0.20472321552889688
Epoch:  3


0.17922431190271634 0.1654427264417921
Epoch:  4


0.1763075058524673 0.16464561223983765
Epoch:  5


0.17457135504967458 0.16885365332875932
Epoch:  6


0.17118678745385763 0.1580591286931719
Epoch:  7


0.17122385550189662 0.1588946602174214
Epoch:  8


0.16767497562073372 0.15218588071210043
Epoch:  9


0.1663869796572505 0.1551260266985212
Epoch:  10


0.16417442584359968 0.14436968735286168
Epoch:  11


0.16453795215568026 0.15089145728519984
Epoch:  12


0.16375822312123067 0.13859117669718607
Epoch:  13


0.16214300007433505 0.14997366070747375
Epoch:  14


0.16082374308560346 0.13591812338147843
Epoch:  15


0.16065436843279246 0.13967986617769515
Epoch:  16


0.16015018804653272 0.1389862224459648
Epoch:  17


0.16086520657346054 0.13263082930019923
Epoch:  18


0.1591983969147141 0.13663118864808763
Epoch:  19


0.15746602697952375 0.13882640217031753
Epoch:  20


0.1576038252662968 0.1332847358925002
Epoch:  21


0.15630689545257673 0.1345734362091337
Epoch:  22


0.15639026543578585 0.13541863858699799
Epoch:  23


0.15630320640834602 0.1312247035758836
Epoch:  24


0.15567829037034833 0.129341222345829
Epoch:  25


0.1550186542240349 0.13563343031065805
Epoch:  26


0.15598436102673813 0.13332225808075496
Epoch:  27


0.153979890652605 0.13470231209482467
Epoch:  28


0.1530515072313515 0.1314225122332573
Epoch:  29


0.15493706271455093 0.1325828241450446
Epoch:  30


0.15336965950759682 0.13095837512186595
Epoch    30: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  31


0.1533867591941679 0.12696646686111177
Epoch:  32


0.15084342940433607 0.1265421062707901
Epoch:  33


0.15325532330049052 0.12681113289935247
Epoch:  34


0.150669752746015 0.12660245490925653
Epoch:  35


0.15181462748630628 0.12583936431578227
Epoch:  36


0.1506732641845136 0.12610542242016112
Epoch:  37


0.15087900290618073 0.12609736089195525
Epoch:  38


0.1512998402924151 0.12637284291642054
Epoch:  39


0.14990730543394346 0.12717112685952867
Epoch:  40


0.1513052816326554 0.12622105010918208
Epoch:  41


0.14965376137076197 0.12592740250485285
Epoch    41: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  42


0.15036082670495315 0.12587271630764008
Epoch:  43


0.15070142335182912 0.125648238829204
Epoch:  44


0.15070756343570915 0.12568987799542292
Epoch:  45


0.1508217131769335 0.12602743080684117
Epoch:  46


0.1510361569958764 0.12585808443171637
Epoch:  47


0.1504743610684936 0.12587827550513403
Epoch:  48


0.1504447560858082 0.12577553412743978
Epoch:  49


0.15081950377773595 0.1259143618600709
Epoch    49: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  50


0.14939219202544238 0.12599281860249384
Epoch:  51


0.14947971258614515 0.12572166962283
Epoch:  52


0.15073513622219498 0.12599591272217886
Epoch:  53


0.15015446455092044 0.12569529988936015
Epoch:  54


0.15013886021601186 0.1259220838546753
Epoch:  55


0.15041844103787397 0.12622222942965372
Epoch    55: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  56


0.15057072285059336 0.12582493360553468
Epoch:  57


0.15256111082193013 0.12597780781132834
Epoch:  58


0.15054012069831024 0.12555089167186193
Epoch:  59


0.14999997535267393 0.12581059975283487
Epoch:  60


0.15100362655278798 0.12570561575038092
Epoch:  61


0.14899957784124324 0.12597149184771947
Epoch:  62


0.150105245210029 0.1260198269571577
Epoch:  63


0.15121105877128807 0.12579609985862458
Epoch:  64


0.1503752337114231 0.1258002764412335
Epoch    64: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  65


0.1502614126012132 0.12568709892886026
Epoch:  66


0.14931618402133118 0.12593375678573335
Epoch:  67


0.15046409215476061 0.1261505431362561
Epoch:  68


0.150402522167644 0.12583924404212407
Epoch:  69


0.15107971268731193 0.12571232127291815
Epoch:  70


0.15106316676010956 0.12545924101557052
Epoch:  71


0.150538458614736 0.12553773820400238
Epoch:  72


0.1496798275290309 0.1259588184101241
Epoch:  73


0.1504405499310107 0.1255323099238532
Epoch:  74


0.15047694501039144 0.12571932056120463
Epoch:  75


0.14992763182601412 0.12584153775657927
Epoch:  76


0.15051326477849805 0.12556689764772141
Epoch:  77


0.15077108909954895 0.12580018596989767
Epoch:  78


0.15068995026317802 0.12590851528303965
Epoch:  79


0.15131342894322164 0.12573047514472688
Epoch:  80


0.14983014681854764 0.1257188000849315
Epoch:  81


0.15056130330304843 0.12588204124144145
Epoch:  82


0.15029134661764712 0.12588085659912654
Epoch:  83


0.1510701932617136 0.1257677408201354
Epoch:  84


0.15043878998305346 0.1259242370724678
Epoch:  85


0.14954653865582235 0.1259221551673753
Epoch:  86


0.1513296426148028 0.12585211119481496
Epoch:  87


0.15126431149405403 0.12587285893304007
Epoch:  88


0.14875398576259613 0.12569023881639754
Epoch:  89


0.15028225690931887 0.1256077779190881
Epoch:  90


0.14987528847681508 0.12587669491767883
Epoch:  91


0.14910054428351893 0.1255710827452796
Epoch:  92


0.1499939170238134 0.12592107376882009
Epoch:  93


0.15035670592978195 0.12574209911482676
Epoch:  94


0.15124783644805084 0.12575538137129375
Epoch:  95
