In [1]:
# Parameters
# `until_x` is read by Task5Model: children of the MobileNetV2 `features`
# stack whose position is greater than this value get their parameters reset.
# It is also embedded in the checkpoint file name written by the training loop.
until_x = 3


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known):
    """Build spectrogram inputs, label targets and loss masks from a label table.

    Args:
        df: DataFrame whose index becomes an ``audio_filename`` column after
            ``reset_index()``. It holds one row per annotation of a recording
            and one column per label, including the columns named by the keys
            of ``unknown_to_known``.
            Assumes every recording has rows for annotation slots 1, 2 and 3
            -- TODO confirm against the metadata.
        unknown_to_known: dict mapping an "unknown" indicator column to the
            list of label columns that must be masked out when that indicator
            is greater than 0.5.

    Returns:
        Tuple ``(X, y, y_mask)``:
            X: array of shape (n_files, 1, frames, features), the transposed
               spectrogram of every file truncated to at most 635 frames.
            y: array of shape (n_files, n_labels, 3); last axis is the
               annotation slot.
            y_mask: same shape as ``y``; 1 where the label counts towards the
               loss, 0 where it is hidden by an "unknown" indicator.
    """
    unknown_columns = list(unknown_to_known.keys())

    # Number the rows of each recording 1, 2, 3, ... in their current order
    # and index the table by (audio_filename, slno).
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns away from the label columns.
    df_unknown = df.copy().loc[:, unknown_columns]
    df = df.drop(columns=unknown_columns)

    # Start with every label visible, then hide the labels covered by an
    # "unknown" indicator that is set on the same row.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        flagged_rows = df_unknown[unknown] > 0.5
        y_mask.loc[flagged_rows, known] = 0

    def by_slot(frame):
        # Put the annotation slot first in the index and sort, so that
        # selecting one slot yields the files in a consistent order.
        return frame.swaplevel(i=1, j=0, axis=0).sort_index()

    def stack_slots(frame):
        # (rows, labels) per slot -> (rows, labels, 3)
        per_slot = [
            frame.loc[[slot], :].values[..., np.newaxis]
            for slot in (1, 2, 3)
        ]
        return np.concatenate(per_slot, axis=2)

    df = by_slot(df)
    y = stack_slots(df)
    y_mask = stack_slots(by_slot(y_mask))

    # Load one spectrogram per file, in the same order as the rows of slot 1.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    spectrograms = [
        np.load('../../data/logmelspec/{}.npy'.format(name)).T[:635, :][np.newaxis, ...]
        for name in filenames
    ]
    X = np.concatenate(spectrograms)[:, np.newaxis, ...]

    return X, y, y_mask


# Single shared RandomErasing instance (project-local module, constructed with
# its default arguments). AudioDataset.__getitem__ applies it to each augmented
# training sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrograms with per-annotation targets and loss weights.

    Each item is ``(sample, y[idx], weights[idx])``. When ``transform`` is
    given, the sample is augmented on the fly: min-max scaled to [0, 1],
    cyclically shifted along axis 1 by a random offset, passed through
    ``transform``, passed through random erasing, and finally scaled back to
    its original value range. Without a transform the stored sample is
    returned unchanged.

    Args:
        X: tensor indexed as ``X[idx, ...]``; ``X.shape[0]`` is the dataset size.
        y: targets, indexed as ``y[idx, ...]``.
        weights: loss weights/masks, indexed as ``weights[idx, ...]``.
        transform: optional callable invoked as ``transform(image=array)`` that
            returns a mapping with an ``'image'`` entry (albumentations style).
        erasing: optional callable applied to the augmented tensor. When
            ``None`` the module-level ``random_erasing`` instance is used, which
            is the original behaviour.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has a zero range; dividing by it would turn the
            # whole sample into NaN, so divide by 1 instead (the sample then
            # becomes all zeros and is restored to `this_min` below).
            divisor = value_range if value_range > 0 else 1
            sample = (sample - this_min) / divisor

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on NumPy images)
            # NOTE(review): ToPILImage on a float tensor presumably quantises
            # the values to 8 bits -- confirm this loss of precision is intended.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)['image']
            # add the channel axis back and swap the last two axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            erase = self.erasing if self.erasing is not None else random_erasing
            sample = erase(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load executes arbitrary code from the file; only use a
# metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map every fine label whose fine_id is 'X' to the list of fine labels with a
# different fine_id that share its coarse category (inner join on 'coarse';
# the merge suffixes give 'fine_x' for the 'X' side and 'fine_y' for the other).
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# Fine labels whose fine_id is not 'X'.
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Place the coarse and fine label tables side by side (column-wise concat).
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Every row whose label columns sum to exactly 37 has all of its labels set to 0.
# NOTE(review): 37 is presumably the row sum of the single faulty annotation
# (every label set) -- confirm against the data before changing the label set.
train_df.loc[(train_df.sum(axis=1) == 37).copy(), :] = 0
valid_df.loc[(valid_df.sum(axis=1) == 37).copy(), :] = 0

In [6]:
# Build the spectrogram inputs, the per-annotation targets and the loss masks
# for both splits (this reads the spectrogram .npy files from disk).
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed per position of the last axis over all training
# values, then reshaped to (1, 1, 1, -1) so they broadcast over that axis.
# Assumes the last axis of train_X has 128 entries (one per spectrogram
# feature bin) -- reshape(-1, 128) relies on it.
channel_means = train_X.reshape(-1, 128).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, 128).std(axis=0).reshape(1, 1, 1, -1)
# The validation set is normalised with the training statistics.
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied by AudioDataset to training samples only: a small random
# shift/scale/rotation, a grid distortion, then conversion back to a tensor.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# The training dataset augments samples on the fly; the validation dataset
# gets no transform and therefore returns the stored samples unchanged.
train_dataset = AudioDataset(torch.Tensor(train_X),
                             torch.Tensor(train_y),
                             torch.Tensor(train_y_mask),
                             albumentations_transform)
valid_dataset = AudioDataset(torch.Tensor(valid_X),
                             torch.Tensor(valid_y),
                             torch.Tensor(valid_y_mask),
                             None)

# Batch size is 96 everywhere. Two independently shuffled loaders over the same
# training set are zipped together in the training loop to provide the pairs
# of batches that are blended by mixup.
val_loader = DataLoader(valid_dataset, 96, shuffle=False)
train_loader_1 = DataLoader(train_dataset, 96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, 96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU only when CUDA is actually available, so that the notebook
# still runs from top to bottom on a CPU-only machine instead of failing when
# the model is moved to the device.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` if it offers a ``reset_parameters`` method.

    Objects without that attribute are left untouched. Intended to be passed
    to ``Module.apply`` so that it visits every submodule.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel spectrogram input.

    The network consists of three stages:
      * ``bw2col``: batch norm plus two 1x1 convolutions that map the single
        input channel to the three channels MobileNetV2 expects;
      * ``mv2``: a pretrained torchvision MobileNetV2 of which only the
        ``features`` stack is used in ``forward``;
      * ``final``: a two-layer fully connected head producing ``num_classes``
        logits.

    Children of ``mv2.features`` whose position is greater than the notebook
    level ``until_x`` are re-initialised, so only the earlier ones keep their
    pretrained weights.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 channel -> 10 channels -> 3 channels with pointwise convolutions.
        channel_adapter = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*channel_adapter)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classification head on top of the 1280 pooled feature channels.
        head = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head)

        # Reset after ith layer of mv2: everything at or before `until_x`
        # keeps its pretrained parameters.
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        colour = self.bw2col(x)
        feature_maps = self.mv2.features(colour)
        # Global max pooling: maximum over the last axis, then over the
        # (new) last axis, leaving one value per sample and channel.
        pooled = feature_maps.max(dim=-1)[0].max(dim=-1)[0]
        return self.final(pooled)

In [13]:
# Instantiate the model
# The head produces 31 logits per sample.
# NOTE(review): 31 presumably equals the number of label columns left after
# prepare_data drops the "unknown" columns -- confirm.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch and lowers
# the learning rate when that loss stops improving for `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them by the label masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
# Training configuration
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss before stopping
alpha = 1  # Beta(alpha, alpha) parameter used to draw the mixup coefficients

train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def _masked_annotation_loss(outputs, labels, masks):
    """Masked BCE loss averaged over all unmasked (sample, label, annotation) entries.

    Args:
        outputs: model logits, indexed as (sample, label).
        labels: targets, indexed as (sample, label, annotation).
        masks: weights with the same layout as ``labels``; the element-wise
            loss is multiplied by them.

    Returns:
        Scalar tensor: sum of the weighted element-wise losses over every
        annotation set, divided by ``masks.sum()``.
    """
    # The same logits are scored against each set of annotations in turn.
    annotation_sums = [
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2])
    ]
    return sum(annotation_sums) / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs: blend two independently shuffled batches with one
        # coefficient per sample, applied identically to inputs, labels, masks
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = _masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no mixup, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Report the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Print before the early-stopping check so the last epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6521834301948547 0.5643704891204834
Epoch:  1


0.43244945526123046 0.26234854459762574
Epoch:  2


0.22851068735122682 0.18779278993606568
Epoch:  3


0.18563915371894837 0.1691790521144867
Epoch:  4


0.1783787751197815 0.17349810600280763
Epoch:  5


0.17666720628738403 0.17231203913688659
Epoch:  6


0.17374468564987183 0.15829547345638276
Epoch:  7


0.17218283534049988 0.1601227790117264
Epoch:  8


0.17189290165901183 0.16817236542701722
Epoch:  9


0.16975375890731811 0.16152510643005372
Epoch:  10


0.16984523713588714 0.15766447484493257
Epoch:  11


0.16890307426452636 0.16044118702411653
Epoch:  12


0.168792200088501 0.15810581147670746
Epoch:  13


0.167007275223732 0.15176329016685486
Epoch:  14


0.1667980992794037 0.1495746672153473
Epoch:  15


0.16478636384010314 0.14971626698970794
Epoch:  16


0.16502766132354738 0.1673620104789734
Epoch:  17


0.16326950669288634 0.15255593359470368
Epoch:  18


0.16290395796298981 0.14145539104938507
Epoch:  19


0.16192365884780885 0.14061320722103118
Epoch:  20


0.15996201813220978 0.14041620194911958
Epoch:  21


0.16004803776741028 0.1433849185705185
Epoch:  22


0.1603948974609375 0.13585647493600844
Epoch:  23


0.15955013036727905 0.13520210534334182
Epoch:  24


0.1574998205900192 0.1337282732129097
Epoch:  25


0.15770874559879303 0.14061616957187653
Epoch:  26


0.15746757984161378 0.13699703216552733
Epoch:  27


0.15770995378494262 0.13388760685920714
Epoch:  28


0.15711923241615294 0.13334344774484636
Epoch:  29


0.15523713529109956 0.13520671874284745
Epoch:  30


0.1546254974603653 0.13322527557611466
Epoch:  31


0.1565551084280014 0.13402323871850969
Epoch:  32


0.1551086860895157 0.1307859092950821
Epoch:  33


0.15448589861392975 0.1317775398492813
Epoch:  34


0.15508995056152344 0.1316637620329857
Epoch:  35


0.15281263649463653 0.12956547737121582
Epoch:  36


0.1537934398651123 0.12952828407287598
Epoch:  37


0.15465319395065308 0.13240764588117598
Epoch:  38


0.15395481288433074 0.1301560878753662
Epoch:  39


0.15373210191726686 0.1301504209637642
Epoch:  40


0.15376731395721435 0.13088851273059846
Epoch:  41


0.15266046226024627 0.13025857955217363
Epoch:  42


0.1508522003889084 0.1298343375325203
Epoch    42: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  43


0.15201473236083984 0.1254894658923149
Epoch:  44


0.15095588862895964 0.12484461963176727
Epoch:  45


0.15015839636325837 0.12556520998477935
Epoch:  46


0.1500014865398407 0.12518261075019838
Epoch:  47


0.14927076637744904 0.12523180693387986
Epoch:  48


0.15015554189682007 0.1255822852253914
Epoch:  49


0.15077586531639098 0.12538652122020721
Epoch:  50


0.14888243794441222 0.12519786655902862
Epoch    50: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  51


0.1490786302089691 0.12504891455173492
Epoch:  52


0.15054079830646516 0.12505511641502381
Epoch:  53


0.1490215265750885 0.12489373981952667
Epoch:  54


0.14931272268295287 0.12487398087978363
Epoch:  55


0.1492585527896881 0.12487350553274154
Epoch:  56


0.14954474568367004 0.12499923408031463
Epoch    56: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  57


0.1488925749063492 0.12487141489982605
Epoch:  58


0.1507314610481262 0.12486752569675445
Epoch:  59


0.14918988227844238 0.12483585029840469
Epoch:  60


0.1497409725189209 0.12501732558012008
Epoch:  61


0.1503703624010086 0.1250218406319618
Epoch:  62


0.14973486721515655 0.12486964613199233
Epoch    62: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  63


0.15009965717792512 0.12482608109712601
Epoch:  64


0.1497833913564682 0.12475473284721375
Epoch:  65


0.15007403671741484 0.12497442811727524
Epoch:  66


0.14936696469783783 0.1251268893480301
Epoch:  67


0.1503628498315811 0.12487836331129074
Epoch:  68


0.14894316732883453 0.12489737868309021
Epoch:  69


0.14996030151844025 0.12492702156305313
Epoch:  70


0.14976904809474945 0.12495952844619751
Epoch    70: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  71


0.14979265749454498 0.12486556470394135
Epoch:  72


0.14910377860069274 0.12501669824123382
Epoch:  73


0.14952141523361206 0.12492117583751679
Epoch:  74


0.14770481884479522 0.12496020197868347
Epoch:  75


0.1487998867034912 0.12496260851621628
Epoch:  76


0.14985850989818572 0.12497346252202987
Epoch:  77


0.14951054871082306 0.12499111741781235
Epoch:  78


0.14969283044338227 0.12480963170528411
Epoch:  79


0.14932423889636992 0.12510619610548018
Epoch:  80


0.14932937622070314 0.12500501722097396
Epoch:  81


0.14896830260753632 0.12484159767627716
Epoch:  82


0.1496173745393753 0.12474950700998307
Epoch:  83


0.14937883973121643 0.1250454977154732
Epoch:  84


0.1491714209318161 0.12485152035951615
Epoch:  85


0.1492840850353241 0.12494052648544311
Epoch:  86


0.14971348404884338 0.1248528704047203
Epoch:  87


0.14909008026123047 0.12503510862588882
Epoch:  88


0.14938768804073332 0.1250094637274742
Epoch:  89


0.14869455575942994 0.12489729076623916
Epoch:  90


0.14821385741233825 0.12496048659086227
Epoch:  91


0.14992282032966614 0.12483934164047242
Epoch:  92


0.14896857738494873 0.12486114054918289
Epoch:  93


0.14946165442466736 0.1248880758881569
Epoch:  94


0.15039514183998107 0.12482418715953827
Epoch:  95


0.1496923553943634 0.12482864707708359
Epoch:  96


0.14949483096599578 0.12493971437215805
Epoch:  97


0.14856357693672181 0.12493953108787537
Epoch:  98


0.14886200785636902 0.12502898275852203
Epoch:  99


0.1502816593647003 0.1250029146671295
