In [1]:
# Parameters
# until_x selects how much of the pretrained backbone is kept: Task5Model
# re-initialises every block of `mv2.features` whose position is greater than
# this value. The value is also embedded in the checkpoint filename written by
# the training loop.
until_x = 15


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation, indexed by ``audio_filename``. Every column
        is treated as a label column. The rows of one recording are numbered
        1, 2, ... in the order in which they appear.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        it invalidates. The unknown columns are removed from the targets.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed array. Every stored
        array must provide at least this many, otherwise stacking fails.
    n_annotations : int, optional
        Number of annotation slots (1..n_annotations) stacked on the last
        axis of ``y`` and ``y_mask``. Every recording must have a row for
        each of these slots.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_recordings, 1, n_frames, n_columns)`` where ``n_columns``
        is the second dimension of the transposed stored arrays.
    y : numpy.ndarray
        Shape ``(n_recordings, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the annotation's unknown column exceeds
        0.5 for the mapped known labels, 1 everywhere else.

    Recordings are ordered by sorted ``audio_filename`` in all three outputs.
    """
    unknown_cols = list(unknown_to_known.keys())

    df = df.reset_index()
    # Running annotation number within each recording, starting at 1.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns away from the actual targets.
    df_unknown = df.loc[:, unknown_cols]
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and switch off the known labels that an
    # annotator replaced with the corresponding "unknown" answer.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so each slot can be sliced as a block
    # whose rows are sorted by filename.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    # Load the spectrograms in the same (sorted) order as the targets.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance built with its default arguments;
# AudioDataset.__getitem__ calls it on every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Indexable collection of samples with their targets and loss weights.

    Without a transform, items are returned exactly as stored. With a
    transform, every access produces a freshly augmented copy of the sample
    (targets and weights are never modified).

    Parameters
    ----------
    X : tensor whose first axis enumerates the samples.
    y : targets, indexed along the first axis like ``X``.
    weights : per-target weights, indexed along the first axis like ``X``.
    transform : optional callable invoked as ``transform(image=ndarray)`` and
        expected to return a mapping with an ``'image'`` entry.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converter used to hand the sample to the image-based transform.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for position ``idx``."""
        sample = self.X[idx, ...]

        if not self.transform:
            return sample, self.y[idx, ...], self.weights[idx, ...]

        # Rescale to [0, 1]; the range is remembered so it can be restored.
        lowest = sample.min()
        highest = sample.max()
        span = highest - lowest
        scaled = (sample - lowest) / span

        # Rotate the sample cyclically along axis 1 by a random offset.
        offset = int(np.random.randint(scaled.shape[1]))
        rotated = torch.roll(scaled, shifts=-offset, dims=1)

        # Run the image transform on an array version of the sample, then
        # add a leading axis and swap the last two axes of its output.
        as_image = np.array(self.pil(rotated))
        augmented = self.transform(image=as_image)['image']
        augmented = augmented[None, :, :].permute(0, 2, 1)

        # Random erasing operates on a detached copy.
        augmented = random_erasing(augmented.clone().detach())

        # Map the values back onto the original range of the sample.
        restored = (augmented * span) + lowest

        return restored, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code while deserialising; only open
# metadata files that come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map every "unknown" fine label (taxonomy rows with fine_id == 'X') to the
# list of known fine labels (fine_id != 'X') that share its coarse category.
# After the merge on 'coarse', 'fine_x' holds the unknown label and 'fine_y'
# the known one.
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# Names of all fine labels that are not "unknown" placeholders.
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Place the coarse and fine label tables side by side (aligned on the index);
# the validation frame is built from the '*_test' tables of the metadata.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Every row whose label values sum to exactly 37 is overwritten with zeros,
# in both the training and the validation frame. The .copy() on the boolean
# mask does not change which rows are selected.
# NOTE(review): 37 is an undocumented magic number - presumably it singles out
# the one known-bad annotation; confirm against the raw data.
train_df.loc[(train_df.sum(axis=1) == 37).copy(), :] = 0
valid_df.loc[(valid_df.sum(axis=1) == 37).copy(), :] = 0

In [6]:
# Turn each label frame into (inputs, targets, target masks). prepare_data
# also reads one spectrogram file per recording from disk, so this cell needs
# the data directory to be present.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed per position of the last axis (the reshape assumes
# that axis has 128 entries) from the training inputs only, and then applied
# to both the training and the validation inputs.
# NOTE: train_X / valid_X are overwritten, so re-running this cell without
# re-running the previous one normalises already-normalised data.
channel_means = train_X.reshape(-1, 128).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, 128).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Pipeline handed to the training AudioDataset, which calls it as
# transform(image=...) and reads the 'image' entry of the result:
#   1. ShiftScaleRotate with shift/scale limits of 0.1 and a rotate limit of 0.5
#   2. GridDistortion with its default settings
#   3. ToTensor as the final step (the dataset indexes and permutes its
#      output as a tensor)
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# The target masks are passed as the dataset's `weights`. Only the training
# dataset receives the augmentation pipeline; validation items are returned
# as stored.
train_dataset = AudioDataset(torch.Tensor(train_X),
                             torch.Tensor(train_y),
                             torch.Tensor(train_y_mask),
                             albumentations_transform)
valid_dataset = AudioDataset(torch.Tensor(valid_X),
                             torch.Tensor(valid_y),
                             torch.Tensor(valid_y_mask),
                             None)

# Batch size 64 everywhere. Two independently shuffled loaders over the same
# training dataset: the training loop zips them so that each step gets two
# different batches to blend together (mixup).
val_loader = DataLoader(valid_dataset, 64, shuffle=False)
train_loader_1 = DataLoader(train_dataset, 64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, 64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when PyTorch can see one; otherwise fall back to the CPU
# so the notebook still runs (slowly) on machines without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` if it knows how to, otherwise leave it alone.

    Intended to be passed to ``Module.apply``: objects exposing a
    ``reset_parameters`` method have it called, every other object is
    skipped. Always returns ``None``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Single-channel classifier built around MobileNetV2 feature blocks.

    The network consists of three stages: a small 1x1-convolution stem that
    maps the single input channel to three channels, the ``features`` part of
    a pretrained torchvision MobileNetV2, and a two-layer fully connected
    head producing ``num_classes`` outputs. The backbone's own classifier is
    instantiated with the backbone but is not used in ``forward``.

    Backbone feature blocks whose position is greater than the module-level
    ``until_x`` are re-initialised with ``weight_reset``.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Stem: batch norm, then 1 -> 10 -> 3 channels via 1x1 convolutions.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Head operating on the 1280 pooled backbone channels.
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Keep the pretrained weights up to and including block `until_x`;
        # everything after it starts from a fresh initialisation.
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return one output vector per input sample."""
        three_channel = self.bw2col(x)
        feature_maps = self.mv2.features(three_channel)
        # Global max pooling: reduce the last axis, then the new last axis.
        pooled = feature_maps.max(dim=-1)[0]
        pooled = pooled.max(dim=-1)[0]
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 is the size of the output layer. NOTE(review): presumably this equals
# the number of label columns left after prepare_data drops the "unknown"
# ones - confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters, starting at lr = 1e-3.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The training loop steps this scheduler with the epoch's validation loss;
# patience=5 and verbose=True are the only non-default settings.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps one loss value per element so the training loop can
# multiply by the target mask before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
alpha = 1                     # Beta(alpha, alpha) parameter for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def _masked_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over all annotation slots.

    ``labels`` and ``masks`` carry the annotation slots on their last axis.
    The element-wise ``criterion`` is evaluated against each slot in turn,
    multiplied by that slot's mask, and the grand total is divided by the
    sum of all mask values.
    """
    total = sum(
        (criterion(outputs, labels[:, :, slot]) * masks[:, :, slot]).sum()
        for slot in range(labels.shape[2]))
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently, so every step
    # receives two different batches to blend.
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup: one blending coefficient per sample, shared by the inputs,
        # the labels and the masks of that sample
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = _masked_loss(criterion, outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Both losses are means over batches (not over samples).
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the losses of the final
    # epoch are logged as well.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.612296261497446 0.4280765950679779
Epoch:  1


0.27998560828131597 0.16379510930606297
Epoch:  2


0.17398485460796873 0.14790445140429906
Epoch:  3


0.1634868520337182 0.14302193799189158
Epoch:  4


0.1593636258228405 0.1347439725484167
Epoch:  5


0.15671176322408625 0.13747658154794148
Epoch:  6


0.15551859261216344 0.13138114448104585
Epoch:  7


0.1553691327571869 0.14181486623627798
Epoch:  8


0.15436558747613752 0.13197364338806697
Epoch:  9


0.153319659265312 0.129556305706501
Epoch:  10


0.15200787742395658 0.12936289395604814
Epoch:  11


0.1517701777251991 0.13104241022041865
Epoch:  12


0.1502279836583782 0.1290309386593955
Epoch:  13


0.15097508720449498 0.12650056715522492
Epoch:  14


0.1514230410794954 0.12748430562870844
Epoch:  15


0.15095840998598048 0.1290239542722702
Epoch:  16


0.14970400848904172 0.12615729123353958
Epoch:  17


0.14959049869228053 0.127322219312191
Epoch:  18


0.1483141284536671 0.12828585718359267
Epoch:  19


0.1486638764271865 0.13768112233706883
Epoch:  20


0.14773618288942286 0.1271435362952096
Epoch:  21


0.14807438729582606 0.12729800705398833
Epoch:  22


0.14717497898114695 0.1265254563518933
Epoch    22: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  23


0.14558663360170415 0.12078869236367089
Epoch:  24


0.14496616052614675 0.12094724178314209
Epoch:  25


0.14378365391009562 0.12089520799262184
Epoch:  26


0.14473648127671834 0.12026649713516235
Epoch:  27


0.14482871583990148 0.12081353047064372
Epoch:  28


0.14419697305640658 0.12086777708360127
Epoch:  29


0.1427448025426349 0.1202487125992775
Epoch:  30


0.14356911504590833 0.12025824508496694
Epoch:  31


0.14262954689360954 0.12031716427632741
Epoch:  32


0.1438664729530747 0.12058875922645841
Epoch:  33


0.1439474945938265 0.12043090696845736
Epoch:  34


0.14211058656911593 0.1201695448585919
Epoch:  35


0.14211732753225276 0.12081076524087361
Epoch:  36


0.14197176896237038 0.12109523798738207
Epoch:  37


0.14270737445032275 0.12086053937673569
Epoch:  38


0.14211467593102842 0.1201605104974338
Epoch:  39


0.14275342991223205 0.12043349125555583
Epoch:  40


0.14188924834534927 0.12091256890978132
Epoch    40: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  41


0.1423815802947895 0.12067236644881112
Epoch:  42


0.1417057630983559 0.12063294542687279
Epoch:  43


0.14231528825051076 0.12048594227858953
Epoch:  44


0.14229734241962433 0.12049163984400886
Epoch:  45


0.14117948348457748 0.12049160791294915
Epoch:  46


0.14179813902120333 0.1202902038182531
Epoch    46: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  47


0.14167245517711383 0.12039584453616824
Epoch:  48


0.14161480641042865 0.12032940983772278
Epoch:  49


0.14220953672318845 0.12030555840049471
Epoch:  50


0.14247357482845718 0.12049561845404762
Epoch:  51


0.1408297298727809 0.12035862995045525
Epoch:  52


0.14120618112989375 0.12041488822017397
Epoch    52: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  53


0.14200981243236646 0.12032999417611531
Epoch:  54


0.14132793408793373 0.12026626510279519
Epoch:  55


0.14179555105196462 0.12029312125274114
Epoch:  56


0.14136684786629033 0.12022447053875242
Epoch:  57


0.1417482974561485 0.12041819627795901
Epoch:  58


0.14204487728106008 0.12038016212838036
Epoch    58: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  59


0.14203370624297373 0.12011153038058962
Epoch:  60


0.1418741832713823 0.12042064113276345
Epoch:  61


0.14104685348433418 0.12038236111402512
Epoch:  62


0.14243752408672022 0.12028285435267858
Epoch:  63


0.14058892308054743 0.12025626110179084
Epoch:  64


0.14265745636579152 0.12035570187228066
Epoch:  65


0.14056656711004875 0.12042200139590672
Epoch:  66


0.14163412395361308 0.12026790316615786
Epoch:  67


0.14279322205363093 0.1202615105680057
Epoch:  68


0.1421819087621328 0.12017523923090526
Epoch:  69


0.14198619973015142 0.12029886245727539
Epoch:  70


0.1414334923028946 0.1203027069568634
Epoch:  71


0.14183603871513056 0.12024672010115214
Epoch:  72


0.14217362935478622 0.12036838276045662
Epoch:  73


0.14111017556609334 0.1203828901052475
Epoch:  74


0.1417554395424353 0.12026509323290416
Epoch:  75


0.14261185720160202 0.12029943615198135
Epoch:  76


0.14143670007989212 0.12020560886178698
Epoch:  77


0.14145524074902405 0.12036187308175224
Epoch:  78


0.141570480288686 0.12034425778048378
Epoch:  79


0.14133299081712156 0.12030161810772759
Epoch:  80


0.14170151626741564 0.1201599080647741
Epoch:  81


0.1411889747187898 0.12016854754516057
Epoch:  82


0.14136642580096787 0.12029896037919181
Epoch:  83


0.14091246071699504 0.12042088274444852
Epoch:  84
