In [1]:
# Parameters
# until_x: position threshold inside the MobileNetV2 feature blocks. Task5Model
# re-initialises every feature block whose index is greater than this value, and
# the training loop embeds the value in the checkpoint file name.
until_x = 12


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. Its index must be named ``audio_filename`` so
        that ``reset_index()`` exposes a column of that name. The remaining
        columns are label columns, including the "unknown" columns listed in
        ``unknown_to_known``.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be masked out whenever that unknown column is > 0.5.
    data_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotations : int, optional
        Number of annotation slots (1..n_annotations, in row order per file)
        stacked along the last axis of ``y`` and ``y_mask``.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_features)``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label takes part in the loss, 0 where
        the matching "unknown" column was set for that annotation.

    Raises
    ------
    ValueError
        If any file has fewer than ``n_annotations`` annotation rows.
    """
    df = df.reset_index()

    # Every file must fill all requested slots, otherwise the per-slot arrays
    # would have different lengths and could not be stacked file-by-file.
    annotations_per_file = df.groupby('audio_filename').size()
    too_few = annotations_per_file[annotations_per_file < n_annotations]
    if len(too_few) > 0:
        raise ValueError(
            'prepare_data needs at least {} annotations per file; '
            'too few for: {}'.format(n_annotations, too_few.index.tolist()[:5]))

    # Number the annotations of each file 1, 2, 3, ... in row order.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" columns off; they only drive the mask.
    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from an all-ones mask and zero the known labels wherever the
    # corresponding unknown column is set for that annotation.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the slot number first so each slot can be selected as one slab whose
    # rows are sorted by file name (hence aligned across slots).
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([mask_df.loc[[slot], :].values for slot in slots], axis=2)

    # Load one array per file, in the same file order as the rows of y.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing augmentation (project-local class, built with its default
# arguments). AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs, indexed along the first axis; each item is sliced as
        ``X[idx, ...]`` and, when augmenting, cycled along its axis 1.
    y : torch.Tensor
        Targets, indexed along the first axis.
    weights : torch.Tensor
        Per-target weights (loss masks), indexed along the first axis.
    transform : callable or None
        Augmentation called as ``transform(image=ndarray)`` and expected to
        return a mapping with an ``'image'`` entry. When falsy, samples are
        returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the (optionally augmented) sample with its target and weight."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            value_range = sample.max() - this_min
            # A constant sample has zero range; dividing by it would produce
            # NaN (0 / 0). Use a unit range so the sample maps to zeros and is
            # restored exactly by the inverse transformation below.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file along axis 1
            shift = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, shift:, :],
                sample[:, :shift, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only open metadata files that
# come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the "X" (unknown / other) fine classes and the
# ordinary fine classes, keeping only the columns needed for the join.
taxonomy = metadata['taxonomy_df']
unknown_fine = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# Pair every unknown class with the known classes of the same coarse group
# (the merge suffixes the two 'fine' columns as fine_x / fine_y), then collect
# the known names per unknown class.
fine_pairs = unknown_fine.merge(known_fine, on='coarse', how='inner')
fine_pairs = fine_pairs.drop(columns='coarse')
unknown_to_known = fine_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()

known_labels = known_fine.fine.tolist()

# Coarse and fine annotations side by side, one table per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any annotation row whose values sum to exactly 37 is cleared to all zeros,
# in both splits.
for annotations in (train_df, valid_df):
    row_totals = annotations.sum(axis=1)
    annotations.loc[row_totals == 37, :] = 0

In [6]:
# Build the arrays for both splits. prepare_data returns (X, y, y_mask): X holds
# the arrays loaded from disk with a singleton axis inserted at position 1, and
# y / y_mask stack the per-file annotation slots along their last axis.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics come from the training split only and are reused for validation.
# The channel count is read from the data instead of being hard-coded, so a
# different feature width cannot silently mix channels in the reshape.
n_channels = train_X.shape[-1]
train_X_flat = train_X.reshape(-1, n_channels)
channel_means = train_X_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X_flat.std(axis=0).reshape(1, 1, 1, -1)
# A channel that is constant over the training set has zero std; divide by 1
# there so the normalised values are 0 instead of NaN/inf.
channel_stds = np.where(channel_stds == 0, 1.0, channel_stds)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in order by AudioDataset: small shift/scale/rotate, grid distortion,
# then conversion of the image array to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
batch_size = 96

# Only the training set is augmented; validation samples are returned as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that are blended by mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook does not crash on a machine without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it offers ``reset_parameters``.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function suitable for ``module.apply(weight_reset)``.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        return
    reset()

In [12]:
class Task5Model(nn.Module):
    """Single-channel classifier built around MobileNetV2 features.

    The network consists of a 1x1-convolution stem that maps one input channel
    to three, the ``features`` part of a pretrained torchvision MobileNetV2,
    a max over the last two axes of the feature map, and a two-layer head that
    produces ``num_classes`` logits.

    Feature blocks whose position is greater than the module-level ``until_x``
    are re-initialised with ``weight_reset`` at construction time.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Stem: 1 channel -> 10 -> 3 channels using 1x1 convolutions.
        stem_layers = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*stem_layers)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Head: 1280 pooled features -> 512 -> num_classes logits.
        head_layers = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head_layers)

        # Reset after ith layer of mv2
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch ``x``."""
        stem_out = self.bw2col(x)
        feature_map = self.mv2.features(stem_out)
        # Max over the last axis, then over the (new) last axis.
        last_axis_max, _ = feature_map.max(dim=-1)
        pooled, _ = last_axis_max.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 is the number of output logits.
# NOTE(review): presumably this equals the number of label columns that remain
# after prepare_data drops the "unknown" columns — confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# All model parameters are optimised with AMSGrad-enabled Adam.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped once per epoch with the validation loss (see the
# training loop); verbose=True produces the "reducing learning rate" log lines.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the annotation masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
alpha = 1  # both shape parameters of the Beta distribution used by mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked binary cross-entropy averaged over all annotation slots.

    ``outputs`` holds one logit per (sample, label); ``labels`` and ``masks``
    carry an extra last axis with one entry per annotation slot. The
    element-wise loss of every slot is multiplied by its mask, summed, and the
    total is divided by the sum of the masks.
    """
    total = 0
    for slot in range(labels.shape[2]):
        slot_loss = criterion(outputs, labels[:, :, slot]) * masks[:, :, slot]
        total = total + slot_loss.sum()
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    with torch.set_grad_enabled(True):
        for i1, i2 in zip(train_loader_1, train_loader_2):

            # mixup the inputs ---------
            # One mixing coefficient per sample, shared by inputs, labels and masks.
            mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

            lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
            inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

            lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
            labels = (lam * i1[1]) + ((1 - lam) * i2[1])
            masks = (lam * i1[2]) + ((1 - lam) * i2[2])
            # mixup ends ----------

            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
            this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            this_epoch_valid_loss += masked_bce_loss(outputs, labels, masks).item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Log before the early-stopping check so the last epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep the checkpoint with the lowest validation loss seen so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6518821740150451 0.7329739212989808
Epoch:  1


0.42363962292671203 0.3322059214115143
Epoch:  2


0.21609222948551177 0.17410574853420258
Epoch:  3


0.17109248995780946 0.14235796630382538
Epoch:  4


0.16157737970352173 0.15273249447345733
Epoch:  5


0.15910090148448944 0.13735495805740355
Epoch:  6


0.15719605326652528 0.13382259309291838
Epoch:  7


0.15589716851711274 0.13435575664043425
Epoch:  8


0.15546928405761717 0.13196162432432174
Epoch:  9


0.15312815129756926 0.12932761907577514
Epoch:  10


0.1529581904411316 0.13016001433134078
Epoch:  11


0.15244558691978455 0.13340397775173188
Epoch:  12


0.15180594027042388 0.12894694805145263
Epoch:  13


0.15181501924991608 0.13246609270572662
Epoch:  14


0.14980655133724213 0.12944846451282502
Epoch:  15


0.15134060084819795 0.13015994429588318
Epoch:  16


0.14999784767627716 0.12789589464664458
Epoch:  17


0.14942773759365083 0.13817155063152314
Epoch:  18


0.1484711629152298 0.12884212881326676
Epoch:  19


0.14887304067611695 0.12875473350286484
Epoch:  20


0.14876713395118712 0.13283451348543168
Epoch:  21


0.14810845971107484 0.12705180794000626
Epoch:  22


0.14924391865730285 0.12859114855527878
Epoch:  23


0.1477784788608551 0.13148339241743087
Epoch:  24


0.14704701721668242 0.1233941987156868
Epoch:  25


0.14699630677700043 0.12687348425388337
Epoch:  26


0.14715028345584868 0.12530809491872788
Epoch:  27


0.145586718916893 0.12609721571207047
Epoch:  28


0.14769574940204622 0.126680326461792
Epoch:  29


0.14660721480846406 0.12705750912427902
Epoch:  30


0.14483180165290832 0.13525620400905608
Epoch    30: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  31


0.14401717007160186 0.12130908668041229
Epoch:  32


0.14334985315799714 0.12143778651952744
Epoch:  33


0.14253035962581634 0.1214844524860382
Epoch:  34


0.1427113878726959 0.12155295461416245
Epoch:  35


0.14204027235507966 0.12144495844841004
Epoch:  36


0.14239340364933015 0.12208462953567505
Epoch:  37


0.14081928193569182 0.12122185975313186
Epoch:  38


0.14286816060543062 0.12102220058441163
Epoch:  39


0.1406095242500305 0.12112627625465393
Epoch:  40


0.14238337218761443 0.12130990028381347
Epoch:  41


0.14161161124706267 0.12203972786664963
Epoch:  42


0.1419606947898865 0.12138605117797852
Epoch:  43


0.1411329162120819 0.12228680402040482
Epoch:  44


0.14074499011039734 0.1223624125123024
Epoch    44: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  45


0.1415357208251953 0.12198374569416046
Epoch:  46


0.14103914737701417 0.12174710184335709
Epoch:  47


0.1423234897851944 0.12177592515945435
Epoch:  48


0.14120966732501983 0.1218559518456459
Epoch:  49


0.14151716232299805 0.1216015100479126
Epoch:  50


0.1401548033952713 0.12148755490779876
Epoch    50: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  51


0.14036823332309722 0.12154749780893326
Epoch:  52


0.14037087976932525 0.12138019800186158
Epoch:  53


0.14145042836666108 0.12152039855718613
Epoch:  54


0.1402593261003494 0.12170614004135132
Epoch:  55


0.13983694911003114 0.12154447734355926
Epoch:  56


0.14082991540431977 0.12153760641813278
Epoch    56: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  57


0.14039577424526214 0.12159949541091919
Epoch:  58


0.14152547538280488 0.12151480317115784
Epoch:  59


0.141312295794487 0.12140822410583496
Epoch:  60


0.1404734420776367 0.12153496742248535
Epoch:  61


0.1417154061794281 0.12161996215581894
Epoch:  62


0.14120611250400544 0.12156243473291398
Epoch    62: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  63
