In [1]:
# Parameters
# until_x: index into the MobileNetV2 feature blocks. Task5Model re-initialises
# every feature block whose index is greater than this value, and the value is
# also embedded in the filename of the saved checkpoint.
until_x = 11


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation, indexed by ``audio_filename``. Columns are
        label indicators and must include every key of ``unknown_to_known``.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that are masked out (mask = 0) on rows where the unknown column
        is > 0.5. The unknown columns themselves are dropped from the targets.
    logmelspec_dir : str, optional
        Directory containing one ``<audio_filename>.npy`` array per clip.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotations : int, optional
        Number of annotation rows per clip stacked along the last axis.
        Annotations beyond this count are ignored.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_clips, 1, n_frames, n_features)``.
    y : numpy.ndarray
        Shape ``(n_clips, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where a label is masked, 1 elsewhere.

    Raises
    ------
    ValueError
        If any clip has fewer than ``n_annotations`` annotation rows.
    """
    unknown_cols = list(unknown_to_known.keys())

    df = df.reset_index()
    # Number each clip's annotations 1, 2, ... in their original row order.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Fail early with a readable message instead of an opaque shape/key error
    # when the per-annotation slices below would not line up.
    rows_per_clip = df.groupby(level='audio_filename').size()
    if (rows_per_clip < n_annotations).any():
        raise ValueError(
            'every audio file needs at least {} annotation rows'.format(n_annotations))

    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels wherever the
    # corresponding unknown label was annotated.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[
            df_unknown[unknown] > 0.5,
            known
        ] = 0

    # Put the annotation number first so that each annotation slot can be
    # selected as a block whose rows are sorted by filename.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([mask_df.loc[[slot], :].values for slot in slots], axis=2)

    # Load the arrays in the same (sorted) clip order as the rows of y.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared augmentation object, built with RandomErasing's default arguments.
# AudioDataset.__getitem__ calls it on every sample that went through the
# transform pipeline.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (input, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis; each item is sliced as
        ``sample[:, i:, :]``, so items are 3-D.
    y : torch.Tensor
        Targets, indexed along the first axis in step with ``X``.
    weights : torch.Tensor
        Per-target weights (loss masks), indexed like ``y``.
    transform : callable, optional
        Called as ``transform(image=array)`` and expected to return a mapping
        with an ``'image'`` entry. When falsy, samples are returned unchanged.
    erasing : callable, optional
        Applied to the transformed tensor. Defaults to the module-level
        ``random_erasing`` object.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of items, taken from the first axis of ``X``."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for ``idx``, augmenting the
        sample when a transform was supplied."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            value_range = sample.max() - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole sample into NaN, so fall back to a unit scale.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            shift = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, shift:, :],
                sample[:, :shift, :]],
                dim=1)

            # apply albumentations transforms (they operate on an array built
            # from the PIL image)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)['image']
            # add a leading axis and swap the last two axes of the result
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            erase = self.erasing if self.erasing is not None else random_erasing
            sample = erase(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map every "unknown" fine label (fine_id == 'X') to the list of known fine
# labels (fine_id != 'X') that share its coarse category. The self-merge on
# 'coarse' yields the two label columns as fine_x (unknown) and fine_y (known).
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# Names of all fine labels whose fine_id is not 'X'.
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Place the coarse and fine annotation tables side by side (column-wise).
# The validation frame is built from the '*_test' entries of the metadata.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Every row whose values sum to exactly 37 across all columns is set to zero,
# in both splits.
# NOTE(review): 37 is presumably the total number of label columns, i.e. a row
# with every label set — confirm against the metadata before changing it.
train_df.loc[(train_df.sum(axis=1) == 37).copy(), :] = 0
valid_df.loc[(valid_df.sum(axis=1) == 37).copy(), :] = 0

In [6]:
# Turn each annotation table into (inputs, targets, masks) arrays.
# prepare_data also loads the per-clip .npy arrays from disk, so this cell
# does file I/O for every clip in both splits.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed per entry of the last axis, from the training split
# only, and then reused unchanged for the validation split.
# NOTE: this cell overwrites train_X / valid_X, so it is not idempotent —
# re-running it without re-running the previous cell normalizes twice.
channel_means = train_X.reshape(-1, train_X.shape[-1]).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, train_X.shape[-1]).std(axis=0).reshape(1, 1, 1, -1)
# A channel that is constant over the training set has zero spread; divide by
# one there instead of producing NaN/inf.
channel_stds = np.where(channel_stds == 0, 1.0, channel_stds)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# The pipeline is handed to AudioDataset for the training set only. Its steps,
# in the order given to Compose: a shift/scale/rotate with the limits below,
# a grid distortion with default settings, then conversion to a tensor.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples pass through as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain the pairs of batches that mixup blends together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# `cuda` expresses the preference; the GPU is only selected when one is
# actually available, so the notebook also runs on CPU-only machines instead
# of failing later when the model is moved to the device.
cuda = True
device = torch.device('cuda:0' if cuda and torch.cuda.is_available() else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` through its ``reset_parameters`` method.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function suitable for ``Module.apply``.
    """
    try:
        reinitialise = layer.reset_parameters
    except AttributeError:
        return
    reinitialise()

In [12]:
class Task5Model(nn.Module):
    """Single-channel classifier built on MobileNetV2 feature blocks.

    A small 1x1-convolution stem maps the single input channel to the three
    channels the backbone takes, the backbone's feature extractor is applied,
    the result is max-pooled over both spatial axes, and a two-layer head
    produces one output per class.

    Parameters
    ----------
    num_classes : int
        Number of outputs of the final linear layer.
    reset_after : int, optional
        Feature blocks of the backbone whose index is greater than this value
        are re-initialised (their pretrained weights are discarded). Defaults
        to the module-level ``until_x`` setting.
    """

    def __init__(self, num_classes, reset_after=None):

        super().__init__()

        if reset_after is None:
            # Backward-compatible default: the notebook-level parameter.
            reset_after = until_x

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset every feature block after index `reset_after`; `apply` visits
        # the block and all of its sub-modules.
        for block_idx, block in enumerate(self.mv2.features.children()):
            if block_idx > reset_after:
                block.apply(weight_reset)

    def forward(self, x):
        """Map a ``(batch, 1, H, W)`` input to ``(batch, num_classes)`` outputs.

        The outputs are not passed through a sigmoid here.
        """
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis, then the remaining
        # spatial axis, keeping one value per channel.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [13]:
# Instantiate the model
# The model gets 31 outputs and is moved to the selected device.
# NOTE(review): 31 presumably equals the number of label columns left after
# prepare_data drops the "unknown" ones (train_y.shape[1]) — confirm.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The training loop passes each epoch's validation loss to scheduler.step();
# patience=5 and verbose=True are the only non-default settings.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise loss so the training loop can
# multiply it by the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def _masked_annotation_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over every annotation set.

    ``labels`` and ``masks`` hold one annotation set per index of their last
    axis. The element-wise loss of ``outputs`` against each set is weighted by
    the matching mask slice; the grand total is divided by the sum of all
    mask values.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
early_stop_patience = 25  # epochs without a new best validation loss before stopping
alpha = 1  # Beta(alpha, alpha) parameter used to draw the mixup coefficients
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass -------------------------------------------------
    model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently, so each step
    # sees two different random batches to blend.
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One coefficient per example, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, batch_a[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam * batch_a[0]) + ((1 - lam) * batch_b[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1))
        labels = (lam * batch_a[1]) + ((1 - lam) * batch_b[1])
        masks = (lam * batch_a[2]) + ((1 - lam) * batch_b[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = _masked_annotation_loss(criterion, outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass -----------------------------------------------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_annotation_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    scheduler.step(this_epoch_valid_loss)

    # Checked last so the final epoch is still logged before stopping.
    if epochs_without_new_lowest >= early_stop_patience:
        break

Epoch:  0


0.6035197530243848 0.4641105319772448
Epoch:  1


0.27480899683527044 0.20581242655004775
Epoch:  2


0.17581599105048823 0.1725838269506182
Epoch:  3


0.16347964067716855 0.15245818666049413
Epoch:  4


0.15997479251913121 0.23512597594942367
Epoch:  5


0.15848281818467216 0.13701019329684122
Epoch:  6


0.15671943732210109 0.13058631441422872
Epoch:  7


0.15485922831135826 0.1408158955829484
Epoch:  8


0.1562180603678162 0.1399375040616308
Epoch:  9


0.1542844075608898 0.1290698147245816
Epoch:  10


0.15375470511011174 0.14002708452088491
Epoch:  11


0.1539260334259755 0.1348309080515589
Epoch:  12


0.15421535799632202 0.13518927139895304
Epoch:  13


0.15251440333353505 0.12993740716150828
Epoch:  14


0.15184699119748296 0.12604042142629623
Epoch:  15


0.15114993098619822 0.1287734391433852
Epoch:  16


0.15118040305537148 0.1316693595477513
Epoch:  17


0.14947434252983816 0.12993577867746353
Epoch:  18


0.1499070923876118 0.12965093978813716
Epoch:  19


0.14908068445888725 0.1279768773487636
Epoch:  20


0.14945000530900182 0.12594483367034368
Epoch:  21


0.14853608326331988 0.12689074661050523
Epoch:  22


0.1479900841777389 0.13093494517462595
Epoch:  23


0.14917916985782417 0.1276900161589895
Epoch:  24


0.14818861114012227 0.1261542130793844
Epoch:  25


0.14780363883521105 0.13771067240408488
Epoch:  26


0.14739533131187027 0.12381907126733235
Epoch:  27


0.1480674502011892 0.1271444314292499
Epoch:  28


0.14665003201446017 0.12908996215888432
Epoch:  29


0.1480037324331902 0.12581805459090642
Epoch:  30


0.14653269745208122 0.13036685436964035
Epoch:  31


0.14727702334120468 0.13284701428243093
Epoch:  32


0.14655433836820964 0.1296111579452242
Epoch    32: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  33


0.1452263919888316 0.12290200165339879
Epoch:  34


0.1442272864483498 0.12254007054226738
Epoch:  35


0.14435936027281993 0.1222676198397364
Epoch:  36


0.14361960944291707 0.1216810867190361
Epoch:  37


0.14314935295968442 0.12211692226784569
Epoch:  38


0.14274686977669998 0.12160910453115191
Epoch:  39


0.14146555960178375 0.12141212608133044
Epoch:  40


0.1437876256736549 0.12151948788336345
Epoch:  41


0.14398616230165637 0.12184375737394605
Epoch:  42


0.14255626177465594 0.12269297561475209
Epoch:  43


0.14366878327485677 0.12158496571438653
Epoch:  44


0.1422979259813154 0.12232320329972676
Epoch:  45


0.14278845811212384 0.12171329770769392
Epoch    45: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  46


0.14278916611864761 0.12195634096860886
Epoch:  47


0.14174933771829348 0.12326859044177192
Epoch:  48


0.14123123964747866 0.12173569521733693
Epoch:  49


0.14207405254647537 0.12158398010901042
Epoch:  50


0.14294844583885088 0.1215111283319337
Epoch:  51


0.14196841660383586 0.12167177455765861
Epoch    51: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  52


0.1422805419644794 0.12143553793430328
Epoch:  53


0.14187109389820615 0.1222074744956834
Epoch:  54


0.14242029351157112 0.12153901691947665
Epoch:  55


0.14176601413133982 0.12168253532477788
Epoch:  56


0.14088385209843918 0.12313879707029887
Epoch:  57


0.14141450862626773 0.12378260067531041
Epoch    57: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  58


0.14150039005923915 0.12158863885062081
Epoch:  59


0.1415182706472036 0.12177712363856179
Epoch:  60


0.140844818909426 0.12268983253410884
Epoch:  61


0.14212006894317833 0.12285315138953072
Epoch:  62


0.14258123773175316 0.12150154475654874
Epoch:  63


0.1417906413207183 0.12234915047883987
Epoch    63: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  64
