In [1]:
# Parameters
# Position threshold inside `mv2.features`: Task5Model re-initialises every child
# block whose index is greater than this value. The value is also embedded in the
# filename of the saved checkpoint.
until_x = 2


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def _stack_annotators(frame, n_annotators):
    """Stack the rows of annotation slots 1..n_annotators along a new last axis.

    `frame` must be indexed by (slno, audio_filename) and sorted, so that
    `frame.loc[[slot], :]` yields one row per clip in the same order for every slot.
    """
    return np.concatenate(
        [frame.loc[[slot], :].values[..., np.newaxis]
         for slot in range(1, n_annotators + 1)],
        axis=2)


def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. `reset_index()` must expose an `audio_filename`
        column (i.e. the index is named `audio_filename`); all other columns are
        label columns. Every clip is expected to have at least `n_annotators` rows.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns that
        must be ignored in the loss when that unknown label is set (> 0.5).
        The unknown columns themselves are dropped from the targets.
    logmelspec_dir : str, optional
        Directory containing one `<audio_filename>.npy` array per clip.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotation slots (1..n_annotators) stacked on the last axis.

    Returns
    -------
    X : numpy.ndarray
        Transposed, cropped arrays stacked as (n_clips, 1, frames, features).
    y : numpy.ndarray
        Targets with shape (n_clips, n_known_labels, n_annotators).
    y_mask : numpy.ndarray
        Same shape as `y`; 1 where a label contributes to the loss, 0 where the
        matching unknown label was set for that annotation.

    Clips are ordered by `audio_filename` (result of `sort_index`) in all outputs.
    """
    df = df.reset_index()
    # Number each clip's annotation rows 1, 2, 3, ... in their original order.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start with every label active, then switch off the known labels that are
    # ambiguous because the corresponding unknown label was annotated.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation slot first so that `.loc[[slot], :]` selects one row per clip.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    y = _stack_annotators(df, n_annotators)
    y_mask = _stack_annotators(y_mask, n_annotators)

    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    # Add the channel axis expected by 2-D convolutions.
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (default constructor arguments) shared by all
# AudioDataset objects; it is applied in AudioDataset.__getitem__ after the
# albumentations transforms.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with per-annotator targets and loss weights.

    Parameters
    ----------
    X : torch.Tensor
        Samples stacked on the first axis. When `transform` is given, each sample
        is sliced and concatenated along dim 1, so it must have three dimensions
        (presumably (1, frames, features) -- confirm against the caller).
    y : torch.Tensor
        Targets, indexed along the first axis like `X`.
    weights : torch.Tensor
        Loss weights/masks, indexed along the first axis like `X`.
    transform : callable, optional
        Augmentation called as `transform(image=array)` and returning a mapping
        with an `'image'` entry. When falsy, samples are returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converts the scaled tensor to a PIL image for the array-based transforms.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of `X`)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return `(sample, y[idx], weights[idx])`, augmenting the sample if configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole sample into NaN. Fall back to a unit range so the scaling
            # (and its inverse below) stays finite.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along dim 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # NOTE(review): the permute assumes the transform returns a 2-D tensor
            # with the two axes swapped relative to the input -- confirm.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only point this at a trusted file.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy = metadata['taxonomy_df']
unknown_fine = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# For every "unknown" fine label (fine_id == 'X'), collect the known fine labels
# that share its coarse class.
unknown_known_pairs = unknown_fine.merge(known_fine, on='coarse', how='inner')
unknown_to_known = (
    unknown_known_pairs
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_fine.fine.tolist()

# Coarse and fine annotations side by side, one row per annotation.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']],
                     axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']],
                     axis=1, sort=True)

In [5]:
# manual correction for one data point
# Rows whose label values sum to exactly 37 are reset to all zeros, in place.
for label_frame in (train_df, valid_df):
    row_sums_to_37 = label_frame.sum(axis=1) == 37
    label_frame.loc[row_sums_to_37, :] = 0

In [6]:
# Build the inputs (X), targets (y) and loss masks (y_mask) for both splits,
# using the same unknown -> known label mapping.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed over the training set only, one value per position of
# the last axis (128 entries), and reused for the validation set.
channel_means, channel_stds = (
    statistic(train_X.reshape(-1, 128), axis=0).reshape(1, 1, 1, -1)
    for statistic in (np.mean, np.std)
)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Pipeline: ShiftScaleRotate, then GridDistortion, then ToTensor. It is passed
# only to the training dataset; the validation dataset is built with transform=None.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the training set; the training loop zips
# them together to draw the pairs of batches that get mixed.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise `layer` if it exposes a `reset_parameters` method.

    Intended for `nn.Module.apply`; objects without `reset_parameters` are
    left untouched.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    The network consists of:
      * `bw2col`: batch norm plus two 1x1 convolutions mapping 1 channel to 3,
      * `mv2`: a pretrained torchvision MobileNetV2 (only `features` is used),
      * `final`: a two-layer classifier producing `num_classes` logits.

    Child blocks of `mv2.features` whose position is greater than the
    module-level `until_x` are re-initialised with `weight_reset`.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions
        to_three_channels = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*to_three_channels)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        classifier_layers = [
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*classifier_layers)

        # Keep the pretrained weights up to and including position `until_x`;
        # reset everything after it.
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch `x`."""
        three_channel = self.bw2col(x)
        feature_map = self.mv2.features(three_channel)
        # Global max pooling: reduce the last axis, then the (new) last axis.
        reduced_once, _ = feature_map.max(dim=-1)
        pooled, _ = reduced_once.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# The number of outputs (31) must equal the number of label columns in the
# targets, because the loss compares `outputs` element-wise with `labels[:, :, k]`.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss at the end of every epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the per-element losses so the training loop can multiply
# them by the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def masked_bce_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over all active (label, annotator) entries.

    Parameters
    ----------
    criterion : callable
        Element-wise loss (no reduction), called as `criterion(outputs, target)`.
    outputs : torch.Tensor
        Model outputs of shape (batch, n_labels).
    labels, masks : torch.Tensor
        Shape (batch, n_labels, n_annotators). `masks` weights each entry.

    Returns
    -------
    torch.Tensor
        Scalar: sum of the masked per-annotator losses divided by `masks.sum()`.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
alpha = 1  # mixup coefficients are drawn from Beta(alpha, alpha)
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0.0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One coefficient per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_bce_loss(criterion, outputs, labels, masks)
            loss.backward()
            optimizer.step()
        # .item() gives a Python float, so the running sum does not depend on
        # numpy's scalar promotion rules.
        this_epoch_train_loss += loss.item()

    # ---- validation pass ----
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6086620309868375 0.48145955375262667
Epoch:  1


0.2779789571021054 0.1896773704460689
Epoch:  2


0.18691622808172897 0.19257442014557974
Epoch:  3


0.17741406286084974 0.16199939165796554
Epoch:  4


0.1728852359024254 0.15784840285778046
Epoch:  5


0.170641195935172 0.1605815121105739
Epoch:  6


0.1708949671403782 0.15620740609509604
Epoch:  7


0.16729321109282003 0.17306028306484222
Epoch:  8


0.16715102421270833 0.1537564375570842
Epoch:  9


0.1661280151960012 0.14485998558146612
Epoch:  10


0.1660753412021173 0.14333166288478033
Epoch:  11


0.16264437863955628 0.14072302835328238
Epoch:  12


0.16161143699207822 0.14548321068286896
Epoch:  13


0.1616606873434943 0.14341838657855988
Epoch:  14


0.16064431659273198 0.1386668479868344
Epoch:  15


0.1583271272279121 0.1379670330456325
Epoch:  16


0.1593802945839392 0.1398868773664747
Epoch:  17


0.1591929741002418 0.13621073429073607
Epoch:  18


0.15868830398933306 0.1385618331176894
Epoch:  19


0.1576215250266565 0.13497389746563776
Epoch:  20


0.15682517435099627 0.13586260697671346
Epoch:  21


0.15686884119703964 0.13615880267960684
Epoch:  22


0.15637346377243866 0.13497605387653624
Epoch:  23


0.15599674590536067 0.1363680756517819
Epoch:  24


0.1560850385073069 0.13119711514030183
Epoch:  25


0.15586475101677147 0.13197744637727737
Epoch:  26


0.15668755608635979 0.13450242153235845
Epoch:  27


0.1545212502415116 0.13032832741737366
Epoch:  28


0.1540737486368901 0.13300838747194835
Epoch:  29


0.1542278939002269 0.1337333278996604
Epoch:  30


0.15272554714937467 0.13299084348337992
Epoch:  31


0.1534880967559041 0.12907197965042932
Epoch:  32


0.15333673559330604 0.131365744130952
Epoch:  33


0.15269242710358388 0.13424670164074218
Epoch:  34


0.15267331938485842 0.12886270135641098
Epoch:  35


0.15188620058265892 0.12879756625209535
Epoch:  36


0.15208775006436012 0.13573459642274038
Epoch:  37


0.15201943387856354 0.12829438490527018
Epoch:  38


0.15292284375912435 0.1277994236775807
Epoch:  39


0.15281247528823647 0.13814582569258554
Epoch:  40


0.15327265939197024 0.13289178482123784
Epoch:  41


0.15242297383579048 0.13089671730995178
Epoch:  42


0.15092302657462456 0.1336480017219271
Epoch:  43


0.1511889398097992 0.13130149777446473
Epoch:  44


0.15029695992534226 0.13182689888136728
Epoch    44: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  45


0.1492368099657265 0.1318518796137401
Epoch:  46


0.1486728203457755 0.12995753224406922
Epoch:  47


0.14919741169826403 0.12946621435029165
Epoch:  48


0.14871668735065977 0.12783058094126837
Epoch:  49


0.14871534181607737 0.1280783074242728
Epoch:  50


0.14816778374684825 0.12807426495211466
Epoch    50: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  51


0.14808840485843452 0.12820374433483397
Epoch:  52


0.14842247238030304 0.12805356298174178
Epoch:  53


0.14846152510191943 0.1281936764717102
Epoch:  54


0.1478756303722794 0.127935956631388
Epoch:  55


0.14815406622113408 0.12751743836062296
Epoch:  56


0.1470002117189201 0.1278559970004218
Epoch:  57


0.14737513017010045 0.12784958524363382
Epoch:  58


0.14947134216089505 0.12825249774115427
Epoch:  59


0.14751275890582316 0.12805386100496566
Epoch:  60


0.14839839290928197 0.12825553864240646
Epoch:  61


0.14697270739722895 0.1278171549950327
Epoch    61: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  62


0.14738066977745778 0.12755858365978515
Epoch:  63


0.14894481285198316 0.1278714888862201
Epoch:  64


0.14818320604594978 0.12804930870022094
Epoch:  65


0.14729765661664912 0.12786555077348435
Epoch:  66


0.14842629593771858 0.12767778017691203
Epoch:  67


0.14801997590709376 0.12765911860125406
Epoch    67: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  68


0.14830163322590492 0.12791086839778082
Epoch:  69


0.14733811568569494 0.12796053822551454
Epoch:  70


0.14790189024564382 0.12789021325962885
Epoch:  71


0.1477508045531608 0.1275919782263892
Epoch:  72


0.1497273437074713 0.127622552216053
Epoch:  73


0.14679895341396332 0.12799273218427384
Epoch    73: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  74


0.14712857193238027 0.12776494771242142
Epoch:  75


0.14780716960494583 0.12781900699649537
Epoch:  76


0.14803319565347722 0.12781577025141036
Epoch:  77


0.14763184093140266 0.12759152054786682
Epoch:  78


0.14821544128495293 0.1278692079441888
Epoch:  79


0.14807380977514628 0.12778465024062566
Epoch:  80
