In [1]:
# Parameters
# until_x: position in the MobileNetV2 `features` stack. Task5Model re-initialises
# every child of `mv2.features` whose index is greater than this value, and the
# training loop embeds the value in the checkpoint filename.
until_x = 7


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Convert a per-annotation label table into model-ready arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation and one column per label. After
        ``reset_index()`` it must expose an ``audio_filename`` column.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps an "unknown" label column to the list of known label columns it
        invalidates. Wherever the unknown column is > 0.5, the mask of those
        known columns is set to 0 for that annotation. The unknown columns
        themselves are dropped from the targets.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each loaded array after it has been
        transposed (presumably time frames -- confirm against the feature
        extraction code).
    n_annotators : int, optional
        Number of annotation slots (1..n_annotators, in row order within each
        file) stacked on the last axis. Annotations beyond this count are
        ignored; files with fewer annotations make the stacking fail.

    Returns
    -------
    X : numpy.ndarray, shape (n_files, 1, rows, cols)
        Loaded arrays with a singleton axis inserted at position 1.
    y : numpy.ndarray, shape (n_files, n_labels, n_annotators)
        Label values of the kept (known) columns.
    y_mask : numpy.ndarray, same shape as ``y``
        1 where a label contributes to the loss, 0 where it is masked out.
    """
    unknown_columns = list(unknown_to_known.keys())

    df = df.reset_index()
    # 1-based running number of each annotation within its audio file.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from an all-ones mask with the same index/columns as the targets.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation slot first so that `.loc[[k]]` selects slot k for every
    # file, sorted by filename (identical row order for targets and mask).
    df = df.swaplevel(0, 1).sort_index()
    y_mask = y_mask.swaplevel(0, 1).sort_index()

    slots = range(1, n_annotators + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    # Load the features in the same filename order as the rows of y.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames], axis=0)
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (project-local class, default arguments).
# AudioDataset.__getitem__ applies it to every sample that goes through the
# augmentation branch.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, y, weights)`` triples, optionally augmented.

    Parameters
    ----------
    X : torch.Tensor
        Samples indexed along the first axis. The augmentation branch cycles
        along axis 1 of a single sample, so each sample is expected to be
        (1, rows, cols) -- TODO confirm against the caller.
    y, weights : torch.Tensor
        Targets and loss weights, indexed along the first axis like ``X``.
    transform : callable or None
        Called as ``transform(image=array)`` and must return a mapping with an
        ``'image'`` entry. When falsy, samples are returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converts the min-max scaled tensor to a PIL image before augmenting.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting if configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range > 0:
                sample = (sample - this_min) / value_range
            else:
                # A constant sample would give 0/0 = NaN here, and the NaN would
                # then be cast to bytes by the PIL conversion. Use zeros instead;
                # multiplying by the zero range below restores the constant.
                sample = torch.zeros_like(sample)

            # randomly cycle the file: the element at `offset` moves to index 0
            offset = int(np.random.randint(sample.shape[1]))
            sample = torch.roll(sample, shifts=-offset, dims=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Re-initialise ``layer`` if it provides a ``reset_parameters`` method.

    Objects without that attribute are left untouched. Intended to be passed
    to ``Module.apply`` so that every submodule is visited.
    """
    _absent = object()
    reset = getattr(layer, 'reset_parameters', _absent)
    if reset is _absent:
        return
    reset()


class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel inputs.

    Pipeline: a small 1x1-convolution stem maps 1 input channel to 3, the
    MobileNetV2 feature extractor processes the result, the feature map is
    max-pooled over its last two axes, and a two-layer head produces
    ``num_classes`` raw logits (no sigmoid is applied here).

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_after : int or None, optional
        Children of ``mv2.features`` whose position is greater than this value
        are re-initialised with ``weight_reset``; earlier ones keep their
        pretrained weights. ``None`` (the default) uses the notebook-level
        ``until_x`` parameter, which is the original behaviour.
    """

    def __init__(self, num_classes, reset_after=None):

        super().__init__()

        if reset_after is None:
            # Fall back to the notebook parameter so existing calls are unchanged.
            reset_after = until_x

        # 1 -> 10 -> 3 channels via 1x1 convolutions, after batch-normalising
        # the single input channel.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Reset every feature block that comes after position `reset_after`.
        for position, block in enumerate(self.mv2.features.children()):
            if position > reset_after:
                block.apply(weight_reset)

        # 1280 is the input width of the head; it must match the channel count
        # produced by `mv2.features`.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for input ``x``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis, then the (new) last axis.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [4]:
# Load and prepare data
with open('../../data/metadata.pkl', 'rb') as f:
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
taxonomy_columns = ['fine', 'coarse']

# Rows whose fine_id is 'X' are the "unknown" fine labels; all others are known.
unknown_fine = taxonomy_df.loc[taxonomy_df.fine_id == 'X', taxonomy_columns]
known_fine = taxonomy_df.loc[taxonomy_df.fine_id != 'X', taxonomy_columns]

# Pair every unknown fine label with the known fine labels that share its
# coarse class (merge suffixes: fine_x = unknown, fine_y = known).
fine_pairs = pd.merge(unknown_fine, known_fine, on='coarse', how='inner')
fine_pairs = fine_pairs.drop(columns='coarse')
unknown_to_known = {
    unknown_label: list(known_group)
    for unknown_label, known_group in fine_pairs.groupby('fine_x')['fine_y']
}
known_labels = taxonomy_df.loc[taxonomy_df.fine_id != 'X', 'fine'].tolist()

# Coarse and fine annotations side by side, one column per label.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any annotation row whose label values sum to exactly 37 is zeroed out in both
# splits (NOTE(review): 37 presumably means "every label set" -- confirm).
for split_df in (train_df, valid_df):
    suspicious_rows = split_df.sum(axis=1) == 37
    split_df.loc[suspicious_rows, :] = 0

In [6]:
# Build the model arrays for both splits. prepare_data returns the features with
# a singleton axis inserted at position 1, plus targets and loss masks that stack
# the first three annotation rows of every file on the last axis.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics come from the training split only and are reused for validation.
def _train_channel_stat(reducer):
    """Reduce train_X over everything but its last (128-wide) axis."""
    return reducer(train_X.reshape(-1, 128), axis=0).reshape(1, 1, 1, -1)


channel_means = _train_channel_stat(np.mean)
channel_stds = _train_channel_stat(np.std)


def _standardize(features):
    """Subtract the per-channel mean and divide by the per-channel std."""
    return (features - channel_means) / channel_stds


train_X = _standardize(train_X)
valid_X = _standardize(valid_X)

In [8]:
# Define the data augmentation transformations
augmentation_steps = [
    # small random shift / scale / rotation of the image
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library defaults
    GridDistortion(),
    # convert the augmented array back to a torch tensor
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
batch_size = 96

# Only the training split is augmented.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# Two independently shuffled views of the training set; the training loop zips
# them to obtain pairs of batches for mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook still runs (slowly) on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 output logits. NOTE(review): presumably the number of label columns left
# after prepare_data drops the unknown ones -- confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# Lower the learning rate when the value passed to `scheduler.step` has not
# improved for 5 consecutive calls; `verbose` logs every reduction.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)
# Element-wise loss (no reduction) so the training loop can apply its masks.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
early_stopping_patience = 15  # stop after this many epochs without a new best
mixup_alpha = 1               # both parameters of the Beta mixup distribution
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss averaged over all unmasked entries.

    ``labels`` and ``masks`` carry one slice per annotation on their last axis.
    The same ``outputs`` are scored against every slice, each element-wise loss
    is multiplied by its mask, and the total is divided by ``masks.sum()``.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training -------------------------------------------------------
    model.train()
    this_epoch_train_loss = 0
    for batch_1, batch_2 in zip(train_loader_1, train_loader_2):
        inputs_1, labels_1, masks_1 = batch_1
        inputs_2, labels_2, masks_2 = batch_2

        # mixup: one mixing coefficient per example, shared by the inputs,
        # the labels and the masks of that example
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, inputs_1.shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.view(-1, 1, 1, 1)
        lam_targets = lam.view(-1, 1, 1)

        inputs = (lam_inputs * inputs_1) + ((1 - lam_inputs) * inputs_2)
        labels = (lam_targets * labels_1) + ((1 - lam_targets) * labels_2)
        masks = (lam_targets * masks_1) + ((1 - lam_targets) * masks_2)

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation (no augmentation, no mixup, no gradients) -----------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint only when the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.654174485206604 0.5872676372528076
Epoch:  1


0.4353247284889221 0.2997423350811005
Epoch:  2


0.2240542858839035 0.1817444920539856
Epoch:  3


0.17697773337364198 0.16160567998886108
Epoch:  4


0.16715034127235412 0.14936913847923278
Epoch:  5


0.16460008442401886 0.17382918000221254
Epoch:  6


0.16047156631946563 0.14727220833301544
Epoch:  7


0.15997213542461394 0.14197933673858643
Epoch:  8


0.15945943236351012 0.1365563839673996
Epoch:  9


0.15677172303199768 0.1344014972448349
Epoch:  10


0.1566392648220062 0.13807913213968276
Epoch:  11


0.15598105788230895 0.14232993721961976
Epoch:  12


0.1558534872531891 0.133800907433033
Epoch:  13


0.15420453011989593 0.1318406954407692
Epoch:  14


0.1539670079946518 0.13491858243942262
Epoch:  15


0.1531527256965637 0.13071779012680054
Epoch:  16


0.15254899203777314 0.13730012476444245
Epoch:  17


0.15274105489253997 0.13100600093603135
Epoch:  18


0.15122908771038054 0.12937643080949784
Epoch:  19


0.1512981164455414 0.1358096942305565
Epoch:  20


0.15288926124572755 0.14111704230308533
Epoch:  21


0.15166754126548768 0.12842073291540146
Epoch:  22


0.15019711077213288 0.12799427956342696
Epoch:  23


0.1505524444580078 0.12914072275161742
Epoch:  24


0.15006860673427583 0.13088585287332535
Epoch:  25


0.150650075674057 0.13529269397258759
Epoch:  26


0.14949273705482483 0.13189567178487777
Epoch:  27


0.14856204748153687 0.12808500975370407
Epoch:  28


0.14922637581825257 0.13219046890735625
Epoch    28: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  29


0.14734383344650268 0.12500039339065552
Epoch:  30


0.14614680051803589 0.1250358209013939
Epoch:  31


0.14755022823810576 0.1244408756494522
Epoch:  32


0.14506401598453522 0.12373645901679993
Epoch:  33


0.14582574546337126 0.12395149320363999
Epoch:  34


0.14591784834861754 0.1240038588643074
Epoch:  35


0.1440974736213684 0.12344696819782257
Epoch:  36


0.14570289850234985 0.12363161146640778
Epoch:  37


0.1462504404783249 0.12419212013483047
Epoch:  38


0.1456063497066498 0.12428818643093109
Epoch:  39


0.14522553861141205 0.12414292246103287
Epoch:  40


0.1455353754758835 0.1238433688879013
Epoch:  41


0.1447381579875946 0.12433985322713852
Epoch    41: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  42


0.14654158532619477 0.12406449764966965
Epoch:  43


0.1447795331478119 0.12375544160604476
Epoch:  44


0.14415320098400117 0.1234275221824646
Epoch:  45


0.14475923001766206 0.12358072847127914
Epoch:  46


0.14532083988189698 0.12365924119949341
Epoch:  47


0.1448271715641022 0.12349953651428222
Epoch:  48


0.14422512888908387 0.12403195202350617
Epoch:  49


0.14428379476070405 0.1234430804848671
Epoch:  50


0.14511980652809142 0.12371373474597931
Epoch    50: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  51


0.14402074575424195 0.12358058393001556
Epoch:  52


0.1447181248664856 0.12338947504758835
Epoch:  53


0.14541371643543244 0.12363799810409545
Epoch:  54


0.14439815938472747 0.1233740657567978
Epoch:  55


0.14475427985191344 0.12310791462659836
Epoch:  56


0.14442785501480102 0.12337640076875686
Epoch:  57


0.14450110673904418 0.12372643053531647
Epoch:  58


0.1442873579263687 0.12365712821483613
Epoch:  59


0.14479953289031983 0.1234547883272171
Epoch:  60


0.14544550240039825 0.12329604178667068
Epoch:  61


0.14509072184562682 0.12341722846031189
Epoch    61: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  62


0.14477682650089263 0.12357177287340164
Epoch:  63


0.1472923243045807 0.12364326119422912
Epoch:  64


0.14447934925556183 0.12352858036756516
Epoch:  65


0.14376902997493743 0.12373463064432144
Epoch:  66


0.14400776386260986 0.12366198599338532
Epoch:  67


0.1452851915359497 0.12351720333099366
Epoch    67: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  68


0.1455726832151413 0.12344907075166703
Epoch:  69


0.14426711976528167 0.12346470952033997
Epoch:  70
