In [1]:
# Parameters
# until_x: position threshold within the children of the MobileNetV2
# `features` module. Task5Model.__init__ re-initialises every child whose
# position is greater than this value; children at or below it keep their
# loaded weights. The value is also embedded in the checkpoint filename
# written by the training loop.
until_x = 10


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec', n_frames=635):
    """Build model inputs, per-annotation targets and loss masks from a label table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation, indexed by ``audio_filename``; the columns are
        label indicators. The rows of each file are numbered 1, 2, 3, ... in
        order of appearance and only annotations 1-3 are used, so every file
        needs (at least) three rows. The caller's frame is not modified.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known label
        columns it leaves undetermined. The unknown columns are removed from
        the targets; wherever an unknown column is > 0.5 the mask of the
        associated known columns is set to 0 for that annotation.
    data_dir : str, optional
        Directory containing one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each loaded array after it has been
        transposed. All files must yield the same number of rows.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, rows, cols)``, files sorted by ``audio_filename``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, 3)``; last axis is the annotation.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target counts towards the loss and
        0 where it is hidden by an unknown label.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the annotations of every file (1-based) and index by (file, number).
    df = df.reset_index()
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the unknown indicator columns off; they only drive the mask.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    y_mask = pd.DataFrame(1.0, index=df.index, columns=df.columns)
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that `.loc[[k]]` selects the k-th
    # annotation of every file, with files in sorted (and hence aligned) order.
    df = df.swaplevel(0, 1).sort_index()
    y_mask = y_mask.swaplevel(0, 1).sort_index()

    annotation_ids = (1, 2, 3)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in annotation_ids], axis=2)

    # Load the arrays in the same file order as the rows of `y`.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance built with the class's default arguments.
# AudioDataset.__getitem__ calls it on each sample when a transform is set.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples indexed along axis 0.

    Parameters
    ----------
    X : torch.Tensor
        Samples stacked along the first axis. The augmentation path indexes
        each sample as ``(1, length, width)`` and shifts it along axis 1.
    y : torch.Tensor
        Targets, indexed along the first axis in step with ``X``.
    weights : torch.Tensor
        Per-target weights (used as loss masks by the training loop), indexed
        along the first axis in step with ``X``.
    transform : callable or None
        Augmentation called as ``transform(image=array)`` and expected to
        return a mapping with an ``'image'`` entry. When falsy, samples are
        returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Tensor -> PIL image converter used ahead of the augmentation call.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the (optionally augmented) sample ``idx`` with its target and weights."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation to [0, 1]
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would give 0 / 0 = NaN; with a unit range it
                # maps to all zeros and is restored exactly by the revert step.
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: circular shift along axis 1 so that a
            # random position becomes the new start
            shift = int(np.random.randint(sample.shape[1]))
            sample = torch.roll(sample, shifts=-shift, dims=1)

            # apply albumentations transforms
            # NOTE(review): the PIL round trip presumably quantises the values
            # to 8 bits - confirm this precision loss is acceptable.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)['image']
            # Re-add the leading axis and swap the last two axes; presumably the
            # tensor conversion in the transform returns them transposed -
            # TODO confirm.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code - only open a metadata file
# that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
label_columns = ['fine', 'coarse']

# Rows whose fine_id is 'X' are the "unknown" fine labels; all others are known.
unknown_rows = taxonomy_df.loc[taxonomy_df.fine_id == 'X', label_columns]
known_rows = taxonomy_df.loc[taxonomy_df.fine_id != 'X', label_columns]

# Pair every unknown fine label with the known fine labels that share its
# coarse class, then collect them as {unknown fine label: [known fine labels]}.
paired_labels = (
    unknown_rows
    .merge(known_rows, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = paired_labels.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = taxonomy_df.loc[taxonomy_df.fine_id != 'X'].fine.tolist()

# Coarse and fine label tables side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [5]:
# manual correction for one data point: any row whose label values sum to 37
# gets all of its labels set to 0 (applied in place to both splits)
for split_df in (train_df, valid_df):
    row_sums_to_37 = split_df.sum(axis=1) == 37
    split_df.loc[row_sums_to_37, :] = 0

In [6]:
# Build the input arrays (X), the per-annotation targets (y) and the matching
# loss masks (y_mask) for the training and the validation split.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# One mean / standard deviation per position of the last axis, estimated on
# the training set only and applied to both splits. The channel count is read
# from the data instead of being hard-coded.
# NOTE: this cell overwrites train_X / valid_X, so re-running it without
# re-running the cell above normalises the data twice.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# The pipeline is handed to the training AudioDataset only; the validation
# dataset is created with transform=None and therefore never augmented.
albumentations_transform = Compose([
    # random shift / scale / rotation with limits 0.1 / 0.1 / 0.5
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library's default arguments
    GridDistortion(),
    # last step: convert the augmented image to a torch tensor
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Convert each split's arrays to float tensors in (inputs, targets, masks) order.
train_tensors = [torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask)]

# Only the training split is augmented.
train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

batch_size = 64
val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to draw the pairs of batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it exposes ``reset_parameters``.

    Objects without that attribute are left untouched, which makes the
    function safe to pass to ``Module.apply``. Returns ``None``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Classifier that feeds single-channel inputs through MobileNetV2 features.

    Pipeline: ``bw2col`` (1 -> 3 channels) -> ``mv2.features`` -> maximum over
    the last two axes -> ``final`` (1280 -> 512 -> ``num_classes`` logits).

    The children of ``mv2.features`` whose position is greater than the
    module-level ``until_x`` are re-initialised with ``weight_reset``; the
    remaining children keep the weights they were loaded with.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Submodules are created in the original order (bw2col, mv2, final).
        # Batch-norm plus two 1x1 convolutions mapping 1 channel to 3.
        channel_expander = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*channel_expander)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        classifier_head = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*classifier_head)

        # Reset after ith layer of mv2
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the ``num_classes`` logits for a batch of inputs ``x``."""
        three_channel = self.bw2col(x)
        feature_maps = self.mv2.features(three_channel)
        # Maximum over the last axis, then over the (new) last axis.
        max_over_last, _ = feature_maps.max(dim=-1)
        pooled, _ = max_over_last.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 output logits; presumably the number of label columns that remain after
# prepare_data drops the unknown ones - TODO confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad variant enabled, over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped once per epoch with the validation loss by the training loop;
# patience=5 and verbose=True, every other setting is the library default.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them by the label masks before reducing them itself.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss before stopping
alpha = 1  # both shape parameters of the Beta distribution the mixup weights are drawn from
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over all annotations of a batch.

    ``criterion`` must return element-wise losses. ``labels`` and ``masks``
    are indexed as ``[:, :, k]`` for annotation ``k``, and each such slice is
    compared against ``outputs``. The masked losses of all annotations are
    summed and divided by the sum of the mask, so masked-out targets add
    nothing to the numerator. A batch whose mask sums to zero would divide
    by zero.
    """
    per_annotation = [
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2])]
    return sum(per_annotation) / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training phase ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # one mixing weight per sample, shared by inputs, labels and masks
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(criterion, outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.detach().cpu().numpy()

    # ---- validation phase (no gradients, no augmentation, no mixup) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.detach().cpu().numpy()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6058214057136226 0.39766914929662434
Epoch:  1


0.27582526448610667 0.17372683329241617
Epoch:  2


0.17504923971923622 0.14645096021039145
Epoch:  3


0.16638284478638624 0.1538801853145872
Epoch:  4


0.16246614343411214 0.14518469784940993
Epoch:  5


0.1606309534730138 0.13678774344069616
Epoch:  6


0.1585890634639843 0.1311614896569933
Epoch:  7


0.1573760267850515 0.1308384154524122
Epoch:  8


0.15667837133278717 0.1321984295334135
Epoch:  9


0.15489304911445928 0.1312796590583665
Epoch:  10


0.15464666928793933 0.13155065583331244
Epoch:  11


0.15361255689247236 0.1304546988436154
Epoch:  12


0.1532453431470974 0.13401560485363007
Epoch:  13


0.15352452123487317 0.1468870916536876
Epoch:  14


0.1526420277518195 0.12679616787603923
Epoch:  15


0.1523694955819362 0.13238188305071422
Epoch:  16


0.15108957161774506 0.13012335555894033
Epoch:  17


0.15105279071910963 0.12966166117361613
Epoch:  18


0.14999450461284533 0.12895611886467254
Epoch:  19


0.1513855964750857 0.12875249981880188
Epoch:  20


0.14921460159727046 0.1272869503923825
Epoch    20: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  21


0.14864835505549973 0.1241991754089083
Epoch:  22


0.14859269962117477 0.12348470091819763
Epoch:  23


0.14771832525730133 0.12340334909302848
Epoch:  24


0.1474954272444184 0.1232686915567943
Epoch:  25


0.14670257149515925 0.1228499465755054
Epoch:  26


0.14596030277174873 0.12268199665205819
Epoch:  27


0.14687786875544367 0.12267663862024035
Epoch:  28


0.14591234479401563 0.12260352820158005
Epoch:  29


0.14592946622822736 0.12232051470449992
Epoch:  30


0.1456530871423515 0.12284996999161583
Epoch:  31


0.14589252866603233 0.12268040754965373
Epoch:  32


0.14590590628417763 0.123115031846932
Epoch:  33


0.14462587801185814 0.1229289738195283
Epoch:  34


0.14661251813978762 0.12226231183324542
Epoch:  35


0.14601322686350024 0.1225387869136674
Epoch:  36


0.14507499216376124 0.12184394683156695
Epoch:  37


0.14571979118360057 0.12374023348093033
Epoch:  38


0.14451653812382673 0.12240463814565114
Epoch:  39


0.14639200632636612 0.12179666012525558
Epoch:  40


0.14585947627956802 0.1227903174502509
Epoch:  41


0.14538583884368073 0.12282942022596087
Epoch:  42


0.14435866152918017 0.1228050919515746
Epoch:  43


0.14427572367964564 0.12241472942488534
Epoch:  44


0.14481093996279948 0.12292380630970001
Epoch:  45


0.14368561312959 0.12233485600778035
Epoch    45: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  46


0.1434338866053401 0.1222087677036013
Epoch:  47


0.14460646179882256 0.12205461093357631
Epoch:  48


0.14427106485173508 0.12206330043928963
Epoch:  49


0.14528450692022168 0.12198832737548011
Epoch:  50


0.14407550523409973 0.12210727695907865
Epoch:  51


0.142435988058915 0.1223141114626612
Epoch    51: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  52


0.1442799537971213 0.12210807204246521
Epoch:  53


0.14569804716754603 0.12224539688655309
Epoch:  54


0.1435242790628124 0.12240802070924214
Epoch:  55


0.14384902449878487 0.12226819885628563
Epoch:  56


0.1432459169142955 0.12203850490706307
Epoch:  57


0.14370539864978274 0.12224792156900678
Epoch    57: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  58


0.14364932638567848 0.12217272498777934
Epoch:  59


0.1437103933579213 0.12215023700680051
Epoch:  60


0.14480737053059242 0.12215474567243031
Epoch:  61


0.14283185512632937 0.12212426108973366
Epoch:  62


0.1440981541936462 0.1225692800113133
Epoch:  63


0.14464801872098768 0.12221800216606685
Epoch    63: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  64
