In [1]:
# Parameters
# Index into the children of MobileNetV2's `features` module: Task5Model
# re-initialises every child whose index is greater than this value, and the
# value is also embedded in the checkpoint file name written during training.
until_x = 18


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Args:
        df: DataFrame whose index is named ``audio_filename`` and holds one row
            per annotation of a recording; the columns are label values. The
            caller's frame is not modified.
        unknown_to_known: Mapping from an "unknown" label column to the list of
            known label columns it covers. The unknown columns are removed from
            the targets; wherever an unknown column is > 0.5 the listed known
            columns are masked out (mask value 0) for that annotation.
        spec_dir: Directory containing ``<audio_filename>.npy`` spectrograms.
        n_frames: Maximum number of frames kept after transposing each stored
            array (longer arrays are truncated, shorter ones are not padded).
        n_annotations: Number of annotation rows per recording to stack. Rows
            are numbered 1..n in order of appearance; every recording must
            provide all of them.

    Returns:
        Tuple ``(X, y, y_mask)``:
            X: array of shape (n_recordings, 1, frames, bins).
            y: array of shape (n_recordings, n_known_labels, n_annotations).
            y_mask: array shaped like ``y``; 1 where the target counts towards
                the loss and 0 where it is masked.
    """
    df = df.reset_index()
    # Number the annotation rows of each recording 1, 2, 3, ... in row order.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split off the "unknown" indicator columns; they only drive the mask.
    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns]
    df = df.drop(columns=unknown_columns)

    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so one annotation can be sliced at a
    # time; sorting keeps the recordings in the same order for every slice.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.concatenate(
        [df.loc[[k], :].values[..., np.newaxis] for k in annotation_ids],
        axis=2)
    y_mask = np.concatenate(
        [y_mask.loc[[k], :].values[..., np.newaxis] for k in annotation_ids],
        axis=2)

    # Load the spectrograms in the same recording order as the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :],
            axis=0)
        for name in filenames])
    X = np.expand_dims(X, axis=1)  # add the single-channel axis

    return X, y, y_mask


# Shared RandomErasing instance (project-local module, default arguments);
# AudioDataset.__getitem__ applies it as the last augmentation step.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram samples with their targets and loss weights.

    Without a ``transform`` the stored sample is returned untouched. With one,
    each item is augmented on the fly: min-max scaling, a random circular shift
    along axis 1, the ``transform`` pipeline, random erasing (module-level
    ``random_erasing``), and finally the inverse of the min-max scaling.

    Args:
        X: Samples; ``X.shape[0]`` is the dataset size. The augmentation path
            uses ``torch.cat`` and ``ToPILImage``, so it expects tensors;
            this notebook passes items shaped (1, frames, bins).
        y: Targets, indexed along the first axis like ``X``.
        weights: Per-sample loss weights/masks, indexed like ``X``.
        transform: Optional callable invoked as ``transform(image=array)`` that
            returns a dict holding the result under ``'image'``.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converts the scaled tensor to an image so the array-based transform
        # can consume it (NOTE(review): presumably an 8-bit image, i.e. lossy
        # -- confirm against the torchvision version in use).
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for ``idx``, augmented if a
        transform was configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would give 0/0 = NaN; with a unit range it
                # scales to all zeros and is restored to its value below.
                value_range = value_range + 1
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # Add a leading axis and swap the last two axes back
            # (NOTE(review): presumably undoes the axis move done by the
            # pipeline's ToTensor step -- confirm).
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# Split the taxonomy into the catch-all rows (fine_id 'X') and the rest.
taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# Pair every 'X' label with the other fine labels of the same coarse class,
# then collect those into a {unknown label: [known labels]} dict.
same_coarse_pairs = pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
unknown_to_known = (
    same_coarse_pairs
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_rows.fine.tolist()

# Put the coarse and fine annotation columns side by side for each split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Rows whose label values add up to 37 are zeroed out in both splits
# (NOTE(review): 37 presumably means "every label set" -- confirm).
for labels_df in (train_df, valid_df):
    flagged_rows = labels_df.sum(axis=1) == 37
    labels_df.loc[flagged_rows, :] = 0

In [6]:
# Build arrays for both splits. Per prepare_data: X gets a singleton axis
# inserted at position 1, while y and y_mask stack the three annotation rows of
# each recording along their last axis.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are taken per position of the last axis, over the training split
# only, and then applied to both splits. The width of that axis is read from
# the data instead of being hard-coded.
# NOTE: this cell overwrites train_X / valid_X, so re-running it without
# re-running the previous cell normalises the data a second time.
n_channels = train_X.shape[-1]
train_flat = train_X.reshape(-1, n_channels)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Steps run in list order: a small shift/scale/rotate, a grid distortion, and
# finally the conversion to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
batch_size = 64

# Only the training set is augmented; validation samples are served as stored.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain the pairs of batches it blends with mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` through its own ``reset_parameters`` method.

    Objects that do not expose ``reset_parameters`` are left untouched, which
    makes the function safe to pass to ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Single-channel front end + MobileNetV2 feature extractor + linear head.

    ``forward`` maps a 1-channel input to 3 channels with two 1x1 convolutions,
    runs ``mv2.features``, takes the maximum over the last two axes, and feeds
    the resulting 1280 features to the classification head. The ``classifier``
    part of the MobileNetV2 instance is not used in ``forward``.

    Args:
        num_classes: Number of output logits.
        reset_after: Children of ``mv2.features`` whose index is greater than
            this value have their pretrained weights re-initialised. Defaults
            to the module-level ``until_x`` when omitted.
    """

    def __init__(self, num_classes, reset_after=None):

        super().__init__()

        if reset_after is None:
            reset_after = until_x

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset after ith layer of mv2: every later child (and, via apply, all
        # of its sub-modules) is re-initialised.
        for index, block in enumerate(self.mv2.features.children()):
            if index > reset_after:
                block.apply(weight_reset)

    def forward(self, x):
        """Return the logits of shape (batch, num_classes) for input ``x``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis, then the new last axis.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [13]:
# Instantiate the model
# One logit per label column: the loss compares the model output with
# train_y[:, :, k], so the output width must equal train_y.shape[1].
model = Task5Model(train_y.shape[1]).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over every model parameter.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Lowers the learning rate once the monitored value (the validation loss passed
# to scheduler.step in the training loop) has stopped improving for more than
# `patience` epochs; verbose=True prints a message on every reduction.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the label masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1  # both shape parameters of the Beta distribution used by mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked loss averaged over every annotation set.

    ``criterion`` is evaluated element-wise against each slice
    ``labels[:, :, k]``, weighted by ``masks[:, :, k]`` and summed; the total
    is divided by ``masks.sum()``.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches using one
        # Beta-distributed coefficient per sample
        mixup_vals = torch.Tensor(
            np.random.beta(mixup_alpha, mixup_alpha, i1[0].shape[0]))

        lam = mixup_vals.view(-1, 1, 1, 1)
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = mixup_vals.view(-1, 1, 1)
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.detach().cpu().numpy()

    # ---- validation pass (no gradients, no augmentation, no mixup) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.detach().cpu().numpy()

    # Average the per-batch losses over the number of batches.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Log before the early-stopping check so the last epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6091123674366925 0.42206006390707834
Epoch:  1


0.2784546831169644 0.17853299421923502
Epoch:  2


0.1694773826244715 0.14122970082930156
Epoch:  3


0.1606108167686978 0.13792302353041513
Epoch:  4


0.15820287409666423 0.13102442451885768
Epoch:  5


0.15567815545442942 0.13210008719137736
Epoch:  6


0.1538731463857599 0.138004475406238
Epoch:  7


0.1533502183250479 0.12850008585623332
Epoch:  8


0.1512737539974419 0.1339869424700737
Epoch:  9


0.15233288785895785 0.12778688754354203
Epoch:  10


0.15055273151075518 0.13016105656112945
Epoch:  11


0.15247906583386497 0.12907078755753382
Epoch:  12


0.15209804394760648 0.12742727249860764
Epoch:  13


0.1506835093369355 0.12991850184542791
Epoch:  14


0.15003541192492922 0.12930199929646083
Epoch:  15


0.14967529958969839 0.12736587119953974
Epoch:  16


0.14810096089904373 0.12804180809429713
Epoch:  17


0.14848885987256025 0.12885896861553192
Epoch:  18


0.14724261051899679 0.1280527487397194
Epoch:  19


0.14759486672040578 0.12610159601484025
Epoch:  20


0.14791469839779106 0.12513752707413264
Epoch:  21


0.14640279597527273 0.1254887995975358
Epoch:  22


0.14689147915389086 0.12392163489546094
Epoch:  23


0.14659541321767344 0.12508103251457214
Epoch:  24


0.14613173700667717 0.12692712673119136
Epoch:  25


0.14607611338834506 0.1216415890625545
Epoch:  26


0.14565928844181267 0.12823087509189332
Epoch:  27


0.1470203750036858 0.1263589933514595
Epoch:  28


0.14620073864588867 0.12476353347301483
Epoch:  29


0.1450944997974344 0.12328320954527174
Epoch:  30


0.14626145745451385 0.12351778149604797
Epoch:  31


0.14560071882363912 0.12560153752565384
Epoch    31: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  32


0.14383953366730665 0.12055869081190654
Epoch:  33


0.14132696148511525 0.11987932239259992
Epoch:  34


0.14144766089078542 0.12018589462552752
Epoch:  35


0.14255155300771868 0.1204193766627993
Epoch:  36


0.14137569996150764 0.12005328280585152
Epoch:  37


0.14095784602938471 0.12041447737387248
Epoch:  38


0.14103659745809194 0.12047467061451503
Epoch:  39


0.14113562594394427 0.11965119412967137
Epoch:  40


0.14172210806124919 0.12016414318765913
Epoch:  41


0.14098788193754247 0.12039896100759506
Epoch:  42


0.13829168274595932 0.1194138133100101
Epoch:  43


0.13929838425404317 0.12006742613656181
Epoch:  44


0.139667179133441 0.1203614952308791
Epoch:  45


0.1400297541876097 0.12007675107036318
Epoch:  46


0.13945921147997314 0.12059996277093887
Epoch:  47


0.13966579594322154 0.12030349778277534
Epoch:  48


0.13874656566091487 0.12018691854817527
Epoch    48: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  49


0.13877557218074799 0.11982728647334236
Epoch:  50


0.13822759325439865 0.12014093143599373
Epoch:  51


0.13942741139515027 0.1201291201370103
Epoch:  52


0.13857331227611852 0.12013190452541624
Epoch:  53


0.13837665440262975 0.12008506591830935
Epoch:  54


0.13906676221538233 0.1200723797082901
Epoch    54: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  55


0.13967763854039683 0.1201942435332707
Epoch:  56


0.13961324055452604 0.12009424184049879
Epoch:  57


0.13852912791677424 0.12010313783373151
Epoch:  58


0.13966151689355438 0.12016575357743672
Epoch:  59


0.13789038642032728 0.12001126578875951
Epoch:  60


0.1386279826228683 0.1201012794460569
Epoch    60: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  61


0.1397993322965261 0.12018460141760963
Epoch:  62


0.13886377779213158 0.12014308997562953
Epoch:  63


0.1385701610832601 0.12003656157425471
Epoch:  64


0.13951487718401728 0.11997222048895699
Epoch:  65


0.13807589701704076 0.12030015779393059
Epoch:  66


0.1393855393335626 0.1202930041721889
Epoch    66: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  67
