In [1]:
# Parameters
# until_x: position within the MobileNetV2 feature blocks. Task5Model re-initialises every
# block whose position is greater than this value and keeps the pretrained weights of the
# others. The value is also embedded in the checkpoint filename written by the training loop.
until_x = 10


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec',
                 n_frames=635,
                 n_annotations=3):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column (i.e. the index is named ``audio_filename``);
        every other column is a label. The caller's frame is not modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be masked out whenever that unknown label is set (> 0.5).
        The unknown columns themselves are dropped from the targets.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed spectrogram.
    n_annotations : int, optional
        Number of annotation rows per recording that are stacked on the last
        axis of ``y`` / ``y_mask``. Every recording needs at least this many rows.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_recordings, 1, n_frames, n_bins)``.
    y : numpy.ndarray
        Shape ``(n_recordings, n_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label takes part in the loss and 0
        where it is hidden by an unknown label in the same annotation.
    """
    unknown_labels = list(unknown_to_known.keys())

    # Number the annotations of each recording 1, 2, 3, ... in order of appearance.
    # reset_index() returns a new frame, so the caller's data stays untouched.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    df_unknown = df.loc[:, unknown_labels].copy()
    df = df.drop(columns=unknown_labels)

    # Start from an all-ones mask and zero the known labels wherever the
    # matching unknown label was annotated.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that .loc[[k]] selects the k-th
    # annotation of every recording, sorted by filename.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.concatenate(
        [df.loc[[k], :].values[..., np.newaxis] for k in annotation_ids],
        axis=2)
    y_mask = np.concatenate(
        [y_mask.loc[[k], :].values[..., np.newaxis] for k in annotation_ids],
        axis=2)

    # Load the spectrograms in the same (filename-sorted) order as y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :],
            axis=0)
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (constructed with its default arguments).
# AudioDataset.__getitem__ calls it on each sample after the albumentations transforms.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weights)`` triples.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis; ``X[idx]`` is one sample. The
        augmentation path treats axis 1 of a sample as the axis to cycle.
    y : torch.Tensor
        Targets, indexed along the first axis like ``X``.
    weights : torch.Tensor
        Per-target weights / masks, indexed along the first axis like ``X``.
    transform : callable or None
        Albumentations-style callable invoked as ``transform(image=array)``
        and returning a dict with an ``'image'`` entry. When falsy, samples
        are returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # converts the scaled tensor to a PIL image before the albumentations step
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the (optionally augmented) sample with its target and weights."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation to [0, 1]
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would fill the
            # sample with NaNs. Scale by 1 instead: the sample becomes all
            # zeros and is mapped back to its constant value below.
            scale = value_range if value_range > 0 else torch.ones_like(value_range)
            sample = (sample - this_min) / scale

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on numpy images)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the leading channel axis and the original axis order
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level ``random_erasing`` instance)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code - only open a metadata.pkl you trust.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the 'X' fine labels and all remaining fine labels.
taxonomy_df = metadata['taxonomy_df']
unknown_fine = taxonomy_df.loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy_df.loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']]

# Pair every 'X' fine label with the other fine labels sharing its coarse class;
# the merge suffixes the two 'fine' columns as fine_x ('X' side) and fine_y.
unknown_known_pairs = (
    pd.merge(unknown_fine, known_fine, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = (
    unknown_known_pairs
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_fine.fine.tolist()

# Coarse and fine annotations side by side, one column per label.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [5]:
# manual correction for one data point:
# every row whose labels sum to exactly 37 is reset to all zeros, in both splits
for label_df in (train_df, valid_df):
    rows_to_clear = label_df.sum(axis=1) == 37
    label_df.loc[rows_to_clear, :] = 0

In [6]:
# For each split prepare_data returns:
#   X      - spectrograms, shape (n_recordings, 1, 635, n_bins)
#   y      - targets, shape (n_recordings, n_labels, 3), one slice per annotation
#   y_mask - same shape as y; 0 where a known label is hidden by an unknown ('X') label
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed per channel (last axis) on the training split only and
# then applied to both splits. The channel count is read from the data instead of
# being hard-coded, so the cell works for any spectrogram width.
# NOTE: this cell overwrites train_X / valid_X; re-running it without re-running
# the prepare_data cell normalises the data a second time.
n_channels = train_X.shape[-1]
train_X_flat = train_X.reshape(-1, n_channels)
channel_means = train_X_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in order by AudioDataset when a transform is supplied: a small random
# shift / scale / rotation, a grid distortion, then conversion to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches it mixes together.
val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# `cuda` states the preference; the GPU is only selected when one is actually
# available, so the notebook still runs (on CPU) on machines without CUDA.
cuda = True
device = torch.device('cuda:0' if cuda and torch.cuda.is_available() else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` through its own ``reset_parameters`` method.

    Objects that do not provide ``reset_parameters`` are left untouched, which
    makes the function safe to pass to ``nn.Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    The network consists of three stages:

    * ``bw2col`` - batch norm followed by two 1x1 convolutions that map the
      single input channel to the three channels MobileNetV2 expects;
    * ``mv2`` - a pretrained torchvision MobileNetV2, of which only the
      ``features`` part is used in ``forward``;
    * ``final`` - a two-layer classifier producing ``num_classes`` logits.

    Feature blocks of ``mv2`` located after position ``until_x`` (module-level
    setting) are re-initialised with ``weight_reset``.
    """

    def __init__(self, num_classes):
        super().__init__()

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Keep the pretrained weights up to and including block `until_x`;
        # everything after it starts from a fresh initialisation.
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch of single-channel inputs."""
        feature_maps = self.mv2.features(self.bw2col(x))
        # global max pooling: reduce the last axis, then the new last axis
        pooled = feature_maps.max(dim=-1)[0].max(dim=-1)[0]
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 output logits: this must equal the number of label columns in train_y / valid_y,
# because the training loop compares the outputs element-wise with labels[:, :, k].
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(
    params=model.parameters(),
    lr=1e-3,
    amsgrad=True,
)
# The scheduler is stepped with the validation loss once per epoch by the training loop.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    patience=5,
    verbose=True,
)
# reduction='none' keeps the element-wise losses so they can be multiplied by the
# label masks before being summed.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def masked_bce_loss(outputs, labels, masks):
    """Masked loss averaged over all unmasked (label, annotation) cells.

    ``outputs`` holds one logit per label; ``labels`` and ``masks`` carry an
    extra trailing axis with one slice per annotation set. The element-wise
    ``criterion`` is evaluated against every annotation slice, multiplied by
    the matching mask slice, summed, and divided by the total mask weight.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


epochs = 100
alpha = 1                      # Beta(alpha, alpha) parameter of the mixup weights
early_stopping_patience = 25   # epochs without a new best validation loss
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches with per-sample weights
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # mean of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # report before the early-stopping check so the final epoch is logged as well
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # checkpoint the weights whenever the validation loss reaches a new minimum
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6034075118399955 0.4904918202332088
Epoch:  1


0.27396204181619593 0.19594988439764296
Epoch:  2


0.17610188996469653 0.15872664962496078
Epoch:  3


0.1664423527749809 0.14975476264953613
Epoch:  4


0.16283282758416356 0.15033998446805136
Epoch:  5


0.15999006258474813 0.13822143312011445
Epoch:  6


0.15823419190741875 0.1469348520040512
Epoch:  7


0.15573985431645368 0.13977873431784765
Epoch:  8


0.15649748090151194 0.13448164292744227
Epoch:  9


0.15559797069510897 0.1331207411629813
Epoch:  10


0.15331121433425593 0.13336067220994405
Epoch:  11


0.15232278607987068 0.13223006682736532
Epoch:  12


0.15308159146759961 0.127142248409135
Epoch:  13


0.1513916137250694 0.1267046577164105
Epoch:  14


0.151828333735466 0.1309889874288014
Epoch:  15


0.1521579450046694 0.1298377588391304
Epoch:  16


0.15157130320329923 0.1254952773451805
Epoch:  17


0.150341165226859 0.12882722594908305
Epoch:  18


0.15009998832200025 0.12654457241296768
Epoch:  19


0.15045201174310735 0.12649460030453546
Epoch:  20


0.14845689647906535 0.1291630736419133
Epoch:  21


0.149768567568547 0.14080878240721567
Epoch:  22


0.15001197642571218 0.12929601115839823
Epoch    22: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  23


0.14781501003213832 0.12418542163712638
Epoch:  24


0.1460143114263947 0.12267634911196572
Epoch:  25


0.14627756500566327 0.1224986846957888
Epoch:  26


0.14689179852202133 0.12243954305137907
Epoch:  27


0.14764407679841324 0.12231126321213585
Epoch:  28


0.14541841842032768 0.12232438368456704
Epoch:  29


0.1440736011073396 0.1219755049262728
Epoch:  30


0.14555658077871478 0.12159929424524307
Epoch:  31


0.1449345318046776 0.12243754416704178
Epoch:  32


0.1442907094149976 0.12192881107330322
Epoch:  33


0.14498486108071096 0.12219478615692683
Epoch:  34


0.14470648161462835 0.12174001442534584
Epoch:  35


0.14629811169327916 0.12170450602258955
Epoch:  36


0.1453593196095647 0.1217783774648394
Epoch    36: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  37


0.1449601154875111 0.1216093237910952
Epoch:  38


0.14477418282547513 0.12142984781946455
Epoch:  39


0.1440512441300057 0.12109001513038363
Epoch:  40


0.14427703900917158 0.12122684610741478
Epoch:  41


0.14376781558668292 0.12153765665633338
Epoch:  42


0.14468731992953532 0.12133294131074633
Epoch:  43


0.14365141254824562 0.1213445354785238
Epoch:  44


0.14515752002999588 0.12144840082951955
Epoch:  45


0.14434190456931656 0.12146573833056859
Epoch    45: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  46


0.14510372240801114 0.12133245915174484
Epoch:  47


0.14392629950433164 0.1215544193983078
Epoch:  48


0.14278756686159083 0.12143669916050774
Epoch:  49


0.14364219195133932 0.12154450693300792
Epoch:  50


0.14455172782008713 0.12133537871497017
Epoch:  51


0.1432425343507045 0.12126031517982483
Epoch    51: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  52


0.14502773413787018 0.12135249376296997
Epoch:  53


0.14454387490813797 0.12134232584919248
Epoch:  54


0.14419411847720276 0.12132457324436732
Epoch:  55


0.14448104456469818 0.12127372728926795
Epoch:  56


0.14354115321829514 0.1214109520827021
Epoch:  57


0.14315194012345495 0.12122018528836113
Epoch    57: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  58


0.1450071894639247 0.1211835869721004
Epoch:  59


0.14399454118432226 0.12135633400508336
Epoch:  60


0.14330428677636223 0.12114333467824119
Epoch:  61


0.14474616985063296 0.12136897444725037
Epoch:  62


0.14480881835963275 0.1214574511562075
Epoch:  63


0.14477152921058037 0.12143178709915706
Epoch:  64
