In [1]:
# Parameters
# until_x: index of the last block in MobileNetV2's `features` that keeps its
# pretrained weights. Task5Model re-initialises every block whose index is
# greater than this value, and the value is also embedded in the checkpoint
# filename written by the training loop.
until_x = 4


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (recording, annotation). ``df.reset_index()`` must yield
        an ``audio_filename`` column; every other column is a label column.
        Each recording is assumed to have at least ``n_annotators`` rows;
        rows beyond that are ignored.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        it makes ambiguous. The unknown columns are removed from the targets.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per
        recording. Defaults to the location used by this notebook.
    n_frames : int, optional
        Number of leading time frames kept from every spectrogram.
    n_annotators : int, optional
        Number of annotation slots stacked along the last axis of ``y`` and
        ``y_mask``.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, frames, bins)``: the transposed spectrograms,
        cropped to ``n_frames`` rows, with a singleton channel axis.
    y : numpy.ndarray
        Shape ``(n_files, n_labels, n_annotators)`` label values.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the annotation marked the matching
        unknown column (> 0.5) and the known label is therefore unreliable,
        1 elsewhere.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the annotations of each recording 1, 2, 3, ... in row order.
    df = df.reset_index()
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    df_unknown = df.loc[:, unknown_cols]
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask, then zero the known labels wherever the
    # corresponding unknown column was ticked for that annotation.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation slot first so one slot can be selected at a time;
    # sorting makes the recording order identical across slots.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotators + 1)
    y = np.stack([df.loc[[k], :].values for k in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in slots], axis=2)

    # Load spectrograms in the same recording order as the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames], axis=0)
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared augmenter from the project-local RandomErasing module, built with its
# default settings. AudioDataset.__getitem__ applies it to augmented samples.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with per-annotation targets and weights.

    Items are ``(sample, target, weight)`` triples taken from the first axis
    of ``X``, ``y`` and ``weights``. When a ``transform`` is supplied, the
    sample is augmented on the fly; otherwise it is returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        """Store the tensors and the optional albumentations-style transform."""
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        # Converter used to hand the tensor to the image-based transform.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of items, i.e. the size of the first axis of ``X``."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for ``idx``, augmenting if enabled."""
        sample = self.X[idx, ...]
        target = self.y[idx, ...]
        weight = self.weights[idx, ...]

        # No augmentation configured: hand back the stored tensors as they are.
        if not self.transform:
            return sample, target, weight

        # Rescale to [0, 1] so the sample can go through the image pipeline;
        # the offset and span are kept to undo the scaling afterwards.
        low = sample.min()
        span = sample.max() - low
        sample = (sample - low) / span

        # Circularly shift along axis 1 by a random offset.
        offset = np.random.randint(sample.shape[1])
        tail = sample[:, offset:, :]
        head = sample[:, :offset, :]
        sample = torch.cat([tail, head], dim=1)

        # Run the transform on a PIL-derived array and take its 'image' output.
        as_image = np.array(self.pil(sample))
        sample = self.transform(image=as_image)['image']
        # Add a leading axis and swap the last two axes
        # (presumably restoring the original axis order — TODO confirm).
        sample = sample[None, :, :].permute(0, 2, 1)

        # Random erasing works on a detached copy of the augmented tensor.
        sample = random_erasing(sample.clone().detach())

        # Map values back from [0, 1] to the sample's original range.
        sample = (sample * span) + low

        return sample, target, weight


def weight_reset(layer):
    """Re-initialise ``layer`` if it exposes ``reset_parameters``; otherwise do nothing.

    Intended for use with ``Module.apply`` so every submodule that knows how
    to reset itself gets fresh parameters.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()


class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel inputs.

    A small 1x1-convolution stem turns the single input channel into three
    channels, MobileNetV2's feature extractor processes the result, a global
    max over the two trailing axes pools it, and a two-layer head produces
    ``num_classes`` logits.
    """

    def __init__(self, num_classes):
        """Build the network.

        Blocks of ``mv2.features`` with an index greater than the notebook
        parameter ``until_x`` are re-initialised; earlier blocks keep their
        pretrained weights.
        """
        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions, preceded by batch norm.
        stem_layers = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*stem_layers)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Discard pretrained weights in every feature block past `until_x`.
        for block_index, block in enumerate(self.mv2.features.children()):
            if block_index <= until_x:
                continue
            block.apply(weight_reset)

        head_layers = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return raw logits of shape ``(batch, num_classes)`` for input ``x``."""
        feature_map = self.mv2.features(self.bw2col(x))
        # Global max pooling: reduce the last axis, then the new last axis.
        pooled = feature_map.max(dim=-1)[0]
        pooled = pooled.max(dim=-1)[0]
        return self.final(pooled)

In [4]:
# Load and prepare data
# NOTE(review): pickle.load executes arbitrary code from the file; only use it
# on a metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
taxonomy_cols = ['fine', 'coarse']

# Rows with fine_id == 'X' are the "unknown" fine labels; all others are known.
unknown_rows = taxonomy_df.loc[taxonomy_df.fine_id == 'X', taxonomy_cols]
known_rows = taxonomy_df.loc[taxonomy_df.fine_id != 'X', taxonomy_cols]

# Pair every unknown label with the known labels that share its coarse class,
# then collect them as {unknown fine label: [known fine labels]}.
unknown_known_pairs = pd.merge(
    unknown_rows, known_rows, on='coarse', how='inner'
).drop(columns='coarse')
unknown_to_known = (
    unknown_known_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict())
known_labels = known_rows.fine.tolist()

# Place the coarse and fine annotation tables side by side for each split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose label values sum to 37 is zeroed out in place
# (presumably a row with every label set — TODO confirm against the data).
for frame in (train_df, valid_df):
    suspicious_rows = frame.sum(axis=1) == 37
    frame.loc[suspicious_rows, :] = 0

In [6]:
# Turn each annotation table into arrays: spectrogram inputs (X), per-annotation
# targets (y) and the masks that silence labels made ambiguous by an "unknown"
# annotation (y_mask). Spectrograms are read from disk inside prepare_data.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics come from the training split only and are computed per position
# of the last axis (128 entries), pooling every other axis together.
# NOTE: this cell overwrites train_X / valid_X, so running it a second time
# normalises already-normalised data; rerun the prepare_data cell first.
train_flat = train_X.reshape(-1, 128)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X, valid_X = [
    (split - channel_means) / channel_stds for split in (train_X, valid_X)]

In [8]:
# Define the data augmentation transformations
# Applied in order by AudioDataset.__getitem__ to training samples only:
# a small random shift/scale/rotation, a grid distortion with its default
# settings, and finally conversion of the augmented image to a tensor.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
BATCH_SIZE = 96

# Only the training set is augmented; validation samples are served as stored.
train_tensors = [torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask)]
train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Two independently shuffled views of the same training set; the training
# loop zips them together to obtain the sample pairs blended by mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is actually available; otherwise fall back to
# the CPU so the notebook still runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 is the number of output logits (num_classes); it has to match the number
# of label columns in train_y — presumably the known labels, TODO confirm.
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch and lowers
# the learning rate after the loss has failed to improve for `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps one loss value per element so the training loop can
# multiply by the label masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
early_stopping_patience = 15  # epochs without a new best validation loss
mixup_alpha = 1               # Beta(alpha, alpha) parameter of the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked loss averaged over all unmasked (sample, label, annotation) entries.

    ``outputs`` holds one row of logits per sample; ``labels`` and ``masks``
    carry an extra trailing axis with one slice per annotation. Each
    annotation's element-wise loss is multiplied by its mask, everything is
    summed, and the total is divided by the sum of the mask.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass --------------------------------------------------
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup: blend each sample of one batch with the sample at the same
        # position of an independently shuffled batch, using one weight per pair
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        # labels and masks have one axis fewer than the inputs
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no gradients, no parameter updates) -------------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Average of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the final epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6555387043952942 0.574974524974823
Epoch:  1


0.43865119576454165 0.28625338673591616
Epoch:  2


0.23351175129413604 0.1823095351457596
Epoch:  3


0.1880871534347534 0.1720949172973633
Epoch:  4


0.18105904698371889 0.16280291974544525
Epoch:  5


0.17972256243228912 0.16294392049312592
Epoch:  6


0.17792804718017577 0.16065895259380342
Epoch:  7


0.17754259943962097 0.16219891905784606
Epoch:  8


0.17451998233795166 0.16264471411705017
Epoch:  9


0.17470228493213655 0.1685888022184372
Epoch:  10


0.1740284150838852 0.16592200994491577
Epoch:  11


0.1729014164209366 0.16018728017807007
Epoch:  12


0.17050909757614136 0.15850078761577607
Epoch:  13


0.1695811605453491 0.15334199070930482
Epoch:  14


0.1696411269903183 0.15463015735149382
Epoch:  15


0.16782329559326173 0.15529852211475373
Epoch:  16


0.16526680946350097 0.1508413940668106
Epoch:  17


0.16450417160987854 0.14782410264015197
Epoch:  18


0.16561374068260193 0.1519037067890167
Epoch:  19


0.16183845698833466 0.14388738572597504
Epoch:  20


0.16118572890758515 0.1438793033361435
Epoch:  21


0.16081228196620942 0.14130840599536895
Epoch:  22


0.15960569977760314 0.14214895963668822
Epoch:  23


0.1594049435853958 0.13744998276233672
Epoch:  24


0.15954490065574645 0.1384872943162918
Epoch:  25


0.15756899416446685 0.13588146567344667
Epoch:  26


0.15856046140193938 0.13863179385662078
Epoch:  27


0.15651097059249877 0.13287854939699173
Epoch:  28


0.15678482234477997 0.13786294162273408
Epoch:  29


0.15440888524055482 0.13323192447423934
Epoch:  30


0.15526740670204162 0.13714341968297958
Epoch:  31


0.15543589651584624 0.13311466723680496
Epoch:  32


0.15537630558013915 0.13154849261045456
Epoch:  33


0.15393697202205658 0.1335686281323433
Epoch:  34


0.15356673538684845 0.13607880622148513
Epoch:  35


0.15268631219863893 0.13450867235660552
Epoch:  36


0.15357225120067597 0.13611415028572083
Epoch:  37


0.15227827787399292 0.13380102664232255
Epoch:  38


0.15391485452651976 0.1323420524597168
Epoch    38: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  39


0.1516735976934433 0.12802373468875886
Epoch:  40


0.15062072932720183 0.1268725112080574
Epoch:  41


0.15084159433841704 0.12689704746007918
Epoch:  42


0.15038825809955597 0.12667934000492095
Epoch:  43


0.15093173027038576 0.1265025854110718
Epoch:  44


0.1501025986671448 0.1268086090683937
Epoch:  45


0.14959422528743743 0.12663460224866868
Epoch:  46


0.1506584346294403 0.12686625868082047
Epoch:  47


0.14966565549373625 0.1265060931444168
Epoch:  48


0.1488501363992691 0.1264256000518799
Epoch:  49


0.14995029509067537 0.12640018463134767
Epoch:  50


0.15007811427116394 0.12640321105718613
Epoch:  51


0.15141891360282897 0.12618354558944703
Epoch:  52


0.14931290507316589 0.1260808914899826
Epoch:  53


0.14891324818134308 0.1263822630047798
Epoch:  54


0.1485522586107254 0.12621017694473266
Epoch:  55


0.14861168384552 0.12584279030561446
Epoch:  56


0.14829650580883025 0.1265521079301834
Epoch:  57


0.14910030007362365 0.12630395889282225
Epoch:  58


0.14850226283073426 0.12608342617750168
Epoch:  59


0.14986807525157927 0.12571426182985307
Epoch:  60


0.14888151109218598 0.1257103204727173
Epoch:  61


0.1486707931756973 0.12546509951353074
Epoch:  62


0.1475917172431946 0.12513071149587632
Epoch:  63


0.14985003530979157 0.1250137507915497
Epoch:  64


0.14729951024055482 0.125419782102108
Epoch:  65


0.14757428228855132 0.12524656355381011
Epoch:  66


0.14898690819740296 0.12564645409584047
Epoch:  67


0.14699553430080414 0.12505332231521607
Epoch:  68


0.14771196961402894 0.1250733718276024
Epoch:  69


0.14839308381080626 0.12605648636817932
Epoch    69: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  70


0.14770522475242615 0.12556925415992737
Epoch:  71


0.14817225337028503 0.12541245222091674
Epoch:  72


0.14733322858810424 0.1252913936972618
Epoch:  73


0.14774453043937683 0.12533404678106308
Epoch:  74


0.14720529854297637 0.12554006427526473
Epoch:  75


0.14753202080726624 0.12551654130220413
Epoch    75: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  76


0.14772405624389648 0.1255135104060173
Epoch:  77


0.14722005486488343 0.1253850817680359
Epoch:  78
