In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec', n_frames=635):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table whose index is named ``audio_filename``. It holds one
        row per annotation and one column per label. The rows of a file are
        numbered 1, 2, 3, ... in their order of appearance and only the rows
        numbered 1, 2 and 3 are used.
    unknown_to_known : dict
        Maps every "unknown" label column to the list of known label columns
        that must be masked out when that unknown label is set (> 0.5).
        The unknown columns themselves are removed from the targets.
    logmelspec_dir : str, optional
        Directory containing one ``<audio_filename>.npy`` array per file.
        Defaults to the location the notebook has always used.
    n_frames : int, optional
        Maximum number of rows kept from each transposed array.

    Returns
    -------
    X : numpy.ndarray
        Stacked arrays with a singleton axis inserted at position 1, i.e.
        ``(n_files, 1, frames, bins)``.
    y : numpy.ndarray
        Targets of shape ``(n_files, n_labels, 3)``; the last axis indexes the
        three annotations of a file.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the entry must be ignored, 1 elsewhere.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the annotations of each file and index the table by (file, number).
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" columns off; they only drive the mask.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that an annotation
    # flagged as uncertain through the corresponding unknown label.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Re-index by (number, file) so that all first/second/third annotations can
    # be selected as aligned, identically ordered slabs.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = (1, 2, 3)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in annotation_ids], axis=2)

    # Load the arrays in the same file order as the rows of y.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (constructed with its default arguments); it is
# applied by AudioDataset.__getitem__ to samples that go through augmentation.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with per-sample targets and loss weights.

    When ``transform`` is given, every fetched sample is augmented on the fly:
    random cyclic shift along axis 1, the image ``transform``, then the
    module-level ``random_erasing``. Without a transform the stored sample is
    returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        """Store the data.

        X, y and weights are indexed by sample along their first axis.
        ``transform`` is called as ``transform(image=array)`` and must return a
        mapping with an ``'image'`` entry.
        """
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for ``idx``, augmenting if enabled."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation so the image-based augmentations see
            # values in [0, 1]
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would give 0 / 0 = NaN; with a unit range
                # it maps to all zeros and is restored to this_min below.
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the last two axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only open metadata files that
# come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

taxonomy_df = metadata['taxonomy_df']
is_unknown_fine = taxonomy_df.fine_id == 'X'
is_known_fine = taxonomy_df.fine_id != 'X'

# Pair every fine label with fine_id 'X' with the other fine labels that share
# its coarse class, then collect those partners per 'X' label.
unknown_fine = taxonomy_df.loc[is_unknown_fine, ['fine', 'coarse']]
known_fine = taxonomy_df.loc[is_known_fine, ['fine', 'coarse']]
fine_pairs = pd.merge(unknown_fine, known_fine, on='coarse', how='inner')
fine_pairs = fine_pairs.drop(columns='coarse')
unknown_to_known = fine_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()

known_labels = taxonomy_df.loc[is_known_fine].fine.tolist()

# One table per split: coarse and fine annotations side by side.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Rows whose label values sum to 37 are zeroed out in both splits
# (presumably 37 means every label column is set -- TODO confirm).
for annotations in (train_df, valid_df):
    saturated_rows = annotations.sum(axis=1) == 37
    annotations.loc[saturated_rows, :] = 0

In [5]:
# Convert each annotation table into inputs (X), per-annotation targets (y)
# and loss masks (y_mask).
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Per-channel statistics (last axis, 128 channels) are estimated on the
# training split only and reused unchanged for the validation split.
train_channels = train_X.reshape(-1, 128)
channel_means = train_channels.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_channels.std(axis=0).reshape(1, 1, 1, -1)

train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Applied in order: a small random shift/scale/rotation, a grid distortion, and
# finally ToTensor, whose 'image' output AudioDataset treats as a torch tensor.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
batch_size = 64

# Only the training set is augmented; validation samples are served as stored.
train_dataset = AudioDataset(
    torch.Tensor(train_X),
    torch.Tensor(train_y),
    torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    torch.Tensor(valid_X),
    torch.Tensor(valid_y),
    torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# Two independently shuffled views of the same training set; the training loop
# draws one batch from each and mixes them.
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [9]:
# Define the device to be used
# `cuda` expresses the preference; fall back to the CPU when no CUDA device is
# available so the notebook still runs on machines without a GPU.
cuda = True
device = torch.device('cuda:0' if cuda and torch.cuda.is_available() else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it exposes ``reset_parameters``.

    Objects without that attribute are left untouched, which makes the function
    suitable for ``Module.apply`` over a tree of heterogeneous layers.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default value of the `until_x` notebook parameter; the "Parameters" cell that
# follows overrides it.
until_x = None

In [12]:
# Parameters
# (presumably injected by a notebook-parameterisation tool -- TODO confirm)
# until_x: Task5Model re-initialises the children of `mv2.features` with
# index 0..until_x (inclusive).
until_x = 16


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor adapted to single-channel inputs.

    Pipeline: ``bw2col`` (1 -> 10 -> 3 channels through 1x1 convolutions),
    the ``features`` part of a pretrained torchvision MobileNetV2, a global max
    over the two trailing axes, and a small fully connected head that outputs
    ``num_classes`` logits.
    """

    def __init__(self, num_classes, reset_until=None):
        """Build the network.

        Parameters
        ----------
        num_classes : int
            Number of output logits.
        reset_until : int or None, optional
            Children of ``mv2.features`` with index ``<= reset_until`` get
            their weights re-initialised (pretrained weights discarded).
            When omitted, the notebook-level ``until_x`` is used, as before.
            If that is ``None`` as well, no layer is reset.
        """
        super().__init__()

        # Learned mapping from one input channel to the three channels that
        # MobileNetV2 expects.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        if reset_until is None:
            # Backward compatible: fall back to the notebook parameter.
            reset_until = until_x

        # Reset until ith layer of mv2. A None threshold used to raise
        # TypeError on `i <= None`; it now means "keep all pretrained weights".
        if reset_until is not None:
            for idx, block in enumerate(self.mv2.features.children()):
                if idx <= reset_until:
                    block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch ``x`` of single-channel inputs."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis, then the new last axis.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits -- presumably one per label column of the targets
# (TODO confirm against train_y.shape[1]).
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad variant over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped with the validation loss once per epoch in the training loop; lowers
# the learning rate after `patience` epochs without improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can apply
# the masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def masked_annotation_loss(loss_fn, outputs, labels, masks):
    """Masked loss averaged over all unmasked entries.

    ``labels`` and ``masks`` carry one slice per annotation along their last
    axis. The same ``outputs`` are scored against every slice with the
    element-wise ``loss_fn``, each result is weighted by its mask, and the
    total is divided by ``masks.sum()``.
    """
    total = sum(
        (loss_fn(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
alpha = 1                      # Beta(alpha, alpha) parameter of the mixup weights
early_stopping_patience = 25   # epochs without a new best validation loss
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup the inputs: one mixing weight per sample, shared by the input,
        # the labels and the masks of that sample
        mixup_vals = np.random.beta(alpha, alpha, batch_a[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam * batch_a[0]) + ((1 - lam) * batch_b[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1))
        labels = (lam * batch_a[1]) + ((1 - lam) * batch_b[1])
        masks = (lam * batch_a[2]) + ((1 - lam) * batch_b[2])
        # mixup ends

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_annotation_loss(criterion, outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the final epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if this_epoch_valid_loss < lowest_val_loss:
        # New best model: checkpoint it and restart the patience counter.
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6317907539573876 0.43245313848767963
Epoch:  1


0.32483572452454956 0.2200691487107958
Epoch:  2


0.18970529654541532 0.17903885671070643
Epoch:  3


0.18145466777118477 0.17341041990688869
Epoch:  4


0.17910512999908343 0.16191782695906504
Epoch:  5


0.17668345691384496 0.17888758437974112
Epoch:  6


0.17581326454072385 0.17194758781364985
Epoch:  7


0.1751389616244548 0.15992915204593114
Epoch:  8


0.1717371614398183 0.15988172803606307
Epoch:  9


0.16985347303184303 0.16043801605701447
Epoch:  10


0.16959331567223007 0.15358695600714004
Epoch:  11


0.1678325956737673 0.16824485148702348
Epoch:  12


0.17013729464363408 0.15490674121039255
Epoch:  13


0.16887155457122907 0.1504935622215271
Epoch:  14


0.16879030256657987 0.1496695578098297
Epoch:  15


0.16696821354530952 0.16919606498309545
Epoch:  16


0.16751086389696276 0.15249655502183096
Epoch:  17


0.16498091333621256 0.14734183996915817
Epoch:  18


0.1652728899105175 0.14443124617849076
Epoch:  19


0.16454744661176526 0.14444851662431443
Epoch:  20


0.16460999604817983 0.14356731516974314
Epoch:  21


0.16334780245213895 0.14064712609563554
Epoch:  22


0.16181489422514633 0.14055883777993067
Epoch:  23


0.16322149537705086 0.14151258873088018
Epoch:  24


0.16223307437187917 0.14234386384487152
Epoch:  25


0.16246091795934214 0.13863323096718108
Epoch:  26


0.1613280414729505 0.1382728378687586
Epoch:  27


0.1606332619447966 0.14182430718626296
Epoch:  28


0.16044271153372688 0.13751341296093805
Epoch:  29


0.15974709029133255 0.14157306509358542
Epoch:  30


0.16024994205784154 0.14273539504834584
Epoch:  31


0.15934194261963303 0.1382740299616541
Epoch:  32


0.15776750445365906 0.1413008804832186
Epoch:  33


0.15833686936546015 0.13820983788796834
Epoch:  34


0.15989011687201424 0.1374376586505345
Epoch:  35


0.15849031749609355 0.13493711607796804
Epoch:  36


0.15744149684906006 0.1385354921221733
Epoch:  37


0.15794037644927567 0.14035142638853618
Epoch:  38


0.1570748135044768 0.1349337590592248
Epoch:  39


0.15778104034630028 0.13641363488776342
Epoch:  40


0.15682496372106913 0.1376873797604016
Epoch:  41


0.1554650100501808 0.13252180069684982
Epoch:  42


0.15656991182146845 0.13200057404381887
Epoch:  43


0.1568715946899878 0.13470693784100668
Epoch:  44


0.15629931639980627 0.13302846997976303
Epoch:  45


0.15540938039083738 0.13318182528018951
Epoch:  46


0.1543276463005994 0.13778812544686453
Epoch:  47


0.15530013152070948 0.13275898673704692
Epoch:  48


0.1547817676453977 0.1322890062417303
Epoch    48: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  49


0.15286417184649287 0.1295291047011103
Epoch:  50


0.15333659705278035 0.1290488179240908
Epoch:  51


0.15189683115160144 0.12858748648847854
Epoch:  52


0.15168238290258357 0.12903808589492524
Epoch:  53


0.15251809318323392 0.1318264060786792
Epoch:  54


0.15164095847993284 0.12871078827551433
Epoch:  55


0.15173084268698822 0.1295951743211065
Epoch:  56


0.15253060210395503 0.1352967470884323
Epoch:  57


0.15172347627781532 0.1321986817887851
Epoch    57: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  58


0.15175586616670764 0.12883547906364715
Epoch:  59


0.15206492067994298 0.12898549224649156
Epoch:  60


0.15015024309222763 0.12930317223072052
Epoch:  61


0.15178134312500824 0.12841181882790156
Epoch:  62


0.15187998720117518 0.1289405216063772
Epoch:  63


0.15167388843523488 0.12802030678306306
Epoch:  64


0.15134224134522514 0.12970496820552008
Epoch:  65


0.15132489518539324 0.12794154456683568
Epoch:  66


0.15274942082327767 0.13114909189088003
Epoch:  67


0.15142340837298213 0.13353575659649713
Epoch:  68


0.15152327071976018 0.13794597025428498
Epoch:  69


0.15210378210286837 0.12874504604509898
Epoch:  70


0.15314005798584707 0.1289458210979189
Epoch:  71


0.15135864591276324 0.1287166965859277
Epoch    71: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  72


0.1521179241103095 0.1337487899831363
Epoch:  73


0.15011544606170138 0.12921621650457382
Epoch:  74


0.15133659098599408 0.13039002141782216
Epoch:  75


0.15226040015349518 0.12802879512310028
Epoch:  76


0.15196954237448201 0.12810713265623366
Epoch:  77


0.15289062060214378 0.1282537505030632
Epoch    77: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  78


0.15159664202380824 0.1286518605692046
Epoch:  79


0.15058517214414235 0.1382140155349459
Epoch:  80


0.15048676084827733 0.12829102895089559
Epoch:  81


0.1513503242183376 0.1309524444597108
Epoch:  82


0.15163558640995542 0.1293841717498643
Epoch:  83


0.1520825515727739 0.12812889793089458
Epoch    83: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  84


0.15140868320658402 0.12810939124652318
Epoch:  85


0.15232059037363208 0.12855038259710586
Epoch:  86


0.15227922676382838 0.1281075371163232
Epoch:  87


0.15183952370205442 0.12932647977556502
Epoch:  88


0.15084432951501897 0.1284666284918785
Epoch:  89


0.15085261697704727 0.1289283846105848
Epoch:  90
