In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, per-annotator labels and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column; rows of the same file are numbered
        1, 2, 3, ... in their order of appearance and that number is used as
        the annotator slot.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be ignored (mask = 0) for an annotation whose unknown
        column is > 0.5. The unknown columns themselves are removed from
        the labels.
    logmelspec_dir : str, optional
        Directory containing one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows of the transposed array kept for every file.
    n_annotators : int, optional
        Number of annotator slots (1..n_annotators) stacked on the last axis.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_cols)`` where ``n_cols`` is the first
        dimension of the stored arrays.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label counts towards the loss, 0 otherwise.

    Notes
    -----
    Files are ordered by sorted ``audio_filename``. Every file needs the same
    number of rows in each annotator slot, and every stored array needs at
    least ``n_frames`` columns; otherwise stacking raises ``ValueError``.
    Annotations beyond ``n_annotators`` are ignored.
    """
    df = df.reset_index()
    # Number the annotations of each file 1, 2, 3, ... in order of appearance.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start with every label contributing, then switch off the known labels
    # wherever the annotator ticked the corresponding "unknown" column.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotator slot first so that .loc[[k]] selects one slot for all
    # files, sorted by filename.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotators + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    # Add the singleton channel axis expected by the convolutional front end.
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance built with its default arguments;
# AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, labels, weights)`` triples indexed along axis 0.

    When ``transform`` is truthy each sample is augmented on the fly (random
    cyclic shift, the given albumentations-style transform, random erasing);
    otherwise the stored sample is returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        # Converter used to hand samples to the image-based transform.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def _augment(self, spec):
        """Return an augmented copy of ``spec`` on its original value scale."""
        # Rescale to [0, 1] for the image pipeline; remember how to undo it.
        lowest = spec.min()
        highest = spec.max()
        unit = (spec - lowest) / (highest - lowest)

        # Rotate the sample along axis 1 by a random offset (wrap-around).
        offset = np.random.randint(unit.shape[1])
        head = unit[:, offset:, :]
        tail = unit[:, :offset, :]
        rolled = torch.cat([head, tail], dim=1)

        # Run the albumentations-style transform on an array image.
        as_image = np.array(self.pil(rolled))
        transformed = self.transform(image=as_image)['image']
        # Add a leading axis and swap the last two axes of the transform output.
        transformed = transformed[None, :, :].permute(0, 2, 1)

        # Random erasing works on a detached copy.
        erased = random_erasing(transformed.clone().detach())

        # Map the values back from [0, 1] to the original range.
        return (erased * (highest - lowest)) + lowest

    def __getitem__(self, idx):
        sample = self.X[idx, ...]
        if self.transform:
            sample = self._augment(sample)
        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only open metadata files you trust.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
is_unknown_label = taxonomy_df.fine_id == 'X'
is_known_label = taxonomy_df.fine_id != 'X'

# For every "unknown" fine label (fine_id == 'X'), collect the known fine
# labels that share its coarse class.
unknown_fine = taxonomy_df.loc[is_unknown_label, ['fine', 'coarse']]
known_fine = taxonomy_df.loc[is_known_label, ['fine', 'coarse']]
fine_pairs = pd.merge(unknown_fine, known_fine, on='coarse', how='inner').drop(columns='coarse')
unknown_to_known = fine_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()

known_labels = taxonomy_df.loc[is_known_label].fine.tolist()

# Coarse and fine annotation tables side by side, one frame per split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Any row whose label values sum to 37 is zeroed out in both splits.
# NOTE(review): 37 presumably equals the total number of label columns, i.e.
# a row with every label set - confirm against the metadata.
for split_df in (train_df, valid_df):
    row_sums_to_37 = split_df.sum(axis=1) == 37
    split_df.loc[row_sums_to_37, :] = 0

In [5]:
# Build the inputs (X), per-annotator labels (y) and loss masks (y_mask) for
# the training and validation splits with the same unknown->known mapping.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# One mean/std per position on the last axis, estimated on the training split
# only and reused unchanged for the validation split.
# NOTE: train_X / valid_X are overwritten, so running this cell twice
# normalises the data twice.
n_channels = train_X.shape[-1]  # derived from the data instead of a literal 128
train_flat = train_X.reshape(-1, n_channels)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Applied by AudioDataset to training samples only, in the order listed.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
BATCH_SIZE = 64

# Only the training dataset gets the augmentation pipeline.
train_tensors = [torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask)]
train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)

valid_tensors = [torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask)]
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that are mixed up.
train_loader_1 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is actually available and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` through its own ``reset_parameters`` method.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function safe to pass to ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Placeholder default for `until_x`; the "Parameters" cell below assigns the
# value that is actually used when the model is built.
# NOTE(review): looks like a papermill-style default-parameters cell - confirm.
until_x = None

In [12]:
# Parameters
# until_x: Task5Model re-initialises the mobilenet_v2 feature blocks whose
# index is <= until_x; the value is also embedded in the checkpoint filename.
until_x = 3


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2-based multi-label classifier for single-channel inputs.

    Pipeline: a 1x1-convolution stem maps the single input channel to three
    channels, the ImageNet-pretrained ``mobilenet_v2`` feature extractor is
    applied, the feature map is max-pooled over its last two axes, and a small
    fully connected head produces ``num_classes`` logits.

    The module-level ``until_x`` controls how much of the pretrained extractor
    is discarded: feature blocks with index ``<= until_x`` are re-initialised.
    ``until_x = None`` keeps every pretrained block.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    """

    def __init__(self, num_classes):

        super().__init__()

        # 1 -> 10 -> 3 channels with 1x1 convolutions, so the 3-channel
        # backbone can consume single-channel input.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2.
        # Guard against until_x being None: comparing an int with None raises
        # TypeError on Python 3, so None is treated as "reset nothing".
        if until_x is not None:
            for i, x in enumerate(self.mv2.features.children()):
                if i <= until_x:
                    x.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch ``x`` (one row of logits per sample)."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis, then the new last axis.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits: this must equal the size of axis 1 of train_y / valid_y,
# because the training loop compares the outputs element-wise with
# labels[:, :, k].
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Lowers the learning rate once the value passed to scheduler.step() (the
# epoch validation loss) has not improved for more than `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the label masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def masked_bce_loss(outputs, labels, masks, loss_fn):
    """Masked loss averaged over the total mask weight.

    Parameters
    ----------
    outputs : torch.Tensor
        Model logits; compared element-wise with ``labels[:, :, k]``.
    labels, masks : torch.Tensor
        Targets and loss weights whose last axis enumerates the annotation sets.
    loss_fn : callable
        Element-wise loss (``reduction='none'``).

    Returns
    -------
    torch.Tensor
        Scalar: sum over annotation sets of the masked element-wise loss,
        divided by ``masks.sum()``.
    """
    total = sum(
        (loss_fn(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # stop after this many epochs without a new best
alpha = 1  # mixup coefficients are drawn from Beta(alpha, alpha)
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One coefficient per sample blends batch i1 with the independently
        # shuffled batch i2; labels and masks use the same coefficients.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks, criterion)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks, criterion)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Keep only the weights with the best validation loss seen so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

    # Early stopping is checked last so the final epoch's losses are logged too.
    if epochs_without_new_lowest >= early_stopping_patience:
        break

Epoch:  0


0.6315471207773363 0.47936310938426424
Epoch:  1


0.32250238270372955 0.18513525596686772
Epoch:  2


0.18472299704680573 0.23484868449824198
Epoch:  3


0.1720720924235679 0.160383083990642
Epoch:  4


0.16664397112421087 0.14674259722232819
Epoch:  5


0.1635098533855902 0.14668135983603342
Epoch:  6


0.1620991826057434 0.1393607876130513
Epoch:  7


0.16115018242114298 0.1383382167134966
Epoch:  8


0.16097409580204938 0.13525553792715073
Epoch:  9


0.15863936575683388 0.14006185212305614
Epoch:  10


0.15820323212726697 0.14197775721549988
Epoch:  11


0.15772654720254847 0.13286649967942918
Epoch:  12


0.15576544041569168 0.12970537372997828
Epoch:  13


0.15517314221407916 0.13132107257843018
Epoch:  14


0.15598988291379567 0.13472597301006317
Epoch:  15


0.15417091830356702 0.13943455048969813
Epoch:  16


0.15461450452740128 0.13110159231083734
Epoch:  17


0.1556617903548318 0.1330501139163971
Epoch:  18


0.15576031965178413 0.13221109126295363
Epoch    18: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  19


0.15353724682653272 0.1265937015414238
Epoch:  20


0.15218301238240423 0.12557559247527803
Epoch:  21


0.15232731321373502 0.12530417740345
Epoch:  22


0.15168064309133067 0.1253590349640165
Epoch:  23


0.15127711078605136 0.12477627928767886
Epoch:  24


0.1500018831040408 0.12473777255841664
Epoch:  25


0.1502995752804988 0.12449148297309875
Epoch:  26


0.1510246464529553 0.1242020960365023
Epoch:  27


0.1504897294012276 0.124173461326531
Epoch:  28


0.15109093849723404 0.12303932117564338
Epoch:  29


0.15060586260782705 0.12430682139737266
Epoch:  30


0.1497261737649505 0.12411981927497047
Epoch:  31


0.14978280744037112 0.12359980600220817
Epoch:  32


0.14885212158834613 0.12349898368120193
Epoch:  33


0.14988906681537628 0.12364469681467329
Epoch:  34


0.14835441877713074 0.12358120935303825
Epoch    34: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  35


0.14903361853715535 0.12342904401676995
Epoch:  36


0.1487799262678301 0.12325373398406166
Epoch:  37


0.14859994521012176 0.12309654589210238
Epoch:  38


0.1488758249057306 0.12299436650105885
Epoch:  39


0.14974630120638255 0.12292736130101341
Epoch:  40


0.14815777419386683 0.12299920937844686
Epoch:  41


0.14885397899795222 0.1234360858798027
Epoch:  42


0.14986797321487116 0.12332097015210561
Epoch:  43


0.14817462136616577 0.12293838070971626
Epoch:  44


0.14839867119853561 0.12309275886842183
Epoch:  45


0.14741266861155228 0.12298991637570518
Epoch    45: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  46


0.14954262005316243 0.1225339344569615
Epoch:  47


0.14872891194111593 0.12254819593259267
Epoch:  48


0.14900213036988233 0.12283940400396075
Epoch:  49


0.14757781657012733 0.12278082753930773
Epoch:  50


0.14742519807171178 0.12290876039436885
Epoch:  51


0.14895913246515635 0.1229035375373704
Epoch:  52


0.14841098720962936 0.12248657324484416
Epoch:  53


0.14841029652067134 0.12285506725311279
Epoch:  54


0.14924955649955854 0.12318142290626254
Epoch:  55


0.14818444324506297 0.122528218797275
Epoch:  56


0.14815984948261365 0.12267245671578816
Epoch:  57


0.1476053974112949 0.12291533925703593
Epoch:  58


0.14842242566314903 0.12293169221707753
Epoch    58: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  59


0.14825232447804632 0.12272596252816063
Epoch:  60


0.14952533148430489 0.12279379474265235
Epoch:  61


0.14786820073385495 0.12276344107730049
Epoch:  62


0.14753446909221443 0.12267696005957467
Epoch:  63


0.1496200662206959 0.12332265717642647
Epoch:  64


0.14924373296467033 0.12266389386994499
Epoch    64: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  65


0.14847398609728427 0.12274010585887092
Epoch:  66


0.14886330833306183 0.12262660903590065
Epoch:  67


0.14836062249299642 0.12319169619253703
Epoch:  68


0.14902763874144168 0.1224847776549203
Epoch:  69


0.14861869570371267 0.12293295455830437
Epoch:  70


0.14683422808711594 0.12299180775880814
Epoch:  71


0.14847651808648496 0.12221301985638482
Epoch:  72


0.14955703028150508 0.12270707424197878
Epoch:  73


0.14807602682629148 0.12296961780105319
Epoch:  74


0.14901214878301364 0.12304582340376717
Epoch:  75


0.14754181737835342 0.1227899215051106
Epoch:  76


0.14901631989994565 0.12272859471184867
Epoch:  77


0.14913531453222842 0.12271769238369805
Epoch:  78


0.14727150306508346 0.12307485299451011
Epoch:  79


0.14831630885601044 0.12249745322125298
Epoch:  80


0.1486696258590028 0.1228047449673925
Epoch:  81


0.1485859340912587 0.12253353531871523
Epoch:  82


0.14802437416605047 0.12292517508779253
Epoch:  83


0.1481337112349433 0.12263675459793635
Epoch:  84


0.14926390430411776 0.12261320437703814
Epoch:  85


0.14976315240602237 0.12293571872370583
Epoch:  86


0.148498399837597 0.12278648785182408
Epoch:  87


0.14839715369649836 0.12316246969359261
Epoch:  88


0.14842380905473554 0.12284914617027555
Epoch:  89


0.14840971537538478 0.12268019999776568
Epoch:  90


0.14809037060350985 0.12274941482714244
Epoch:  91


0.1483412191674516 0.12287082416670662
Epoch:  92


0.14837969517385638 0.12295419403484889
Epoch:  93


0.14875827931069038 0.12351671606302261
Epoch:  94


0.14979190480064702 0.12317091013704028
Epoch:  95


0.1489857564101348 0.12256659780229841
Epoch:  96
