In [1]:
# Parameters
# until_x: index threshold into the children of the MobileNetV2 `features`
# stack. Task5Model re-initialises every child whose index is greater than
# this value; the value is also embedded in the saved checkpoint filename.
until_x = 5


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column; every other column is treated as a label.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that are masked out for an annotation whose unknown label is > 0.5.
        The unknown columns themselves are dropped from the targets.
    data_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotations : int, optional
        How many annotations (numbered 1..n per recording, in row order) are
        stacked along the last axis of ``y`` and ``y_mask``.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_recordings, 1, frames, bins)``.
    y : numpy.ndarray
        Shape ``(n_recordings, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where a label contributes to the loss and 0
        where it was masked because a related unknown label was set.

    Recordings appear in sorted ``audio_filename`` order in all three arrays.
    """
    # Number the annotations of each recording 1, 2, 3, ... in row order.
    df = df.reset_index()
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" columns off; only the known labels become targets.
    unknown_cols = list(unknown_to_known)
    df_unknown = df.loc[:, unknown_cols]
    df = df.drop(columns=unknown_cols)

    # Start with every label enabled, then disable the known labels that are
    # covered by an unknown label the annotator switched on.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that one annotation "slice" across
    # all recordings can be selected at once; sorting keeps the recordings in
    # the same order within every slice.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    slnos = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in slnos], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in slnos], axis=2)

    # Load the stored array of each recording, transpose it and keep the
    # first n_frames rows; the recordings follow the order of slice 1.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename')
    X = np.stack([
        np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = X[:, np.newaxis]  # add the singleton channel axis

    return X, y, y_mask


# Shared random-erasing augmentation with the constructor's default settings;
# AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset over pre-computed spectrogram tensors with optional augmentation.

    Each item is the tuple ``(sample, y[idx], weights[idx])``. When
    ``transform`` is falsy the stored sample is returned untouched. Otherwise
    the sample is min-max scaled to [0, 1], cyclically shifted by a random
    offset along axis 1, converted to an image array, passed through
    ``transform`` and the erasing callable, and finally mapped back to its
    original value range.

    Parameters
    ----------
    X, y, weights :
        Containers indexed along their first axis. ``X`` must hold torch
        tensors when ``transform`` is set, since tensor operations are
        applied to each sample.
    transform : callable, optional
        Called as ``transform(image=ndarray)``; must return a mapping whose
        ``'image'`` entry is a tensor.
    erasing : callable, optional
        Applied to the augmented tensor. Defaults to the module-level
        ``random_erasing`` object when left as ``None``.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for ``idx``, augmented if enabled."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            span = this_max - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole sample into NaNs, so fall back to a unit scale instead.
            if span == 0:
                span = torch.ones_like(span)
            sample = (sample - this_min) / span

            # randomly cycle the file: rotate along axis 1 so that position
            # `i` becomes the new start
            i = np.random.randint(sample.shape[1])
            sample = torch.roll(sample, shifts=-i, dims=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # assumes the transform returns a 2-D tensor with the two image
            # axes swapped; add the channel axis and swap them back
            # -- TODO confirm against the transform in use
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            erase = self.erasing if self.erasing is not None else random_erasing
            sample = erase(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * span) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open metadata files that come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as handle:
    metadata = pickle.load(handle)

taxonomy = metadata['taxonomy_df']
is_unknown = taxonomy.fine_id == 'X'
fine_and_coarse = ['fine', 'coarse']

# For every unknown ('X') fine label, collect the known fine labels that share
# its coarse class. The merge suffixes the two 'fine' columns as fine_x
# (unknown side) and fine_y (known side).
unknown_to_known = (
    taxonomy.loc[is_unknown, fine_and_coarse]
    .merge(taxonomy.loc[~is_unknown, fine_and_coarse], on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda names: list(names))
    .to_dict())
known_labels = taxonomy.loc[~is_unknown, 'fine'].tolist()

# Place the coarse and fine annotation columns side by side for each split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose label values add up to 37 is zeroed out in both splits.
for split_df in (train_df, valid_df):
    rows_to_clear = split_df.sum(axis=1) == 37
    split_df.loc[rows_to_clear, :] = 0

In [6]:
# Build inputs, targets and loss masks for both splits. As produced by
# prepare_data, each X is 4-D with a singleton axis 1, and y / y_mask carry
# one slice per annotation number (slno 1..3) along their last axis.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# One mean/std per position along the last axis of train_X, estimated on the
# training split only and then applied unchanged to the validation split.
# The channel count is read from the data instead of being hard-coded (it was
# 128), so a different number of bins cannot silently mix channels together.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in order to each training sample by AudioDataset.__getitem__, which
# calls the pipeline as transform(image=...) and reads the 'image' entry.
# NOTE(review): rotate_limit=0.5 presumably means rotations of at most half a
# degree -- confirm against the albumentations documentation.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    # Last step: the dataset calls tensor methods on the pipeline output.
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
train_tensors = tuple(torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask))
valid_tensors = tuple(torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask))

# Only the training set is augmented; validation samples are served as stored.
train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches it mixes together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` if it provides a ``reset_parameters`` method.

    Objects without that attribute are left untouched, which makes the
    function safe to pass to ``Module.apply`` over a mixed module tree.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Single-channel front end, MobileNetV2 feature stack, small classifier.

    ``bw2col`` maps the one-channel input to three channels with 1x1
    convolutions, ``mv2.features`` extracts feature maps, a global max over
    the two trailing axes pools them, and ``final`` produces ``num_classes``
    outputs. Children of ``mv2.features`` whose index exceeds the module-level
    ``until_x`` are re-initialised after the pretrained weights are loaded.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1x1 convolutions: 1 channel -> 10 -> 3 channels.
        to_three_channels = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*to_three_channels)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        classifier_layers = [
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*classifier_layers)

        self._reset_features_after(until_x)

    def _reset_features_after(self, last_kept):
        """Re-initialise every child of ``mv2.features`` with index > ``last_kept``."""
        for position, block in enumerate(self.mv2.features.children()):
            if position > last_kept:
                block.apply(weight_reset)

    def forward(self, x):
        """Return the outputs of ``final`` for the batch ``x``."""
        three_channel = self.bw2col(x)
        feature_maps = self.mv2.features(three_channel)
        # Global max pooling: maximum over the last axis, then over the axis
        # that becomes last.
        pooled = feature_maps.max(dim=-1)[0].max(dim=-1)[0]
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 is the number of output units of the final linear layer.
# NOTE(review): presumably this equals the number of label columns left after
# prepare_data drops the unknown ones -- confirm against the metadata.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over every model parameter.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The training loop steps the scheduler with the epoch's validation loss; the
# recorded log shows the learning rate shrinking tenfold at each reduction.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the label masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
alpha = 1  # both shape parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked loss averaged over all unmasked (label, annotation) entries.

    ``outputs`` holds the model outputs for a batch; ``labels`` and ``masks``
    carry one slice per annotation along their last axis. The module-level
    ``criterion`` is evaluated against every annotation slice, each
    element-wise loss is multiplied by its mask, and the total is divided by
    the sum of the masks.
    """
    total = 0
    for k in range(labels.shape[-1]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently, so each step
    # pairs two random batches of equal size for mixup.
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One mixing coefficient per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation (no mixup, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            this_epoch_valid_loss += masked_bce_loss(outputs, labels, masks).item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss seen so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6062513638187099 0.4900413623877934
Epoch:  1


0.27525847989159663 0.19079588566507613
Epoch:  2


0.1819811411000587 0.17061117078576768
Epoch:  3


0.1737574093245171 0.16205972007342748
Epoch:  4


0.16986543060959997 0.15406383999756404
Epoch:  5


0.16810749511461 0.14684718421527318
Epoch:  6


0.1660460036348652 0.14712413400411606
Epoch:  7


0.1641076078286042 0.1413962117263249
Epoch:  8


0.16269929183496012 0.14361762468303954
Epoch:  9


0.1618923987891223 0.14135199998106276
Epoch:  10


0.16237280054672346 0.1371690258383751
Epoch:  11


0.15921994641020493 0.13598100521734782
Epoch:  12


0.15925397381589218 0.13703881204128265
Epoch:  13


0.1588078862106478 0.13723260377134597
Epoch:  14


0.15761568014686173 0.13337821513414383
Epoch:  15


0.1568061693294628 0.13423427194356918
Epoch:  16


0.15768881987881017 0.1344454139471054
Epoch:  17


0.15600851057349024 0.13250591393027986
Epoch:  18


0.1561938137621493 0.1314196895275797
Epoch:  19


0.15620655022762917 0.13278581414903914
Epoch:  20


0.15539824076600978 0.13279352762869426
Epoch:  21


0.15542801449427734 0.13351206481456757
Epoch:  22


0.15591775968268112 0.13748690911701747
Epoch:  23


0.15338745109132818 0.13116043486765452
Epoch:  24


0.1536018127525175 0.12860959874732153
Epoch:  25


0.1525868660694844 0.12931784135954721
Epoch:  26


0.15285685416814443 0.12991050417934144
Epoch:  27


0.1538604139476209 0.14335511199065618
Epoch:  28


0.15198480639908765 0.13299522229603358
Epoch:  29


0.1528544647468103 0.12957462987729482
Epoch:  30


0.1538584240385004 0.13990051937954767
Epoch    30: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  31


0.15153264757749196 0.12603249720164708
Epoch:  32


0.15084569397810343 0.12488306526626859
Epoch:  33


0.15076543592117927 0.12480914699179786
Epoch:  34


0.14877333310810295 0.12485882639884949
Epoch:  35


0.14856537129427935 0.12422019456114088
Epoch:  36


0.14925291610730662 0.12506375355379923
Epoch:  37


0.15048409031855092 0.12404601275920868
Epoch:  38


0.149597350004557 0.12508848948138102
Epoch:  39


0.14948377456213977 0.12461599814040321
Epoch:  40


0.1478988028861381 0.12432268049035754
Epoch:  41


0.14896510017884745 0.12385711286749158
Epoch:  42


0.14826441817992442 0.12441504214491163
Epoch:  43


0.14784404192421888 0.12410652424607958
Epoch:  44


0.14710560320196925 0.12427932556186404
Epoch:  45


0.14893191448740056 0.12425485146897179
Epoch:  46


0.14903566643998428 0.12416164044822965
Epoch:  47


0.1483659430130108 0.12439297352518354
Epoch    47: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  48


0.14815076057975357 0.12444328410284859
Epoch:  49


0.14797940809984464 0.1237149430172784
Epoch:  50


0.1481483127619769 0.12411253899335861
Epoch:  51


0.1472733568500828 0.12408156905855451
Epoch:  52


0.14732266761161186 0.12410839753491539
Epoch:  53


0.14878355852655462 0.12412755404199872
Epoch:  54


0.14800534538320592 0.12420083050216947
Epoch:  55


0.14830755462517609 0.12393826672009059
Epoch    55: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  56


0.14687624493160764 0.12390339055231639
Epoch:  57


0.14887233078479767 0.12411749469382423
Epoch:  58


0.1480764440588049 0.12379847360508782
Epoch:  59


0.1485346056319572 0.12404033541679382
Epoch:  60


0.14802252883846695 0.12379459717444011
Epoch:  61


0.14810095767717105 0.12370132335594722
Epoch:  62


0.14770086351278666 0.12411219945975713
Epoch:  63


0.147964745759964 0.12392478968415942
Epoch:  64


0.14830666861018618 0.1238824086529868
Epoch:  65


0.14802823115039515 0.1241459601691791
Epoch:  66


0.14760736396183838 0.123805114201137
Epoch:  67


0.1482788739172188 0.12360685957329613
Epoch:  68


0.14679173078085925 0.12428939874683108
Epoch:  69


0.14780683211378148 0.12353982137782234
Epoch:  70


0.14792166529475032 0.12433047486203057
Epoch:  71


0.14828725924363007 0.12370646532092776
Epoch:  72


0.14723399362048586 0.12406228589160102
Epoch:  73


0.14728341795302727 0.12492547290665763
Epoch:  74


0.14773171052739426 0.12384289396660668
Epoch:  75


0.14868593457582835 0.1241621630532401
Epoch    75: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  76


0.1468021068218592 0.12393233286482948
Epoch:  77


0.14836141144907153 0.12407293702874865
Epoch:  78


0.14752586024838524 0.12428145110607147
Epoch:  79


0.14796051342745084 0.1243075111082622
Epoch:  80


0.14842710986330704 0.12386221651520048
Epoch:  81


0.14699873932310054 0.1241847180894443
Epoch    81: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  82


0.1481901068945189 0.1242944672703743
Epoch:  83


0.14822668763431343 0.12375005973236901
Epoch:  84


0.14715101348387227 0.12355281306164605
Epoch:  85


0.14829702272608475 0.12385125564677375
Epoch:  86


0.14935220214160713 0.12403173851115364
Epoch:  87


0.14751359618998863 0.123932597892625
Epoch:  88


0.14743657571238442 0.12352247961929866
Epoch:  89


0.1481515749080761 0.12383202037640981
Epoch:  90


0.14852057276545344 0.12357252197606224
Epoch:  91


0.14708823569723078 0.12441146160875048
Epoch:  92


0.1484313357520748 0.12433407987867083
Epoch:  93


0.14698979540451154 0.12365623882838658
Epoch:  94


0.14844002957279617 0.12386801945311683
Epoch:  95


0.14795771643922134 0.1245886396084513
Epoch:  96


0.14785978761879173 0.1243106062923159
Epoch:  97


0.14791743739231214 0.12382325636489051
Epoch:  98


0.14769851839220202 0.1242573293192046
Epoch:  99


0.14711804526883202 0.1241454120193209
