In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Turn an annotation table into spectrogram inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. ``reset_index()`` must yield an
        ``audio_filename`` column (i.e. the index is named ``audio_filename``).
        All remaining columns are treated as label columns.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known label
        columns it makes uncertain. The unknown columns are removed from the
        targets; wherever an unknown column is > 0.5 the mapped known columns
        get mask value 0 for that annotation row.
    spec_dir : str, optional
        Directory containing one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotation rows (numbered 1..n_annotators in table order)
        stacked per recording. Extra rows of a recording are ignored.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, T, F)`` where ``(T, F)`` is the shape of the
        transposed, cropped array loaded for each file.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target counts towards the loss,
        0 where it was masked out through ``unknown_to_known``.

    Raises
    ------
    ValueError
        If ``n_annotators`` is < 1 or some recording has fewer than
        ``n_annotators`` annotation rows.
    """
    if n_annotators < 1:
        raise ValueError('n_annotators must be at least 1')

    df = df.reset_index()
    # Number the annotation rows of every recording 1, 2, 3, ... in table order.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from "everything counts" and zero the known labels that an
    # unknown flag makes uncertain.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that .loc[[k]] selects the k-th
    # annotation of every recording, sorted by file name.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slnos = list(range(1, n_annotators + 1))
    annotation_no = df.index.get_level_values(0)
    rows_per_slno = [int((annotation_no == k).sum()) for k in slnos]
    if rows_per_slno[0] == 0 or len(set(rows_per_slno)) != 1:
        # Numbering is contiguous from 1, so equal counts mean every recording
        # has at least n_annotators rows and the slices below line up.
        raise ValueError(
            'every recording needs at least {} annotation rows; rows found '
            'per annotation number: {}'.format(n_annotators, rows_per_slno))

    y = np.stack([df.loc[[k], :].values for k in slnos], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in slnos], axis=2)

    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared augmentation object, built with RandomErasing's default arguments.
# AudioDataset.__getitem__ looks this name up at module level and applies it
# to every sample that goes through the augmentation branch.
# NOTE(review): RandomErasing is a project-local module imported via
# sys.path.append(".."); its defaults are not visible here.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (input, target, loss-weight) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs, indexed along the first axis. The augmentation branch treats
        each sample as ``(1, H, W)`` and rolls it along axis 1.
    y : torch.Tensor
        Targets, indexed along the first axis.
    weights : torch.Tensor
        Per-target weights/masks, indexed along the first axis.
    transform : callable or None
        Called as ``transform(image=array)`` and expected to return a mapping
        with an ``'image'`` entry. When falsy, samples are returned unchanged.
    erasing : callable or None
        Applied to the augmented tensor. When ``None`` the module-level
        ``random_erasing`` object is used, as before.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weights)`` for ``idx``, augmented if enabled."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation to [0, 1] for the image-based transforms
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would otherwise become 0/0 = NaN and
                # poison the loss; a unit range maps it to all zeros instead.
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the leading singleton axis and swap the last two axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (injected callable, else the shared global)
            erase = self.erasing if self.erasing is not None else random_erasing
            sample = erase(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: pickle.load executes arbitrary code embedded in the file; only use
# this with a metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the 'X' ("unknown") fine labels and the rest.
taxonomy_df = metadata['taxonomy_df']
unknown_rows = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# Pair every unknown label with the known labels of the same coarse class:
# {unknown fine label: [known fine labels sharing its coarse class]}.
label_pairs = pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
label_pairs = label_pairs.drop(columns='coarse')
unknown_to_known = label_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_rows.fine.tolist()

# Coarse and fine annotation tables side by side, one frame per split.
train_df = pd.concat(
    [metadata[key] for key in ('coarse_train', 'fine_train')],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata[key] for key in ('coarse_test', 'fine_test')],
    axis=1, sort=True)

In [4]:
# manual correction for one data point
# Every row whose columns sum to exactly 37 is reset to all zeros, in both
# splits. NOTE(review): 37 presumably means "every label column set" — confirm.
for split_df in (train_df, valid_df):
    row_sums_to_37 = split_df.sum(axis=1) == 37
    split_df.loc[row_sums_to_37, :] = 0

In [5]:
# Build (inputs, targets, loss masks) for each split; prepare_data loads one
# .npy array from disk per recording, so this cell does the heavy file I/O.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Mean/std are computed per position of the last axis, over the training
# split only, and the same statistics are then applied to the validation
# split. The channel count is read from the data instead of assuming 128.
# NOTE: train_X / valid_X are rebound here, so re-running this cell without
# re-running the previous one normalizes already-normalized data.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# The steps run in list order; the last one converts the result to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
# Only the training dataset gets the augmentation pipeline.
train_dataset = AudioDataset(
    *(torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask)),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    *(torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask)),
    transform=None)

# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain random sample pairs for mixup.
val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Call ``layer.reset_parameters()`` if the layer provides it; else do nothing.

    Intended to be passed to ``Module.apply`` so that every sub-module that
    exposes ``reset_parameters`` gets re-initialized.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        # Layers without a reset hook are left untouched.
        return
    reset()

In [11]:
# Placeholder default for the notebook parameter; the next cell overwrites it.
# Task5Model.__init__ evaluates `i <= until_x`, which raises TypeError if this
# is still None when the model is built.
until_x = None

In [12]:
# Parameters
# Index of the last block of mv2.features whose weights Task5Model
# re-initializes (blocks 0..until_x inclusive); it is also embedded in the
# checkpoint file name written by the training loop.
# NOTE(review): this cell looks tool-injected (e.g. papermill) — confirm.
until_x = 16


In [13]:
class Task5Model(nn.Module):
    """Single-channel input -> MobileNetV2 feature extractor -> linear head.

    ``bw2col`` turns the 1-channel input into 3 channels with two 1x1
    convolutions so it can be fed to the ImageNet-pretrained MobileNetV2
    features. The resulting feature map is max-pooled over its last two axes
    and classified by ``final``, which outputs ``num_classes`` raw logits.

    The first blocks of ``mv2.features`` (indices ``0..until_x`` inclusive,
    where ``until_x`` is a notebook-level variable) are re-initialized with
    ``weight_reset``; the remaining blocks keep their pretrained weights.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels with pointwise convolutions.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(in_channels=1, out_channels=10, kernel_size=1, padding=0),
            nn.ReLU(),
            nn.Conv2d(in_channels=10, out_channels=3, kernel_size=1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classification head on the 1280 pooled features.
        self.final = nn.Sequential(
            nn.Linear(in_features=1280, out_features=512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(in_features=512, out_features=num_classes),
        )

        # Reset until ith layer of mv2
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                block.apply(weight_reset)

    def forward(self, x):
        """Return one logit per class for a batch of single-channel inputs."""
        three_channel = self.bw2col(x)
        feature_map = self.mv2.features(three_channel)
        # Global max pooling: reduce the last axis, then the (new) last axis.
        pooled = feature_map.max(dim=-1)[0].max(dim=-1)[0]
        return self.final(pooled)

In [14]:
# Instantiate the model
# 31 is the number of output logits and must equal the size of axis 1 of the
# target tensors used in the loss.
# NOTE(review): presumably len(known_labels) — confirm.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=1e-3, amsgrad=True)
# Cut the learning rate by 10x once the monitored value (the validation loss
# passed to scheduler.step) has not improved for more than 5 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=5, verbose=True)
# Element-wise loss (no reduction) so that it can be multiplied by the
# per-target masks before being summed.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1               # Beta(alpha, alpha) parameter for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss averaged over all unmasked targets.

    ``outputs`` holds one logit per class; ``labels`` and ``masks`` carry one
    slice per annotator on their last axis. The element-wise loss is computed
    against each annotator's labels, multiplied by that annotator's mask,
    summed, and finally divided by the total mask weight.
    """
    per_annotator = [
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2])
    ]
    return sum(per_annotator) / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0.0
    # The two loaders shuffle the same dataset independently, so zipping them
    # yields random pairs of batches to blend.
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One mixing coefficient per sample, shared by inputs, labels, masks.
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * batch_a[0]) + ((1 - lam) * batch_b[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * batch_a[1]) + ((1 - lam) * batch_b[1])
        masks = (lam * batch_a[2]) + ((1 - lam) * batch_b[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no mixup, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Keep only the checkpoint with the best validation loss so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

Epoch:  0


0.628537608159555 0.6331470779010228
Epoch:  1


0.32345347388370616 0.20044443224157607
Epoch:  2


0.19229932250203313 0.17100583016872406
Epoch:  3


0.18202437782609784 0.1685215745653425
Epoch:  4


0.17978037007757136 0.17154763638973236
Epoch:  5


0.17712744466356328 0.16520899534225464
Epoch:  6


0.17684231859606667 0.1596662210566657
Epoch:  7


0.17487399159251032 0.26558476260730196
Epoch:  8


0.17395901357805407 0.16015166257108962
Epoch:  9


0.1753914372341053 0.15722871039594924
Epoch:  10


0.17330499114216985 0.16454301561628068
Epoch:  11


0.1728972917472994 52.513192313058035
Epoch:  12


0.1719907405408653 0.15995913531099046
Epoch:  13


0.170701419179504 0.15269733113901957
Epoch:  14


0.170471135426212 0.1635808518954686
Epoch:  15


0.16975596144392685 0.15164549861635482
Epoch:  16


0.16771436261164174 0.14989733057362692
Epoch:  17


0.16910727201281367 0.15731593327862875
Epoch:  18


0.1695038905820331 0.1513854946408953
Epoch:  19


0.16924282868166227 0.16781000367232732
Epoch:  20


0.1670187514375996 0.14519090524741582
Epoch:  21


0.16614337587678754 0.15454459403242385
Epoch:  22


0.164963904667545 0.1453924115215029
Epoch:  23


0.1640959180690147 0.14238186925649643
Epoch:  24


0.1641503818131782 0.15034358203411102
Epoch:  25


0.1648188390441843 0.14474759783063615
Epoch:  26


0.16329765360097628 0.14428107014724187
Epoch:  27


0.1626699876946372 0.14678125509193965
Epoch:  28


0.1632931168014939 0.15644410039697373
Epoch:  29


0.16274741412820043 0.14151981792279653
Epoch:  30


0.16093739343656077 0.13995794100420816
Epoch:  31


0.16281311495884046 0.16727374494075775
Epoch:  32


0.16054873490655744 0.1374944363321577
Epoch:  33


0.16011520334192225 0.1376710193497794
Epoch:  34


0.15972162219318184 0.13681352457829885
Epoch:  35


0.1594402004738112 0.14947482624224254
Epoch:  36


0.15860996133572347 0.13617367510284697
Epoch:  37


0.15941122657543905 0.13842319697141647
Epoch:  38


0.15885049789338498 0.13677304238080978
Epoch:  39


0.1608950161450618 0.1434918769768306
Epoch:  40


0.15986532818626714 0.13892999504293715
Epoch:  41


0.16003549260062142 0.1360506330217634
Epoch:  42


0.15737723498731046 0.13659447325127466
Epoch:  43


0.15772761444787722 0.134752556681633
Epoch:  44


0.15754889435059316 0.13312926249844687
Epoch:  45


0.1581234219106468 0.13629779432501113
Epoch:  46


0.15813675442257444 0.13502101280859538
Epoch:  47


0.15724113623838168 0.13399064860173635
Epoch:  48


0.15581652240173235 0.13232736395938055
Epoch:  49


0.15743351345126694 0.13348357272999628
Epoch:  50


0.15766502514078812 0.1328114920428821
Epoch:  51


0.1568157032534883 0.13529214901583536
Epoch:  52


0.15517251354617043 0.13108807163579123
Epoch:  53


0.15609371863506935 0.13354482288871491
Epoch:  54


0.15535841479494766 0.1318266945225852
Epoch:  55


0.1549570153693895 0.13434202543326787
Epoch:  56


0.1558184494843354 0.13536152988672256
Epoch:  57


0.15516164294771245 0.18558590752737864
Epoch:  58


0.15540582908166423 0.13385588356426784
Epoch    58: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  59


0.1556855395839021 0.12924496297325408
Epoch:  60


0.1532815872011958 0.12842246890068054
Epoch:  61


0.15228935834523794 0.1287875207407134
Epoch:  62


0.1518083762478184 0.1284524585519518
Epoch:  63


0.15356405361278638 0.12840636713164194
Epoch:  64


0.15106742688127467 0.128164532993521
Epoch:  65


0.15247669373009656 0.12840349333626883
Epoch:  66


0.15151811733439163 0.12796611551727569
Epoch:  67


0.1534677627118858 0.1281995666878564
Epoch:  68


0.15195599077521144 0.1281788232071059
Epoch:  69


0.15144951238825516 0.12810414071593965
Epoch:  70


0.15256872370436386 0.12819366795676096
Epoch:  71


0.15116951030653877 0.12769107946327754
Epoch:  72


0.15208446133781123 0.1278155626995223
Epoch:  73


0.15176256846737218 0.12788920210940496
Epoch:  74


0.1524029245247712 0.12790691001074656
Epoch:  75


0.15168829745537526 0.1277770453265735
Epoch:  76


0.15111741826340958 0.12806453449385508
Epoch:  77


0.1517756742400092 0.12865169559206283
Epoch    77: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  78


0.15127255183619423 0.1280708419425147
Epoch:  79


0.15142047687156782 0.12771719587700708
Epoch:  80


0.1507928798327575 0.1284023800066539
Epoch:  81


0.1512541142669884 0.12746898723500116
Epoch:  82


0.15102795370527217 0.1278147143977029
Epoch:  83


0.15127573383821025 0.12782098352909088
Epoch:  84


0.15091281686280225 0.12740468659571239
Epoch:  85


0.15119068284292478 0.1274950908763068
Epoch:  86


0.1507268986991934 0.1277441339833396
Epoch:  87


0.15200231043068138 0.12775512784719467
Epoch:  88


0.1511156450252275 0.127349496952125
Epoch:  89


0.1501632393211932 0.12777227056877954
Epoch:  90


0.15161957773002419 0.1276118861777442
Epoch:  91


0.15104498774618716 0.12767530658415385
Epoch:  92


0.15141366905457265 0.12787600925990514
Epoch:  93


0.15130675805581584 0.12784401327371597
Epoch:  94


0.1520127996399596 0.12742251477071218
Epoch    94: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  95


0.1495860102208885 0.12764588743448257
Epoch:  96


0.15080429694137057 0.12766858296734945
Epoch:  97


0.15019256198728406 0.1274352999670165
Epoch:  98


0.1510195236753773 0.1274480457816805
Epoch:  99


0.14998099167604703 0.1282054547752653
