In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build spectrogram inputs, per-annotator targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. ``df.reset_index()`` must yield an
        ``audio_filename`` column (i.e. the index is named ``audio_filename``).
        Columns are label indicators, including every key of
        ``unknown_to_known``.
    unknown_to_known : dict
        Maps an "unknown" label column to the list of known label columns
        that must be masked out whenever that unknown label is set (> 0.5).
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotation rows stacked per recording (rows numbered
        ``1..n_annotators`` in their original order within each file).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, frames, bins)``; each loaded array is transposed
        and truncated to its first ``n_frames`` rows.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the annotation had the matching unknown
        label set, 1 elsewhere.

    Rows of all three outputs are ordered by sorted ``audio_filename``.
    The caller's ``df`` is not modified.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the annotations of each file 1, 2, 3, ... in their original order
    df = df.reset_index()
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the unknown-label columns off from the known-label targets
    df_unknown = df.loc[:, unknown_cols]
    df = df.drop(columns=unknown_cols)

    # A known label is excluded from the loss for an annotation whenever the
    # corresponding unknown label was selected in that annotation
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so each annotator can be sliced out
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    def _stack_annotators(frame):
        # (n_files, n_labels) per annotator -> (n_files, n_labels, n_annotators)
        return np.stack(
            [frame.loc[[k], :].values for k in range(1, n_annotators + 1)],
            axis=2)

    y = _stack_annotators(df)
    y_mask = _stack_annotators(y_mask)

    # File order follows the first annotator's (sorted) rows
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Single module-level RandomErasing instance (constructed with its default
# arguments); AudioDataset.__getitem__ calls it on every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (input, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis. When ``transform`` is set, each
        sample must be a 3-D tensor (it is sliced as ``sample[:, i:, :]``).
    y : indexable
        Targets, indexed along the first axis like ``X``.
    weights : indexable
        Per-target weights/masks, indexed along the first axis like ``X``.
    transform : callable, optional
        Augmentation pipeline called as ``transform(image=ndarray)`` and
        expected to return a dict with an ``'image'`` entry. When falsy,
        samples are returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample if enabled."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would otherwise be divided by zero and
                # turn into NaNs; a unit range maps it to all zeros instead.
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: circular shift along axis 1 by a
            # random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (the pipeline takes a NumPy
            # image, hence the round trip through a PIL image)
            # NOTE(review): ToPILImage presumably quantizes the [0, 1] floats
            # to 8-bit - confirm this loss of precision is acceptable
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a metadata file that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map every fine label whose fine_id is 'X' to the list of fine labels with
# fine_id != 'X' that share its coarse category. The merge on 'coarse' gives
# the two 'fine' columns the default suffixes: fine_x comes from the
# fine_id == 'X' side and fine_y from the other side.
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# Fine labels whose fine_id is not 'X'
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Place the coarse and fine annotation tables side by side (column-wise).
# The validation frame is built from the '*_test' entries of the metadata.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Every row whose label values add up to exactly 37 is reset to all zeros,
# in the training and in the validation frame alike.
# NOTE(review): 37 presumably equals the number of label columns (a row with
# every label set) - confirm.
for split_df in (train_df, valid_df):
    saturated_rows = split_df.sum(axis=1) == 37
    split_df.loc[saturated_rows, :] = 0

In [5]:
# Build inputs, per-annotator targets and loss masks for each split.
# prepare_data reads one .npy spectrogram per recording from disk, so both
# calls perform file I/O.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics are computed on the training inputs only and then applied to
# both splits. reshape(-1, 128) collapses every axis except the last, so the
# last axis is assumed to have length 128 (one mean/std per position on it).
# NOTE: this cell overwrites train_X / valid_X, so running it a second time
# would normalize the already-normalized arrays again.
channel_means = train_X.reshape(-1, 128).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, 128).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Applied in order: small random shift/scale/rotation, grid distortion, and
# finally conversion of the image to a tensor.
shift_scale_rotate = ShiftScaleRotate(
    shift_limit=0.1,
    scale_limit=0.1,
    rotate_limit=0.5,
)
grid_distortion = GridDistortion()
albumentations_transform = Compose([shift_scale_rotate, grid_distortion, ToTensor()])

In [8]:
# Create the datasets and the dataloaders
train_tensors = [torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask)]

# Only the training set is augmented
train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain pairs of batches.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU only when CUDA is actually available; otherwise fall back
# to the CPU so the notebook still runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialize ``layer`` in place if it exposes ``reset_parameters``.

    Objects without a ``reset_parameters`` attribute are left untouched.
    Returns ``None``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default for the notebook parameter `until_x`; the "# Parameters" cell that
# follows assigns the value actually used. Task5Model compares the index of
# each MobileNetV2 feature block against it to decide which blocks to reset.
until_x = None

In [12]:
# Parameters
# NOTE(review): presumably injected by a notebook parameterization tool
# (e.g. papermill) - confirm. Overrides the `until_x = None` default above.
until_x = 18


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel inputs.

    The network consists of:

    * ``bw2col``: batch norm plus two 1x1 convolutions mapping the single
      input channel to 3 channels,
    * ``mv2``: a pretrained torchvision MobileNetV2, of which only the
      ``features`` part is used in ``forward``,
    * ``final``: a two-layer head producing ``num_classes`` logits.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_until : int, optional
        Feature blocks of ``mv2.features`` with index ``<= reset_until`` have
        their parameters re-initialized (pretrained weights discarded). When
        omitted, the module-level notebook parameter ``until_x`` is used; if
        that is ``None`` as well, no block is reset.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Fall back to the notebook-level parameter for existing callers
        if reset_until is None:
            reset_until = until_x

        # Reset until ith layer of mv2. A None threshold means "keep all
        # pretrained weights"; comparing an int with None would raise.
        if reset_until is not None:
            for i, block in enumerate(self.mv2.features.children()):
                if i <= reset_until:
                    block.apply(weight_reset)

    def forward(self, x):
        """Return ``(batch, num_classes)`` logits for a single-channel 4-D input."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: maximum over the last two axes
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits - presumably one per label column left after the
# "unknown" columns are removed in prepare_data; TODO confirm.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the epoch's validation loss in the training loop
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the label masks before summing
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def _masked_annotation_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over all unmasked (label, annotator) entries.

    ``outputs`` holds one prediction per sample and label, while ``labels``
    and ``masks`` carry an extra trailing axis with one slice per set of
    annotations. The element-wise ``criterion`` is evaluated against every
    annotation slice, weighted by the matching mask slice, summed, and
    divided by the total mask weight. The divisor is clamped to a tiny
    positive value so a fully masked batch yields 0 instead of NaN.
    """
    per_annotator = [
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2])]
    return sum(per_annotator) / masks.sum().clamp(min=1e-8)


epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
alpha = 1  # Beta(alpha, alpha) parameter of the mixup coefficients
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One coefficient per sample, shared by inputs, labels and masks
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])
        lam = torch.Tensor(mixup_vals)

        lam_inputs = lam.view(-1, 1, 1, 1)
        inputs = (lam_inputs * i1[0]) + ((1 - lam_inputs) * i2[0])

        lam_targets = lam.view(-1, 1, 1)
        labels = (lam_targets * i1[1]) + ((1 - lam_targets) * i2[1])
        masks = (lam_targets * i1[2]) + ((1 - lam_targets) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = _masked_annotation_loss(criterion, outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no mixup, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_annotation_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are means of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the final epoch is logged too
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss seen so far
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6308783808270017 0.4972983045237405
Epoch:  1


0.3154272271974667 0.25268983628068653
Epoch:  2


0.19268887107436722 0.2276868990489415
Epoch:  3


0.1813040045467583 0.18850829771586827
Epoch:  4


0.1791165479131647 0.1666381252663476
Epoch:  5


0.17511524139223872 0.17118155104773386
Epoch:  6


0.17573701247975632 0.1680612713098526
Epoch:  7


0.17436808307428617 0.16573089361190796
Epoch:  8


0.17341460569484815 0.16249398248536245
Epoch:  9


0.1711363784364752 0.1580936163663864
Epoch:  10


0.17031593701323947 0.15121049753257207
Epoch:  11


0.17083263518037023 0.15551709277289255
Epoch:  12


0.1697979939950479 0.15169700128691538
Epoch:  13


0.16844327788095217 0.15454564137118204
Epoch:  14


0.16928055036712336 0.1520626757826124
Epoch:  15


0.1687735411766413 0.16514721938541957
Epoch:  16


0.16825679427868612 58.9619380405971
Epoch    16: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  17


0.16584750325293154 0.1457867824605533
Epoch:  18


0.1641848824314169 0.1441519015601703
Epoch:  19


0.16499295589086171 0.14357334162507737
Epoch:  20


0.16206881363649625 0.14272222774369375
Epoch:  21


0.16365867128243317 0.1431898112807955
Epoch:  22


0.16434342152363546 0.14194762387445994
Epoch:  23


0.16231857763754354 0.14155362440007074
Epoch:  24


0.16276689679236025 0.14254270600421087
Epoch:  25


0.16367274120047287 0.14136040530034474
Epoch:  26


0.16266700262958939 0.14126900264195033
Epoch:  27


0.16423026049459302 0.14051022487027304
Epoch:  28


0.16287606390746864 0.13996563958270208
Epoch:  29


0.16199594977739695 0.13953096951757157
Epoch:  30


0.16216537275829832 0.14036452025175095
Epoch:  31


0.16365664955731984 0.1401356224502836
Epoch:  32


0.1629120562527631 0.14010068348475865
Epoch:  33


0.16176422786068273 0.13915481844118663
Epoch:  34


0.16224226395826083 0.1397945550935609
Epoch:  35


0.16186034598866025 0.14018490484782628
Epoch:  36


0.16171428480663816 0.1392158716917038
Epoch:  37


0.16166578515155897 0.13866711195026124
Epoch:  38


0.16114457233532056 0.13793797471693584
Epoch:  39


0.16234309165864377 0.13843948394060135
Epoch:  40


0.16194810295427167 0.14060818723269872
Epoch:  41


0.16045559217800964 0.13763556522982462
Epoch:  42


0.1605141380348721 0.1387526680316244
Epoch:  43


0.16155673724574013 0.1377579112138067
Epoch:  44


0.16055349159885096 0.13961871500526155
Epoch:  45


0.1615724732746949 0.1398358621767589
Epoch:  46


0.16063310326756658 0.13726810472352163
Epoch:  47


0.15959123824093793 0.139397317809718
Epoch:  48


0.16097410587040153 0.13708744730268205
Epoch:  49


0.16130289757573926 0.13992927755628312
Epoch:  50


0.161054799282873 0.13791358258043016
Epoch:  51


0.1608030131539783 0.13736043657575334
Epoch:  52


0.16077512422123472 0.13720614782401494
Epoch:  53


0.16045713787143295 0.13724627771547862
Epoch:  54


0.15971385828546575 0.13589057326316833
Epoch:  55


0.15952398527312922 0.13630068515028274
Epoch:  56


0.15957125014549978 0.13659308637891496
Epoch:  57


0.16000010798106323 0.13647806644439697
Epoch:  58


0.16017290224900116 0.13619006212268556
Epoch:  59


0.15907646997554883 0.13601551204919815
Epoch:  60


0.159998772514833 0.136356743318694
Epoch    60: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  61


0.15819972834071597 0.1347292001758303
Epoch:  62


0.15751169219210343 0.13423840701580048
Epoch:  63


0.1585757833074879 0.13509496194975718
Epoch:  64


0.15681406775036374 0.1339009925723076
Epoch:  65


0.15857096620508143 0.13422169110604695
Epoch:  66


0.1571075392735971 0.1340556421450206
Epoch:  67


0.15824046328261093 0.13402200596673147
Epoch:  68


0.1581994214573422 0.13393590173551015
Epoch:  69


0.158221586733251 0.13397483208349772
Epoch:  70


0.15790210422631856 0.13406554077352797
Epoch    70: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  71


0.1587301794741605 0.13410905855042593
Epoch:  72


0.15785889488619728 0.13382725843361445
Epoch:  73


0.15825238944710912 0.1341911181807518
Epoch:  74


0.15954842317748713 0.13465834834745952
Epoch:  75


0.1584435914819305 0.1345102254833494
Epoch:  76


0.15836632493379954 0.1347314225775855
Epoch:  77


0.15801754271661914 0.13397833172764098
Epoch:  78


0.15862670822723493 0.13379019711698806
Epoch:  79


0.15839212609303965 0.13369839744908468
Epoch:  80


0.15922787704983274 0.13427469453641347
Epoch:  81


0.1571304371228089 0.13416393207652227
Epoch:  82


0.1586177059122034 0.13412940821477345
Epoch:  83


0.1590682924599261 0.1339388063975743
Epoch:  84


0.1589529337109746 0.13429645129612514
Epoch:  85


0.1592821801030958 0.1341638479913984
Epoch    85: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  86


0.15823687894924268 0.13419833140713827
Epoch:  87


0.15818040556198842 0.13359196377652033
Epoch:  88


0.15888324259100733 0.1348478091614587
Epoch:  89


0.15966914835813883 0.13357249540942057
Epoch:  90


0.158798574595838 0.1340402833053044
Epoch:  91


0.1583010332809912 0.13361431338957377
Epoch:  92


0.15774421434144717 0.1337452701159886
Epoch:  93


0.15829807399092494 0.133944177201816
Epoch:  94


0.1580378219888017 0.1335973303232874
Epoch:  95


0.15914603826161977 0.13398135985646928
Epoch    95: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  96


0.15738720789149002 0.133947776896613
Epoch:  97


0.15773547178990133 0.13447210299117224
Epoch:  98


0.1588833992545669 0.13475126773118973
Epoch:  99


0.15736070436400337 0.13386029324361257
