In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known):
    """Turn an annotation table into spectrogram inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table that exposes an ``audio_filename`` column after
        ``reset_index()``. Rows belonging to the same file are numbered
        1, 2, 3, ... in order of appearance; rows 1, 2 and 3 of every file are
        used, so each file needs all three. Must contain every column listed
        as a key of ``unknown_to_known``.
    unknown_to_known : dict
        Maps an "unknown" label column to the list of "known" label columns
        whose mask is zeroed on rows where that unknown label exceeds 0.5.

    Returns
    -------
    X : numpy.ndarray
        One array per file, loaded from ``../../data/logmelspec/<file>.npy``,
        transposed and cropped to its first 635 rows, stacked along axis 0
        with a singleton axis inserted at position 1.
    y : numpy.ndarray
        Label values of rows 1, 2 and 3 stacked on a trailing axis, i.e.
        ``(files, label columns, 3)``. The unknown columns are not included.
    y_mask : numpy.ndarray
        Same layout as ``y``; 1 by default, 0 where the label was masked.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the rows of each file 1, 2, 3, ... and index by (file, number).
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Keep the unknown columns aside; the remaining columns are the targets.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Mask starts as all ones; known labels are zeroed wherever the matching
    # unknown label is set on the same row.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        flagged_rows = df_unknown[unknown] > 0.5
        mask_df.loc[flagged_rows, known] = 0

    # Bring the row number to the outer index level so that each of the three
    # annotation sets can be selected as one contiguous slice.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = (1, 2, 3)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in annotation_ids], axis=2)

    # Load the spectrograms in the same file order as the first annotation set.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    spectrograms = [
        np.load('../../data/logmelspec/{}.npy'.format(name)).T[:635, :]
        for name in filenames
    ]
    X = np.stack(spectrograms, axis=0)[:, np.newaxis]

    return X, y, y_mask


# Shared RandomErasing augmenter built with its default arguments; it is
# looked up as a module-level name by AudioDataset.__getitem__.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When a
    ``transform`` is supplied, every access augments the sample: min-max
    scaling, a random cyclic shift along axis 1, the ``transform`` itself
    (called as ``transform(image=...)`` and expected to return a mapping with
    an ``'image'`` entry), the module-level ``random_erasing``, and finally the
    inverse of the min-max scaling. Without a ``transform`` the stored sample
    is returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        """Store the arrays and the optional augmentation pipeline."""
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converter used to hand the sample to the transform as an image array.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples, i.e. the length of the first axis of ``X``."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the (optionally augmented) sample with its target and weight."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            span = this_max - this_min
            # A constant sample has zero span; dividing by it would fill the
            # sample with NaNs before the image conversion. Divide by one
            # instead. The inverse scaling below still multiplies by the true
            # span, so such a sample comes back as its original constant.
            safe_span = span if span > 0 else torch.ones_like(span)
            sample = (sample - this_min) / safe_span

            # randomly cycle the file
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # Add a leading axis and swap the last two axes of the result.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * span) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load metadata.pkl when it comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map every "unknown" fine label (taxonomy rows with fine_id == 'X') to the
# list of known fine labels (fine_id != 'X') that share its coarse category.
# The self-merge on 'coarse' suffixes the two 'fine' columns as fine_x (the
# unknown side) and fine_y (the known side).
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# Names of all fine labels whose fine_id is not 'X'.
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Place the coarse and fine annotation tables side by side (column-wise
# concatenation aligned on the index) for each split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Every row whose values sum to exactly 37 is overwritten with zeros, in place.
# NOTE(review): 37 is presumably the total number of label columns, i.e. a row
# with every label set — confirm against the annotation tables.
train_df.loc[(train_df.sum(axis=1) == 37).copy(), :] = 0
valid_df.loc[(valid_df.sum(axis=1) == 37).copy(), :] = 0

In [5]:
# Build the spectrogram inputs, the per-annotation targets and the loss masks
# for the training and validation splits.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Mean and standard deviation are computed on the training split only, one
# value per position of the last axis (the reshape assumes that axis has
# length 128), and then applied to both splits through broadcasting.
# NOTE(review): train_X / valid_X are overwritten, so re-running this cell
# normalizes the already-normalized arrays a second time.
channel_means = train_X.reshape(-1, 128).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, 128).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Applied in this order: random shift/scale/rotate (limits 0.1 / 0.1 / 0.5),
# grid distortion with the library defaults, then conversion to a tensor.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
# Only the training dataset receives the augmentation pipeline; the validation
# dataset is built with transform=None.
train_dataset = AudioDataset(torch.Tensor(train_X),
                             torch.Tensor(train_y),
                             torch.Tensor(train_y_mask),
                             albumentations_transform)
valid_dataset = AudioDataset(torch.Tensor(valid_X),
                             torch.Tensor(valid_y),
                             torch.Tensor(valid_y_mask),
                             None)

# Batch size 64 everywhere. The two training loaders shuffle the same dataset
# independently; the training loop zips them to pair batches for mixup.
val_loader = DataLoader(valid_dataset, 64, shuffle=False)
train_loader_1 = DataLoader(train_dataset, 64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, 64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is usable and fall back to the CPU otherwise, so
# the notebook does not fail later at `.to(device)` on a machine without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` through its own ``reset_parameters()``.

    Objects that do not expose a ``reset_parameters`` attribute are left
    untouched. Always returns ``None``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Placeholder for the index of the last MobileNetV2 feature block that
# Task5Model re-initialises. It must be given an integer value before the
# model is built, because Task5Model evaluates `i <= until_x`.
until_x = None

In [12]:
# Parameters
# Feature blocks with index <= until_x are re-initialised by Task5Model;
# indices start at 0, so -1 leaves every pretrained block untouched.
until_x = -1


In [13]:
class Task5Model(nn.Module):
    """Single-channel input -> pretrained MobileNetV2 features -> class logits.

    A 1x1-convolution stem expands the one input channel to three, the
    MobileNetV2 feature extractor processes the result, the feature map is
    max-pooled over its last two axes, and a two-layer head produces
    ``num_classes`` outputs. Feature blocks whose index is at most the
    module-level ``until_x`` are re-initialised with ``weight_reset``.
    """

    def __init__(self, num_classes):
        """Build the stem, the pretrained backbone and the classification head."""

        super().__init__()

        # Batch norm followed by 1x1 convolutions: 1 -> 10 -> 3 channels.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Head mapping the 1280 pooled features to the class outputs.
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Reset until ith layer of mv2
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                block.apply(weight_reset)

    def forward(self, x):
        """Return the head outputs for the batch ``x``."""
        three_channel = self.bw2col(x)
        feature_map = self.mv2.features(three_channel)
        # Max over the last axis, then over the new last axis.
        pooled_last, _ = feature_map.max(dim=-1)
        pooled, _ = pooled_last.max(dim=-1)
        return self.final(pooled)

In [14]:
# Instantiate the model
# 31 outputs. NOTE(review): presumably the number of label columns that remain
# after prepare_data drops the "unknown" ones — confirm against train_y.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The training loop steps the scheduler with the epoch's validation loss.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them by the masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
epochs = 100
# Stop once the validation loss has not reached a new minimum for this many
# consecutive epochs.
early_stopping_patience = 25
# Both parameters of the Beta distribution the mixup coefficients are drawn from.
alpha = 1
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_loss(outputs, labels, masks):
    """Mask-weighted mean of the module-level ``criterion`` over all annotation sets.

    ``labels`` and ``masks`` hold one annotation set per index of their last
    axis. The same ``outputs`` are scored against each set, every element-wise
    loss is multiplied by its mask, and the grand total is divided by the sum
    of the masks.
    """
    total = 0
    for k in range(labels.shape[-1]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently, so each step
    # pairs a batch with another random batch to mix it with.
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One coefficient per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            this_epoch_valid_loss += masked_loss(outputs, labels, masks).item()

    # Average the per-batch losses over the number of batches.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6327653237291284 0.4854701927730015
Epoch:  1


0.32067248064118464 0.19094127203736985
Epoch:  2


0.17872285238794378 0.26585044605391367
Epoch:  3


0.1660598061374716 0.14244872012308665
Epoch:  4


0.16095717452667854 0.14543000289372035
Epoch:  5


0.15823878831154592 0.13940095262868063
Epoch:  6


0.15631223812296585 0.1342687181064061
Epoch:  7


0.15541625304802045 0.1395192359175001
Epoch:  8


0.15495085071873022 0.14075971926961625
Epoch:  9


0.15542255744740768 0.1350224528993879
Epoch:  10


0.1546529201236931 0.1292822329061372
Epoch:  11


0.15329581821287 0.13142039520399912
Epoch:  12


0.15142871278363304 0.13101590531212942
Epoch:  13


0.1519981618668582 0.13274863894496644
Epoch:  14


0.15123027563095093 0.1297976097890309
Epoch:  15


0.15195709668301247 0.12799542929444993
Epoch:  16


0.15142221789102298 0.1313356767807688
Epoch:  17


0.15070846717099887 0.129582836159638
Epoch:  18


0.14999813526063352 0.13096544785158976
Epoch:  19


0.1501683967339026 0.13375164568424225
Epoch:  20


0.15204314766703425 0.13112717228276388
Epoch:  21


0.15009180355716395 0.1273845891867365
Epoch:  22


0.15069625949537432 0.1317654081753322
Epoch:  23


0.1498440375199189 0.13350209700209753
Epoch:  24


0.14960377844604286 0.12908248709780829
Epoch:  25


0.14782147995523504 0.13072024072919572
Epoch:  26


0.14805606329763257 0.13482702523469925
Epoch:  27


0.14850229226254127 0.12541605425732477
Epoch:  28


0.14743935417484594 0.12785579796348298
Epoch:  29


0.14743238526421623 0.12565687085900987
Epoch:  30


0.14698533270810102 0.12657060687031066
Epoch:  31


0.14712491631507874 0.12500674171107157
Epoch:  32


0.14684529602527618 0.127739103777068
Epoch:  33


0.14590813098727046 0.1295263490506581
Epoch:  34


0.14683366909220413 0.1276253280895097
Epoch:  35


0.14595249738242175 0.12693148957831518
Epoch:  36


0.14604391238173922 0.1393646661724363
Epoch:  37


0.14577449938735446 0.12811733675854547
Epoch    37: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  38


0.14570151792990194 0.12277560255357198
Epoch:  39


0.1431958276677776 0.12323786850486483
Epoch:  40


0.1426637072015453 0.12687328777142934
Epoch:  41


0.14291189772051735 0.12308474310806819
Epoch:  42


0.14332734652467677 0.12217073142528534
Epoch:  43


0.14157553782334198 0.12152108337197985
Epoch:  44


0.14093582976508784 0.12149019007171903
Epoch:  45


0.14208669477217906 0.12105368609939303
Epoch:  46


0.1422454400642498 0.12196690482752663
Epoch:  47


0.1413983348253611 0.12138453125953674
Epoch:  48


0.14024557535712784 0.12128108526979174
Epoch:  49


0.14103438604522395 0.12192785420588084
Epoch:  50


0.14078178437980446 0.12165584840944835
Epoch:  51


0.14111519463964411 0.1211632809468678
Epoch    51: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  52


0.14072346626906782 0.12118738676820483
Epoch:  53


0.14170226897742297 0.12131656493459429
Epoch:  54


0.13969837209662875 0.12127921091658729
Epoch:  55


0.13963021337985992 0.12091118310179029
Epoch:  56


0.14051756504419688 0.12118011393717357
Epoch:  57


0.14096476419551954 0.1211757819567408
Epoch:  58


0.14085973074307312 0.12117483466863632
Epoch:  59


0.13958821908847704 0.12097959220409393
Epoch:  60


0.14115498235096802 0.12117912513869149
Epoch:  61


0.13902280902540362 0.12138107312577111
Epoch    61: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  62


0.13928859499660698 0.12101270471300397
Epoch:  63


0.14001721183995944 0.12124943413904735
Epoch:  64


0.1392188809207968 0.12121751691613879
Epoch:  65


0.1398486964203216 0.12107057550123759
Epoch:  66


0.14046604448073619 0.12094664999416896
Epoch:  67


0.13938209132568255 0.12113229291779655
Epoch    67: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  68


0.14069557914862763 0.12087090419871467
Epoch:  69


0.14048602391739148 0.12131566767181669
Epoch:  70


0.14076266578725866 0.1218158198254449
Epoch:  71


0.13980459562830022 0.12097934314182826
Epoch:  72


0.13984737404294917 0.12115216893809182
Epoch:  73


0.13899659949379997 0.12099457106419972
Epoch:  74


0.138230212234162 0.1209888585976192
Epoch    74: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  75


0.13927135556130796 0.1211014815739223
Epoch:  76


0.14022392578221657 0.12099773223911013
Epoch:  77


0.13985812100204262 0.12102802736418587
Epoch:  78


0.14112077854775093 0.12145457310335976
Epoch:  79


0.1409154087305069 0.12114003619977406
Epoch:  80


0.1400030202962257 0.12105524327073779
Epoch:  81


0.13967545209704219 0.12222455335514885
Epoch:  82


0.1400417419704231 0.12122091118778501
Epoch:  83


0.13955866867626035 0.12109818096671786
Epoch:  84


0.13971140134978938 0.12120167698178973
Epoch:  85


0.14027823105051712 0.12106919501508985
Epoch:  86


0.14008207156046018 0.12111408795629229
Epoch:  87


0.139496694142754 0.12104926045451846
Epoch:  88


0.13999191087645455 0.12137683161667415
Epoch:  89


0.13935239935243451 0.12124734691211156
Epoch:  90


0.1392787147212673 0.12116440598453794
Epoch:  91


0.14048136972092293 0.12113091030291148
Epoch:  92


0.13960447544987137 0.12114941541637693
Epoch:  93
