In [1]:
# Parameters
# until_x: position within the MobileNetV2 feature blocks; Task5Model re-initialises
# every feature block whose index is greater than this value, and the value is also
# embedded in the checkpoint filename written by the training loop.
until_x = 8


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table with several rows per recording. After
        ``reset_index()`` it must expose an ``audio_filename`` column
        (i.e. the index is expected to be named ``audio_filename``).
        Rows of one recording are numbered 1, 2, 3, ... in their order of
        appearance; only numbers 1 to 3 are used below, so every recording
        is assumed to carry exactly three rows.
    unknown_to_known : dict
        Maps the name of an "unknown" column to the list of "known" columns
        it shadows. The unknown columns are removed from the targets.

    Returns
    -------
    X : numpy.ndarray
        Spectrograms loaded from ``../../data/logmelspec/<audio_filename>.npy``,
        transposed, cut to at most 635 rows and stacked with a singleton
        second axis: ``(n_files, 1, rows, columns)``.
    y : numpy.ndarray
        Target values, shape ``(n_files, n_known_columns, 3)``; the last axis
        is the per-recording row number.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the matching unknown column of that row
        exceeds 0.5, 1 everywhere else.
    """
    unknown_cols = list(unknown_to_known.keys())

    # number the rows of each recording 1..k and make that part of the index
    annotations = df.reset_index()
    annotations['slno'] = (
        annotations.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    )
    annotations = annotations.set_index(['audio_filename', 'slno'])

    # split the table into the unknown indicators and the actual targets
    unknown_part = annotations[unknown_cols].copy()
    targets = annotations.drop(columns=unknown_cols)

    # start from an all-ones mask and blank the known columns shadowed by
    # an active unknown column
    mask_frame = targets.copy()
    mask_frame.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        is_unknown_active = unknown_part[unknown] > 0.5
        mask_frame.loc[is_unknown_active, known] = 0

    # bring the row number to the outer index level so that .loc[[k]] yields
    # one row per recording, ordered by audio_filename
    targets = targets.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_frame = mask_frame.swaplevel(i=1, j=0, axis=0).sort_index()

    row_numbers = (1, 2, 3)
    y = np.stack([targets.loc[[k], :].values for k in row_numbers], axis=2)
    y_mask = np.stack([mask_frame.loc[[k], :].values for k in row_numbers], axis=2)

    # load the spectrograms in the same recording order as y / y_mask
    filenames = (
        targets.loc[[1], :].index.get_level_values('audio_filename').tolist()
    )
    spectrograms = [
        np.load('../../data/logmelspec/{}.npy'.format(name)).T[:635, :]
        for name in filenames
    ]
    X = np.stack(spectrograms, axis=0)[:, np.newaxis]

    return X, y, y_mask


# Module-level RandomErasing augmenter built with its default arguments; it is
# called from AudioDataset.__getitem__ on every augmented sample. RandomErasing
# is imported from the project-local module one directory up ("..").
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with targets and per-target weights.

    Parameters
    ----------
    X : torch.Tensor
        Samples stacked along the first axis. When ``transform`` is given each
        sample ``X[idx]`` is treated as a 3-D tensor (it is sliced as
        ``sample[:, i:, :]``).
    y : torch.Tensor
        Targets, indexed along the first axis like ``X``.
    weights : torch.Tensor
        Weights / masks, indexed along the first axis like ``X``.
    transform : callable, optional
        Augmentation pipeline called as ``transform(image=array)`` and expected
        to return a mapping with an ``'image'`` entry holding a 2-D torch
        tensor. When falsy, samples are returned without any augmentation.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # tensor -> PIL image converter used before handing data to `transform`
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for position ``idx``.

        With a transform configured the sample is min-max scaled, rolled by a
        random offset along axis 1, passed through the transform and the
        module-level ``random_erasing`` augmenter, and finally mapped back to
        its original value range.
        """
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has a zero range; dividing by it would fill the
            # sample with NaN. Divide by 1 instead (the scaled sample is then
            # all zeros) and keep the true range for the inverse mapping so the
            # result collapses back to the constant value.
            if value_range > 0:
                safe_range = value_range
            else:
                safe_range = torch.ones_like(value_range)
            sample = (sample - this_min) / safe_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on NumPy arrays,
            # hence the round trip through a PIL image)
            # NOTE(review): ToPILImage presumably quantises the scaled values
            # to 8 bit - confirm this loss of precision is intended
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the last two axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# point this at a metadata file from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# For every taxonomy row whose fine_id is 'X', collect the `fine` labels of the
# rows with fine_id != 'X' that share its `coarse` value. The self-merge on
# 'coarse' yields the suffixed columns fine_x (fine_id == 'X' side) and fine_y
# (fine_id != 'X' side); the result is a dict {fine_x label: [fine_y labels]}.
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# `fine` labels of all taxonomy rows whose fine_id is not 'X'
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Place the coarse and fine annotation tables side by side (column-wise concat)
# for the train split and for the test split, which serves as validation data.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Every row whose columns sum to exactly 37 is overwritten with zeros in all
# columns (in place, on both splits).
# NOTE(review): 37 presumably identifies an annotation with every label set -
# confirm against the data before changing the label set.
train_df.loc[(train_df.sum(axis=1) == 37).copy(), :] = 0
valid_df.loc[(valid_df.sum(axis=1) == 37).copy(), :] = 0

In [6]:
# Build the spectrogram inputs, the targets and the loss masks for each split
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean and standard deviation are computed on the training data only, one value
# per position of the last axis (its length is hard-coded to 128 here), and are
# reshaped to (1, 1, 1, 128) so they broadcast against the 4-D arrays.
channel_means = train_X.reshape(-1, 128).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, 128).std(axis=0).reshape(1, 1, 1, -1)
# The training statistics are applied to both splits.
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# This pipeline is handed to the training AudioDataset only; the validation
# dataset is built with transform=None. ToTensor() comes last because
# AudioDataset.__getitem__ calls tensor methods (.permute) on the result.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Training samples are augmented; validation samples are returned as stored.
train_dataset = AudioDataset(torch.Tensor(train_X),
                             torch.Tensor(train_y),
                             torch.Tensor(train_y_mask),
                             albumentations_transform)
valid_dataset = AudioDataset(torch.Tensor(valid_X),
                             torch.Tensor(valid_y),
                             torch.Tensor(valid_y_mask),
                             None)

# Batch size 64 everywhere. Two independently shuffled loaders over the same
# training dataset are zipped in the training loop to provide the pairs of
# batches that are blended by mixup.
val_loader = DataLoader(valid_dataset, 64, shuffle=False)
train_loader_1 = DataLoader(train_dataset, 64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, 64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU. Set `cuda = False` by
# hand to force CPU execution.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` if it offers a ``reset_parameters`` method.

    Objects without that attribute are left untouched, which makes the
    function suitable as a callback for ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Classifier built on the MobileNetV2 feature extractor.

    A small stack of 1x1 convolutions maps the single input channel to three
    channels, the result goes through ``mobilenet_v2(...).features`` and is
    max-pooled over its last two axes, and a two-layer head produces
    ``num_classes`` logits.

    Feature blocks whose position is greater than the module-level ``until_x``
    are re-initialised with ``weight_reset``; the remaining blocks keep the
    weights loaded via ``pretrained=True``.
    """

    def __init__(self, num_classes):

        super().__init__()

        # 1 channel -> 10 -> 3 channels via 1x1 convolutions
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # head applied to the 1280 pooled features
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Reset every feature block located after position `until_x`
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch ``x`` of single-channel inputs."""
        features = self.mv2.features(self.bw2col(x))
        # maximum over the last axis, then over the (new) last axis
        pooled, _ = features.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 is num_classes, i.e. the width of the final linear layer.
# NOTE(review): presumably the number of target columns returned by
# prepare_data - confirm it matches train_y.shape[1].
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss at the end of every epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them by the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def _masked_annotation_loss(outputs, labels, masks):
    """Masked BCE loss summed over the last axis of ``labels`` / ``masks``.

    For every slice ``k`` of the last axis the element-wise ``criterion`` loss
    between ``outputs`` and ``labels[:, :, k]`` is multiplied by
    ``masks[:, :, k]`` and summed; the total is divided by ``masks.sum()``.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


epochs = 100
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training phase ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # one blending coefficient per sample, drawn from Beta(alpha, alpha)
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        # reshaped to broadcast over the 4-D inputs
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        # reshaped to broadcast over the 3-D labels and masks
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = _masked_annotation_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
            this_epoch_train_loss += loss.detach().cpu().numpy()

    # ---- validation phase (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    for inputs, labels, masks in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        with torch.set_grad_enabled(False):
            outputs = model(inputs)
            loss = _masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.detach().cpu().numpy()

    # average of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # log before the early-stopping check so the last epoch is reported too
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # checkpoint whenever the validation loss reaches a new minimum
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # early stopping: give up after 25 epochs without a new best validation loss
    if epochs_without_new_lowest >= 25:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6070826254986428 0.4663582018443516
Epoch:  1


0.2751658413861249 0.19002152979373932
Epoch:  2


0.17858856473420118 0.288054496049881
Epoch:  3


0.1706526605664073 0.14832401062761033
Epoch:  4


0.1668559123535414 0.168265306523868
Epoch:  5


0.16334595108354413 0.13993373193911143
Epoch:  6


0.1593118672435348 0.1417429425886699
Epoch:  7


0.15888946885998184 0.14100143313407898
Epoch:  8


0.1574503247802322 0.13794164040258952
Epoch:  9


0.15699178826164556 0.1364193090370723
Epoch:  10


0.1555783072033444 0.13719814590045384
Epoch:  11


0.15429057946076263 0.13348260202578136
Epoch:  12


0.15500928421278257 0.1300088827099119
Epoch:  13


0.15260067743224068 0.13153447530099324
Epoch:  14


0.1537678161182919 0.13303904873984201
Epoch:  15


0.15533196322015813 0.12900014328105108
Epoch:  16


0.15320710112919678 0.13346176807369506
Epoch:  17


0.1519820287420943 0.12915427876370295
Epoch:  18


0.1515971003351985 0.13632505387067795
Epoch:  19


0.15240691682776888 0.12821269461086818
Epoch:  20


0.151743602108311 0.1324930829661233
Epoch:  21


0.1514157338722332 0.13503957114049367
Epoch:  22


0.15095990292123845 0.12850588772978103
Epoch:  23


0.15018102848852002 0.12872332973139627
Epoch:  24


0.14860548924755407 0.12839898041316442
Epoch:  25


0.14954269657263886 0.1277999452182225
Epoch:  26


0.14888432903869733 0.12767193147114345
Epoch:  27


0.14926611209237897 0.1271232377205576
Epoch:  28


0.14887179877306964 0.12913372154746736
Epoch:  29


0.14933220800515767 0.13082031267029898
Epoch:  30


0.1475204358229766 0.12478518273149218
Epoch:  31


0.1480377892384658 0.12820036283561162
Epoch:  32


0.14835664670209628 0.12560720635311945
Epoch:  33


0.1480924213254774 0.12671470854963576
Epoch:  34


0.14772635053943944 0.12855521270206996
Epoch:  35


0.1478628657154135 0.13125972343342646
Epoch:  36


0.14805477978409948 0.13002042578799383
Epoch    36: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  37


0.14700805657618754 0.12321562319993973
Epoch:  38


0.14451895452834465 0.12322516632931572
Epoch:  39


0.14548168311247955 0.12273134078298296
Epoch:  40


0.14500934046667976 0.15457419838224137
Epoch:  41


0.14486965537071228 0.12321502715349197
Epoch:  42


0.14475342228605942 0.12226488121918269
Epoch:  43


0.14305057235666224 0.1230512729712895
Epoch:  44


0.14449847308365074 0.1229344510606357
Epoch:  45


0.1435493029452659 0.12262175870793206
Epoch:  46


0.14401201460812543 0.12251173917736326
Epoch:  47


0.14295580540154432 0.12347159002508436
Epoch:  48


0.14383745555942123 0.12242247377123151
Epoch    48: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  49


0.1434764511681892 0.12223014661243983
Epoch:  50


0.14347150841274778 0.12229809590748378
Epoch:  51


0.14299060605667732 0.12252513745001384
Epoch:  52


0.14414493256324046 0.12233661221606391
Epoch:  53


0.14255615021731402 0.12212831846305303
Epoch:  54


0.14276170569497185 0.1224666993532862
Epoch:  55


0.14406629067820473 0.12235587622438159
Epoch:  56


0.14320231692211047 0.12217523051159722
Epoch:  57


0.14316202096036962 0.12221380855355944
Epoch:  58


0.1430883246499139 0.1220736067209925
Epoch:  59


0.14296185366205266 0.12195550331047603
Epoch:  60


0.14349739213247556 0.12249096376555306
Epoch:  61


0.1437071549731332 0.12246733052389962
Epoch:  62


0.14333803629552996 0.12212591511862618
Epoch:  63


0.14270643104572553 0.12232888489961624
Epoch:  64


0.14226444146117648 0.12212376935141427
Epoch:  65


0.14267134022068334 0.12230879706995827
Epoch    65: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  66


0.1431559743107976 0.12213266108717237
Epoch:  67


0.14261760018967293 0.12229401511805398
Epoch:  68


0.14360904975517377 0.12242122845990318
Epoch:  69


0.1428358699824359 0.1224610305258206
Epoch:  70


0.1427748469082085 0.12226647990090507
Epoch:  71


0.14336335981214368 0.12207782162087304
Epoch    71: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  72


0.14328381660822276 0.1220590855394091
Epoch:  73


0.14205116959842476 0.12233869305678777
Epoch:  74


0.14285049204890793 0.12227024883031845
Epoch:  75


0.14271243520685145 0.1223902063710349
Epoch:  76


0.14324535308657466 0.12228540224688393
Epoch:  77


0.1423497932988244 0.1222534818308694
Epoch    77: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  78


0.14337347850606247 0.12218176254204341
Epoch:  79


0.14321371469948743 0.12212901668889183
Epoch:  80


0.14331566844437574 0.12209608831575938
Epoch:  81


0.1418665172280492 0.12224306379045759
Epoch:  82


0.1436854226363672 0.12223469891718455
Epoch:  83


0.14205695084623388 0.12222004788262504
Epoch:  84
