In [1]:
# Parameters
# until_x: index of the last child of the MobileNetV2 `features` stack whose
# weights are re-initialised in Task5Model (children 0..until_x are reset).
# It is also used as the suffix of the checkpoint filename written during
# training.
until_x = 1


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (recording, annotation). The index must be named
        ``audio_filename``; rows sharing a filename are numbered 1, 2, ... in
        their order of appearance. All columns are treated as label columns.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of label columns that are
        masked out (mask value 0) on the rows where that unknown column is
        greater than 0.5. The unknown columns themselves are removed from the
        targets.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each array after it is transposed.
        Every array must provide at least this many rows.
    n_annotations : int, optional
        Number of annotations per recording stacked on the last axis of the
        targets. Annotations beyond this count are ignored.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_recordings, 1, n_frames, n_features)``.
    y : numpy.ndarray
        Shape ``(n_recordings, n_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target counts, 0 where it is ignored.

    Raises
    ------
    ValueError
        If any recording has fewer than ``n_annotations`` rows.
    """
    df = df.reset_index()
    # Number the annotations of every recording 1, 2, ... in order of appearance.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1

    # Stacking below relies on every recording owning annotations 1..n_annotations;
    # otherwise the per-annotation slices would have different lengths.
    annotations_per_file = df.groupby('audio_filename')['slno'].max()
    incomplete = annotations_per_file[annotations_per_file < n_annotations]
    if len(incomplete) > 0:
        raise ValueError(
            'Expected at least {} annotations per recording; too few for: {}'
            .format(n_annotations, incomplete.index.tolist()))

    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from "everything counts" and switch off the known labels that are
    # ambiguous wherever the corresponding unknown label was annotated.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that .loc[[k]] selects the k-th
    # annotation of every recording, with recordings in sorted filename order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.concatenate(
        [df.loc[[k], :].values[..., np.newaxis] for k in annotation_ids],
        axis=2)
    y_mask = np.concatenate(
        [mask_df.loc[[k], :].values[..., np.newaxis] for k in annotation_ids],
        axis=2)

    # Load the arrays in the same recording order as the rows of y.
    filenames = (df.loc[[1], :]
                 .index.get_level_values('audio_filename').tolist())
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance, built with its default arguments.
# AudioDataset.__getitem__ looks this name up at module level and applies it
# to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (sample, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Samples, indexed along the first axis. When ``transform`` is set each
        sample must be a 3-D tensor (it is sliced as ``sample[:, i:, :]``).
    y, weights :
        Targets and per-target weights, indexed along the first axis in the
        same order as ``X`` and returned unchanged.
    transform : callable, optional
        Called as ``transform(image=array)`` and expected to return a mapping
        with an ``'image'`` entry. When falsy, samples are returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample
        when a transform was supplied."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation to [0, 1]
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole sample into NaN. Use a unit range so it stays constant.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms on an image-array view
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # Re-add the leading axis and swap the last two axes.
            # NOTE(review): presumably the transform's tensor conversion
            # returns a 2-D image with its axes swapped — confirm.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level instance)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Re-initialise ``layer`` through its own ``reset_parameters`` method.

    Objects that do not expose ``reset_parameters`` are left untouched, which
    makes this function suitable for use with ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()


class Task5Model(nn.Module):
    """Single-channel input classifier built on torchvision's MobileNetV2.

    Pipeline (see ``forward``):
      1. ``bw2col``: batch norm followed by two 1x1 convolutions that map the
         single input channel to 3 channels.
      2. ``mv2.features``: the feature stack of a MobileNetV2 created with
         ``pretrained=True``; only ``features`` is used in ``forward``.
      3. Global max over the last two axes.
      4. ``final``: Linear(1280, 512) -> ReLU -> BatchNorm1d -> Linear(512,
         num_classes). The output is the raw result of the last linear layer.

    Parameters
    ----------
    num_classes : int
        Size of the output layer.
    reset_until : int, optional
        Children ``0..reset_until`` of ``mv2.features`` get their parameters
        re-initialised via ``weight_reset``. When omitted, the module-level
        ``until_x`` is used; a negative value keeps all loaded weights.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # Fall back to the notebook-level parameter so existing callers that
        # pass only num_classes behave as before.
        if reset_until is None:
            reset_until = until_x

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Reset every child of mv2.features up to and including reset_until
        for i, block in enumerate(self.mv2.features.children()):
            if i > reset_until:
                break
            block.apply(weight_reset)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

    def forward(self, x):
        """Map a ``(batch, 1, H, W)`` input to ``(batch, num_classes)``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # global max pooling: max over the last axis, then over the new last axis
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [4]:
# Load and prepare data
# NOTE: pickle.load executes arbitrary code from the file; only open a
# metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# Taxonomy rows with fine_id == 'X' are the "unknown" fine labels; pair each
# of them with every other fine label that shares its coarse class.
taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]
unknown_known_pairs = pd.merge(
    unknown_rows, known_rows, on='coarse', how='inner'
).drop(columns='coarse')

# {unknown fine label: [known fine labels of the same coarse class]}
unknown_to_known = (
    unknown_known_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict())
known_labels = known_rows.fine.tolist()

# Coarse and fine annotation tables side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point: any row whose values sum to exactly 37
# is overwritten with zeros in every column.
# NOTE(review): 37 is presumably the number of label columns, i.e. a row with
# every label set — confirm.
for annotations in (train_df, valid_df):
    flagged_rows = annotations.sum(axis=1) == 37
    annotations.loc[flagged_rows, :] = 0

In [6]:
# Build the inputs (X), targets (y) and loss masks (y_mask) for both splits.
# prepare_data loads one '<audio_filename>.npy' array per recording from disk,
# so this cell needs the precomputed arrays to be present.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed per position of the last axis, over the training
# split only, and then applied to both splits so validation data never
# influences the normalisation.
# NOTE: this cell overwrites train_X / valid_X, so re-running it without
# re-running the prepare_data cell normalises the data a second time.
n_channels = train_X.shape[-1]
train_X_flat = train_X.reshape(-1, n_channels)
channel_means = train_X_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# The pipeline is passed to the training AudioDataset only, which calls it as
# transform(image=...) and reads the 'image' entry of the result.
albumentations_transform = Compose([
    # random shift / scale / rotation, with the limits given below
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library's default settings
    GridDistortion(),
    # convert the augmented array to a torch tensor
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
batch_size = 96

# Only the training split is augmented; validation samples are served as-is.
train_tensors = [torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask)]
train_dataset = AudioDataset(*train_tensors, albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, None)

val_loader = DataLoader(valid_dataset, batch_size, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that mixup blends together.
train_loader_1 = DataLoader(train_dataset, batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size, shuffle=True)

In [10]:
# Define the device to be used: the first CUDA GPU when one is available,
# otherwise the CPU, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 is passed as num_classes, the size of the model's output layer.
# NOTE(review): presumably this equals the number of label columns kept by
# prepare_data (train_y.shape[1]) — confirm before changing the label set.
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad variant over all model parameters, starting at lr=1e-3.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the epoch's validation loss in the training
# loop; patience=5 is the number of epochs it tolerates before reducing the lr.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
early_stopping_patience = 15  # epochs without a new best validation loss
mixup_alpha = 1               # Beta(alpha, alpha) parameter of the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE loss averaged over all unmasked (label, annotation) entries.

    ``outputs`` is (batch, n_labels); ``labels`` and ``masks`` are
    (batch, n_labels, n_annotations). The same outputs are scored against each
    set of annotations, the element-wise losses are weighted by the matching
    mask slice, and the total is divided by the sum of the masks.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # one blending weight per sample, shared by inputs, labels and masks
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, i1[0].shape[0])

        # reshaped to broadcast over the 4-D inputs
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        # reshaped to broadcast over the 3-D labels and masks
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # epoch losses are the mean of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Log before the early-stopping check so the final epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # keep only the weights of the best epoch so far
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6527768063545227 0.5959884405136109
Epoch:  1


0.4364922976493835 0.31329525709152223
Epoch:  2


0.2222127866744995 0.18642024099826812
Epoch:  3


0.17267000257968904 0.14298724234104157
Epoch:  4


0.16425762832164764 0.20876253843307496
Epoch:  5


0.16084428608417511 0.1393816739320755
Epoch:  6


0.15897611498832703 0.13612416237592698
Epoch:  7


0.15808071792125702 0.13238544762134552
Epoch:  8


0.15616374135017394 0.13452584892511368
Epoch:  9


0.15561886608600617 0.13238695710897447
Epoch:  10


0.1542723435163498 0.13339341580867767
Epoch:  11


0.15460250973701478 0.12997791916131973
Epoch:  12


0.15409177660942078 0.13478495478630065
Epoch:  13


0.1542569774389267 0.12721021771430968
Epoch:  14


0.15130779564380645 0.12993773818016052
Epoch:  15


0.15275776863098145 0.13070557564496993
Epoch:  16


0.1512430065870285 0.13080199658870698
Epoch:  17


0.15056390166282654 0.13153039962053298
Epoch:  18


0.14994892001152038 0.12690470814704896
Epoch:  19


0.1506158411502838 0.13127315789461136
Epoch:  20


0.14982010006904603 0.12586234211921693
Epoch:  21


0.15141784489154816 0.1271948605775833
Epoch:  22


0.1496848577260971 0.12752859890460969
Epoch:  23


0.1482236635684967 0.12606885880231858
Epoch:  24


0.14896449327468872 0.12887126952409744
Epoch:  25


0.14978065013885497 0.1260913461446762
Epoch:  26


0.14936013340950013 0.12839963734149934
Epoch    26: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  27


0.14678397238254548 0.12342554479837417
Epoch:  28


0.14609489142894744 0.1225383996963501
Epoch:  29


0.14638493537902833 0.12222255021333694
Epoch:  30


0.14492917954921722 0.1228392705321312
Epoch:  31


0.14531787157058715 0.12233552634716034
Epoch:  32


0.14498609125614167 0.12200709730386734
Epoch:  33


0.14496105790138245 0.1219923198223114
Epoch:  34


0.14500589668750763 0.12205486595630646
Epoch:  35


0.14563123285770416 0.12180385738611221
Epoch:  36


0.1442861747741699 0.12215707153081894
Epoch:  37


0.1438105148077011 0.12229826748371124
Epoch:  38


0.14521510303020477 0.12212409377098084
Epoch:  39


0.14346540868282318 0.12184568494558334
Epoch:  40


0.14406113922595978 0.12238852977752686
Epoch:  41


0.1422027599811554 0.1221760094165802
Epoch    41: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  42


0.14337196409702302 0.12200997918844222
Epoch:  43


0.1444625598192215 0.12158266007900238
Epoch:  44


0.14281342089176177 0.12179668098688126
Epoch:  45


0.14384787440299987 0.12183951139450074
Epoch:  46


0.1426030731201172 0.12184204161167145
Epoch:  47


0.14311861157417297 0.12187174260616303
Epoch:  48


0.14243949472904205 0.12182490527629852
Epoch:  49


0.14294202625751495 0.12180411964654922
Epoch    49: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  50


0.1431881719827652 0.12189134806394578
Epoch:  51


0.14307063698768616 0.12213281244039535
Epoch:  52


0.1434309470653534 0.12152670472860336
Epoch:  53


0.1436363172531128 0.12172593772411347
Epoch:  54


0.14254827558994293 0.12195030450820923
Epoch:  55


0.14291058242321014 0.12215273082256317
Epoch:  56


0.1430988597869873 0.12166579961776733
Epoch:  57


0.1439825028181076 0.12177879959344864
Epoch:  58


0.14242317736148835 0.12202430665493011
Epoch    58: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  59


0.14349777340888978 0.12166073620319366
Epoch:  60


0.14357424020767212 0.12151083499193191
Epoch:  61


0.1432945066690445 0.12172993421554565
Epoch:  62


0.1428807020187378 0.12189385741949081
Epoch:  63


0.1438635891675949 0.12178675532341003
Epoch:  64


0.143439439535141 0.12167374044656754
Epoch:  65


0.14236221551895142 0.12178486734628677
Epoch:  66


0.14455737233161925 0.12191307246685028
Epoch    66: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  67


0.1436868566274643 0.12189875692129135
Epoch:  68


0.14248355805873872 0.1217454954981804
Epoch:  69


0.14293111085891724 0.1219968095421791
Epoch:  70


0.14334188938140868 0.12177640795707703
Epoch:  71


0.14405757308006287 0.12158931195735931
Epoch:  72


0.14217594802379607 0.12181987464427949
Epoch:  73


0.14274083852767944 0.12202064841985702
Epoch:  74


0.14274995326995848 0.12186417281627655
Epoch:  75
