In [1]:
# Parameters
# Position threshold inside the MobileNetV2 feature stack: Task5Model keeps the
# pretrained weights of the feature blocks at positions 0..until_x and
# re-initialises every block at a position greater than until_x.
# The value is also embedded in the checkpoint filename written by the
# training loop.
until_x = 6


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec',
                 n_frames=635,
                 n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column (i.e. that is the index name or an index
        level); all other columns are label columns. Rows of the same file
        are numbered 1, 2, 3, ... in their order of appearance ("slno").
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known label
        columns it overrides. The unknown columns are removed from the
        targets; wherever an unknown column is > 0.5 for an annotation, the
        mask of the associated known columns is set to 0 for that annotation.
    logmelspec_dir : str, optional
        Directory containing one ``<audio_filename>.npy`` array per file.
        Defaults to the location that used to be hard-coded.
    n_frames : int, optional
        Each loaded array is transposed and cut to at most this many rows
        along its first axis. Files yielding fewer rows make the final
        stacking fail, exactly as before.
    n_annotations : int, optional
        Number of annotations (slno 1..n_annotations) kept per file. Every
        file needs at least that many rows; extra rows are ignored.

    Returns
    -------
    X : numpy.ndarray, shape (n_files, 1, n_frames, n_bins)
    y : numpy.ndarray, shape (n_files, n_known_labels, n_annotations)
    y_mask : numpy.ndarray, same shape as ``y`` (1 = use in loss, 0 = ignore)

    Files appear in sorted ``audio_filename`` order in all three arrays.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the annotations of each file: 1 for its first row, 2 for the next...
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Separate the "unknown" indicator columns from the real targets.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and switch off the known labels that an
    # annotation flagged as "unknown".
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put slno first so that .loc[[k]] selects the k-th annotation of every
    # file, with the files in sorted (and therefore aligned) order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slnos = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in slnos], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in slnos], axis=2)

    # One spectrogram per file, transposed and truncated to n_frames rows.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)  # add the single-channel axis

    return X, y, y_mask


# Module-level augmentation object used by AudioDataset.__getitem__ on every
# transformed sample. It is built with RandomErasing's default arguments; the
# class is imported from a project-local module (presumably found through the
# sys.path.append("..") above), so its defaults are not visible in this notebook.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When a
    ``transform`` is supplied each sample is augmented on the fly (cyclic
    shift, the given transform, then the module-level ``random_erasing``);
    without one the stored sample is returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        # converter used to hand the sample to the image-based transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx, ...]
        target = self.y[idx, ...]
        weight = self.weights[idx, ...]

        # No augmentation requested: hand back the stored item as-is.
        if not self.transform:
            return sample, target, weight

        # Rescale to [0, 1]; the original range is restored at the end.
        lowest = sample.min()
        highest = sample.max()
        value_range = highest - lowest
        sample = (sample - lowest) / value_range

        # Rotate the sample around a random cut point on axis 1.
        cut = np.random.randint(sample.shape[1])
        tail, head = sample[:, cut:, :], sample[:, :cut, :]
        sample = torch.cat([tail, head], dim=1)

        # Run the albumentations pipeline on the image form of the sample.
        image = np.array(self.pil(sample))
        augmented = self.transform(image=image)['image']
        # restore the leading channel axis and swap the last two axes back
        sample = augmented[None, :, :].permute(0, 2, 1)

        # Random erasing operates on a detached copy.
        sample = random_erasing(sample.clone().detach())

        # Undo the [0, 1] rescaling.
        sample = (sample * value_range) + lowest

        return sample, target, weight

In [4]:
# Load and prepare data
# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# run this on a metadata.pkl file you produced yourself or otherwise trust.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map every fine label whose fine_id is 'X' to the list of fine labels with a
# different fine_id that share its coarse class (inner join on 'coarse';
# the merge suffixes the two 'fine' columns as fine_x / fine_y).
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# Fine labels whose fine_id is not 'X'.
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Place the coarse and fine label tables side by side (column-wise concat);
# sort=True sorts the row index when the two inputs are not already aligned.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point:
# any row whose values add up to exactly 37 is overwritten with zeros,
# in both the training and the validation table (modified in place).
for labels_df in (train_df, valid_df):
    rows_to_clear = labels_df.sum(axis=1) == 37
    labels_df.loc[rows_to_clear, :] = 0

In [6]:
# Turn both annotation tables into arrays (see prepare_data):
#   *_X      -> (n_files, 1, frames, bins) spectrograms loaded from disk
#   *_y      -> (n_files, n_labels, 3) one slice per annotation of a file
#   *_y_mask -> same shape as *_y, 0 where a label must be ignored in the loss
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Standardise every position of the last axis (the spectrogram bin axis)
# with a mean / std pooled over all files and frames of the TRAINING data;
# the same statistics are then applied to the validation data.
# The bin count is read from the array rather than hard-coded to 128, so the
# cell keeps working if the spectrograms use a different number of bins.
# NOTE: train_X / valid_X are rebound here, so re-running this cell on its own
# normalises already-normalised data.
n_bins = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_bins).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_bins).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# This pipeline is handed to the training AudioDataset as `transform`; the
# dataset calls it as transform(image=<array>) and reads result['image'].
albumentations_transform = Compose([
    # random shift / scale / rotation with small limits
    # NOTE(review): rotate_limit is presumably expressed in degrees (+/-0.5) — confirm
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library's default settings
    GridDistortion(),
    # final conversion step; AudioDataset treats the result as a torch tensor
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served untouched.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the same training set; the training
# loop zips them to obtain the pairs of batches it mixes together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is actually usable and fall back to the CPU
# otherwise, instead of assuming a GPU is always present.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Call ``layer.reset_parameters()`` if the layer provides that method.

    Objects without a ``reset_parameters`` attribute are left untouched.
    Task5Model passes this function to ``Module.apply`` so it is invoked on
    a module and each of its sub-modules.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Single-channel input -> MobileNetV2 features -> multi-label logits.

    A small stack of 1x1 convolutions expands the 1-channel input to the
    3 channels fed to the MobileNetV2 feature extractor, the resulting
    feature maps are max-pooled over their last two axes, and a two-layer
    head produces ``num_classes`` raw logits (no sigmoid applied here).

    The feature blocks at positions greater than the notebook-level
    ``until_x`` are re-initialised at construction time; earlier blocks keep
    their pretrained weights.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 channel -> 10 -> 3 channels using 1x1 convolutions
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        # pretrained backbone; only its `features` part is used in forward()
        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # classification head applied to the 1280 pooled feature channels
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Reset after ith layer of mv2
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        three_channel = self.bw2col(x)
        feature_maps = self.mv2.features(three_channel)
        # global max pooling: reduce the last axis, then the (new) last axis
        pooled, _ = feature_maps.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 is the number of output logits. The training loop compares the outputs
# with labels[:, :, k], so it has to match the label dimension of train_y.
# NOTE(review): presumably 31 == train_y.shape[1] (label columns left after
# prepare_data drops the "unknown" ones) — confirm before changing the taxonomy.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad option over all parameters of the model.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped once per epoch with the validation loss by the training loop;
# the captured output shows the learning rate being cut by a factor of 10
# each time it triggers. verbose=True makes it announce every reduction.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them by the label masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
# Training with mixup, masked multi-annotator BCE loss, checkpointing on the
# best validation loss and early stopping.
epochs = 100
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

# Beta(alpha, alpha) distribution the mixup weights are drawn from.
alpha = 1


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE summed over all annotations, normalised by the mask total.

    ``outputs`` is (batch, n_labels); ``labels`` and ``masks`` are
    (batch, n_labels, n_annotations). The same logits are scored against
    every annotation slice, each element-wise loss is weighted by its mask
    value, and the grand total is divided by ``masks.sum()``.
    Uses the notebook-level ``criterion`` (element-wise, reduction='none').
    """
    per_annotation = [
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2])]
    return sum(per_annotation) / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---------------- training ----------------
    model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently, so zipping them
    # yields random pairs of equally sized batches to mix.
    for (x1, y1, m1), (x2, y2, m2) in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # one mixing weight per example, shared by input, labels and masks
        mixup_vals = np.random.beta(alpha, alpha, x1.shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * x1) + ((1 - lam) * x2)

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * y1) + ((1 - lam) * y2)
        masks = (lam * m1) + ((1 - lam) * m2)
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
            this_epoch_train_loss += loss.detach().cpu().numpy()

    # ---------------- validation ----------------
    # No gradients are needed here, so there is nothing to zero either.
    model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.detach().cpu().numpy()

    # Average of the per-batch losses (each batch counts equally).
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so that the last epoch's losses
    # are printed too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the weights of the best epoch so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping: give up after 25 epochs without a new best.
    if epochs_without_new_lowest >= 25:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6092059700875669 0.42404247181756155
Epoch:  1


0.2779425934359834 0.19171822922570364
Epoch:  2


0.1796707962010358 0.16220200274671828
Epoch:  3


0.17176646759381164 0.17405991469110763
Epoch:  4


0.168339108695855 0.1493746212550572
Epoch:  5


0.16628657764679677 0.16884573655469076
Epoch:  6


0.163464180923797 0.14391998095171793
Epoch:  7


0.1610351748563148 0.13706089236906596
Epoch:  8


0.1612608855640566 0.1691419460943767
Epoch:  9


0.16030839850773682 0.13230268337896892
Epoch:  10


0.1578779566932369 0.13752453348466329
Epoch:  11


0.15889010155523145 0.13231447764805385
Epoch:  12


0.15666821478186427 0.1288327713097845
Epoch:  13


0.15743572486413493 0.13329715813909257
Epoch:  14


0.1552760842684153 0.1395802412714277
Epoch:  15


0.1539458704961313 0.13516670146158763
Epoch:  16


0.15422242759047328 0.1285086859549795
Epoch:  17


0.1539580282327291 0.1278265118598938
Epoch:  18


0.1542015981835288 0.12883280111210688
Epoch:  19


0.1538502686732524 0.12866159422057016
Epoch:  20


0.15316678100341075 0.13611644825765065
Epoch:  21


0.15265470823726138 0.1297259564910616
Epoch:  22


0.15257439017295837 0.1284483266728265
Epoch:  23


0.15319328896097234 0.1308226010629109
Epoch    23: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  24


0.15031995886080973 0.12399189387048994
Epoch:  25


0.15027080597104253 0.1242076147879873
Epoch:  26


0.14971255329815117 0.12403063156775065
Epoch:  27


0.15056801808846965 0.12410617832626615
Epoch:  28


0.15042340231908335 0.1233523381607873
Epoch:  29


0.1489629556198378 0.12303776293992996
Epoch:  30


0.1491312871913652 0.12297226382153374
Epoch:  31


0.1506628748532888 0.12275490058319909
Epoch:  32


0.14851602750855522 0.12252415184463773
Epoch:  33


0.14901949062540726 0.12286269132580076
Epoch:  34


0.15017485376950857 0.1228426428777831
Epoch:  35


0.1485710583023123 0.12258382673774447
Epoch:  36


0.15042654726956342 0.12287456755127225
Epoch:  37


0.14852455983290802 0.12307318193571908
Epoch:  38


0.14881219815563512 0.12305991564478193
Epoch    38: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  39


0.147658716182451 0.12274864103112902
Epoch:  40


0.14912491996546048 0.12259159130709511
Epoch:  41


0.14829331676702243 0.12282783325229372
Epoch:  42


0.1494148985759632 0.12274920408214841
Epoch:  43


0.14842719081285838 0.12222255659954888
Epoch:  44


0.1482707543953045 0.12249247623341424
Epoch:  45


0.1481546751550726 0.12239227550370353
Epoch:  46


0.14895013578840205 0.1227458609001977
Epoch:  47


0.14928669784520124 0.12226023312125887
Epoch:  48


0.14918280896302816 0.12255626916885376
Epoch:  49


0.1470816429402377 0.1225743932383401
Epoch    49: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  50


0.14825978633519765 0.1219735826764788
Epoch:  51


0.14880084709541216 0.12213263660669327
Epoch:  52


0.14894326474215533 0.12222596683672496
Epoch:  53


0.14699392544256673 0.12214436382055283
Epoch:  54


0.14838384937595678 0.12229624709912709
Epoch:  55


0.1483346392979493 0.12256371123450142
Epoch:  56


0.14740588012579325 0.122163644858769
Epoch    56: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  57


0.14894746163406888 0.1226076175059591
Epoch:  58


0.14867637487682137 0.12196645140647888
Epoch:  59


0.14742690365056735 0.12216434734208244
Epoch:  60


0.14758423172138832 0.1221390717795917
Epoch:  61


0.14796317429155917 0.12201498874596187
Epoch:  62


0.14805960212204908 0.12231652012893132
Epoch    62: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  63


0.1479001955406086 0.12231245849813734
Epoch:  64


0.14718369213310448 0.12216162255832128
Epoch:  65


0.1485231265022948 0.12216104992798396
Epoch:  66


0.14776977252315832 0.12237000678266798
Epoch:  67


0.14801338476103706 0.12210426160267421
Epoch:  68


0.1490399466978537 0.12226561031171254
Epoch:  69


0.14650192292960915 0.1221244867358889
Epoch:  70


0.14721909487569654 0.12228454649448395
Epoch:  71


0.14701387729193713 0.12200580856629781
Epoch:  72


0.14828708969257973 0.12215899463210787
Epoch:  73


0.14838573296327848 0.12236591215644564
Epoch:  74


0.14805897264867215 0.12231214450938362
Epoch:  75


0.14920515103920087 0.12235175392457417
Epoch:  76


0.14841183013207204 0.12214824663741249
Epoch:  77


0.14816795168696223 0.12237056983368737
Epoch:  78


0.14833261797557007 0.12217934748956136
Epoch:  79


0.14837485874021375 0.12211663914578301
Epoch:  80


0.14778323149358905 0.12224296161106654
Epoch:  81


0.14800536330487277 0.12207672851426261
Epoch:  82


0.1479721697601112 0.1222049349120685
Epoch:  83
