In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known, path_template='../../data/logmelspec/{}.npy',
                 n_frames=635, n_annotators=3):
    """Build model inputs, per-annotator targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column (e.g. the index is named
        ``audio_filename``); the remaining columns are label columns.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps an "unknown" label column to the list of known label columns
        it makes ambiguous. The unknown columns are removed from the
        targets; wherever an unknown column is > 0.5, the mask of the
        associated known columns is set to 0 for that annotation.
    path_template : str, optional
        ``str.format`` template turning an audio filename into the path of
        its saved ``.npy`` spectrogram.
    n_frames : int, optional
        Number of leading rows kept from each transposed spectrogram.
    n_annotators : int, optional
        Number of annotations per file stacked into the last axis of ``y``
        and ``y_mask`` (annotations 1..n_annotators in row order).

    Returns
    -------
    X : numpy.ndarray, shape (n_files, 1, n_frames, n_bins)
    y : numpy.ndarray, shape (n_files, n_labels, n_annotators)
    y_mask : numpy.ndarray, same shape as ``y``; 1 = use in loss, 0 = ignore

    Raises
    ------
    ValueError
        From numpy, if the per-annotator slices (or the spectrograms) do not
        all have the same shape, e.g. a file has fewer than ``n_annotators``
        annotations.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number each file's annotations 1, 2, ... in their row order and index
    # the table by (file, annotation number).
    df = df.reset_index()
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # The "unknown" columns only drive the mask; they are not targets.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so each annotator's rows form one
    # contiguous slice, sorted by filename (identical order in every slice).
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotators = range(1, n_annotators + 1)
    y = np.stack([df.loc[[k], :].values for k in annotators], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in annotators], axis=2)

    # One spectrogram per file, loaded in the same order as the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load(path_template.format(name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)  # add the channel axis

    return X, y, y_mask


# Module-level RandomErasing instance (default constructor arguments);
# AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples.

    Parameters
    ----------
    X, y, weights : tensors indexed along their first axis
        ``X[idx]`` is returned (possibly augmented) together with ``y[idx]``
        and ``weights[idx]``.
    transform : callable or None
        Called as ``transform(image=array)`` and expected to return a mapping
        whose ``'image'`` entry is a 2-D tensor. When falsy, samples are
        returned untouched.
    erasing : callable or None
        Applied to the augmented tensor as the last augmentation step. When
        ``None`` the module-level ``random_erasing`` object is used.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation; a constant sample has zero range, so
            # divide by 1 instead to avoid filling the sample with NaNs
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            divisor = value_range if value_range > 0 else torch.ones_like(value_range)
            sample = (sample - this_min) / divisor

            # randomly cycle the file: element i along axis 1 becomes element 0
            i = int(np.random.randint(sample.shape[1]))
            sample = torch.roll(sample, shifts=-i, dims=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)['image']
            # NOTE(review): the permute presumably undoes an axis swap made by
            # the transform's tensor conversion — confirm against the transform
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (on a copy, not on the permuted view)
            erase = self.erasing if self.erasing is not None else random_erasing
            sample = erase(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code; only open a trusted metadata.pkl.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into "unknown" fine labels (fine_id == 'X') and known ones.
taxonomy_df = metadata['taxonomy_df']
unknown_fine = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# For every unknown fine label, list the known fine labels that share its
# coarse class (merge suffixes: fine_x = unknown label, fine_y = known label).
paired = pd.merge(unknown_fine, known_fine, on='coarse', how='inner').drop(columns='coarse')
unknown_to_known = paired.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_fine.fine.tolist()

# Coarse and fine annotations side by side, one column per label.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Any annotation row whose label values sum to 37 is zeroed out in place
# (presumably 37 = every label column set at once — TODO confirm).
for annotations in (train_df, valid_df):
    flagged_rows = annotations.sum(axis=1) == 37
    annotations.loc[flagged_rows, :] = 0

In [5]:
# Turn each annotation table into (inputs, per-annotator targets, loss masks).
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics are computed per position of the last axis (128 entries) on the
# training set only, then applied to both splits.
train_flat = train_X.reshape(-1, 128)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X, valid_X = [
    (split - channel_means) / channel_stds
    for split in (train_X, valid_X)]

In [7]:
# Define the data augmentation transformations
# Applied by AudioDataset to the training samples only: a small random
# shift/scale/rotation, then a grid distortion, then conversion to a tensor
# (AudioDataset calls .permute on the resulting 'image' entry).
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
BATCH_SIZE = 64

# Only the training set is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    torch.Tensor(train_X), torch.Tensor(train_y), torch.Tensor(train_y_mask),
    albumentations_transform)
valid_dataset = AudioDataset(
    torch.Tensor(valid_X), torch.Tensor(valid_y), torch.Tensor(valid_y_mask),
    None)

val_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Two independently shuffled views of the same training set; the training
# loop zips them to get the pairs of batches it mixes together.
train_loader_1 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is usable, otherwise fall back to the CPU so the
# notebook still runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Call ``layer.reset_parameters()`` when the layer defines it.

    Objects without a ``reset_parameters`` attribute are left untouched, so
    the function can be handed to ``Module.apply`` for a whole sub-tree.
    Returns ``None``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default for the run parameter `until_x`; the "Parameters" cell below
# overrides it. Task5Model reads this global to decide how many leading
# MobileNetV2 feature blocks get re-initialised.
until_x = None

In [12]:
# Parameters
# NOTE(review): looks like an injected parameters cell (e.g. papermill) — confirm.
# Feature blocks with index <= until_x are reset in Task5Model; the value also
# ends up in the checkpoint filename written by the training loop.
until_x = 1


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor with a 1->3 channel adapter and an MLP head.

    Parameters
    ----------
    num_classes : int
        Number of output logits.

    Notes
    -----
    * The module-level ``until_x`` is read at construction time: every child
      of ``mv2.features`` with index ``<= until_x`` is re-initialised through
      ``weight_reset`` (i.e. loses its pretrained weights). When ``until_x``
      is ``None`` nothing is reset.
    * ``forward`` returns raw logits; no sigmoid is applied here.
    """

    def __init__(self, num_classes):

        super().__init__()

        # Learned mapping from the single input channel to the 3 channels
        # MobileNetV2 expects, using 1x1 convolutions only.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        # Only mv2.features is used in forward().
        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2. The None check keeps construction from
        # failing on `i <= None` when the parameter was never set.
        if until_x is not None:
            for i, x in enumerate(self.mv2.features.children()):
                if i <= until_x:
                    x.apply(weight_reset)

    def forward(self, x):
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # global max pooling over the last two axes -> (batch, channels)
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits — presumably one per label column left in the targets after
# prepare_data drops the "unknown" columns; TODO confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped once per epoch with the validation loss in the training loop.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise loss so the training loop can weight
# it with the per-label masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def masked_bce_loss(outputs, labels, masks):
    """Masked loss of ``outputs`` against every annotator's labels.

    ``labels`` and ``masks`` hold one slice per annotator along their last
    axis. The element-wise ``criterion`` loss against each slice is weighted
    by the matching mask slice; the weighted losses are summed and divided by
    the total mask weight.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
alpha = 1  # mixup coefficients are drawn from Beta(alpha, alpha)
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # one coefficient per example, broadcast over the remaining axes
        mixup_vals = torch.Tensor(np.random.beta(alpha, alpha, i1[0].shape[0]))

        lam = mixup_vals.view(-1, 1, 1, 1)
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = mixup_vals.view(-1, 1, 1)
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no gradients, no optimizer involvement) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            loss = masked_bce_loss(model(inputs), labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are means of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping after 25 epochs without improvement.
    if epochs_without_new_lowest >= 25:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6379383077492585 0.5168945108141217
Epoch:  1


0.3372432299562403 0.2025319061108998
Epoch:  2


0.18730094183135676 0.16671061941555568
Epoch:  3


0.17408121316819578 0.15653208536761148
Epoch:  4


0.16977535429838542 0.14965537190437317
Epoch:  5


0.16543902980314718 0.1514736350093569
Epoch:  6


0.16503890503097224 0.14434256298201426
Epoch:  7


0.1629505161498044 0.14888704461710794
Epoch:  8


0.1609924777939513 0.1545862023319517
Epoch:  9


0.1618845289623415 0.14748692086764745
Epoch:  10


0.16017943258221085 0.14399041341883795
Epoch:  11


0.15970612498554024 0.1445723388876234
Epoch:  12


0.15912927203887217 0.1459480162177767
Epoch:  13


0.15936066894917875 0.1365204836641039
Epoch:  14


0.15736893302685506 0.14356527690376555
Epoch:  15


0.15645283500890475 0.1360301109296935
Epoch:  16


0.1570948124737353 0.1357761855636324
Epoch:  17


0.15557176680178256 0.1325812871967043
Epoch:  18


0.1560083144419902 0.13611431952033723
Epoch:  19


0.15494367480278015 0.13512432043041503
Epoch:  20


0.15379132773425128 0.1346398549420493
Epoch:  21


0.15435151030888428 0.13700782294784272
Epoch:  22


0.15558298415428884 0.1334313154220581
Epoch:  23


0.15416869118406967 0.1304343044757843
Epoch:  24


0.15292392993295514 0.13037068503243582
Epoch:  25


0.1527107414361593 0.12947715818881989
Epoch:  26


0.15261962727920428 0.12920477028403962
Epoch:  27


0.15280482938160767 0.13146258677755082
Epoch:  28


0.1513364516400002 0.12970881589821406
Epoch:  29


0.15145277211794983 0.12887737048523767
Epoch:  30


0.1506083076064651 0.12684870937040874
Epoch:  31


0.15109989933065465 0.1309456495302064
Epoch:  32


0.15029048073936152 0.1276014479143279
Epoch:  33


0.1513171743702244 0.127744055220059
Epoch:  34


0.1496812366955989 0.1263384148478508
Epoch:  35


0.14889773202909007 0.12648348616702215
Epoch:  36


0.14971549003510862 0.12809773002352035
Epoch:  37


0.14930265618337168 0.12666614672967366
Epoch:  38


0.14982968367434837 0.1263521984219551
Epoch:  39


0.14793496558795105 0.13165871373244695
Epoch:  40


0.1484778893960489 0.12559861264058522
Epoch:  41


0.14730772456607302 0.13272302065576827
Epoch:  42


0.14870871603488922 0.12541667904172624
Epoch:  43


0.14794352570095579 0.13276588171720505
Epoch:  44


0.1480004388738323 0.13372683418648584
Epoch:  45


0.148866612363506 0.1279608788234847
Epoch:  46


0.14726406698291367 0.12718957769019262
Epoch:  47


0.14750901388155446 0.12730610689946584
Epoch:  48


0.14879477144898595 0.12686976790428162
Epoch    48: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  49


0.14505844945843155 0.12349775859287807
Epoch:  50


0.14435281221931046 0.12300180111612592
Epoch:  51


0.14425521323809753 0.12303379390920911
Epoch:  52


0.14250417979987892 0.1222817844578198
Epoch:  53


0.1425533105392714 0.1229389128940446
Epoch:  54


0.14348465808340022 0.12294065845864159
Epoch:  55


0.1438879137103622 0.12266710719891957
Epoch:  56


0.14426673062749812 0.1222126984170505
Epoch:  57


0.14264421688543782 0.12267801804201943
Epoch:  58


0.14288888106475006 0.12314900330134801
Epoch:  59


0.14196613229609825 0.1220503853900092
Epoch:  60


0.14224018599535967 0.12287360855511256
Epoch:  61


0.14236388576997294 0.12275708253894534
Epoch:  62


0.1425674058295585 0.12245071147169385
Epoch:  63


0.1421239883513064 0.12274347352130073
Epoch:  64


0.14162561377963503 0.1222717719418662
Epoch:  65


0.14155151594329524 0.12284660977976662
Epoch    65: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  66


0.14197647168829636 0.12280047578471047
Epoch:  67


0.14110926357475487 0.12242470043046134
Epoch:  68


0.141750791185611 0.1226590946316719
Epoch:  69


0.14145227139060562 0.1224477110164506
Epoch:  70


0.14237213940233798 0.12267618094171796
Epoch:  71


0.1417347108190124 0.12282555869647435
Epoch    71: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  72


0.14116604585905332 0.12266410780804497
Epoch:  73


0.14124009536730275 0.12251365823405129
Epoch:  74


0.1412611768857853 0.12246036955288478
Epoch:  75


0.14167339254069972 0.12312463990279607
Epoch:  76


0.14128569533696045 0.1226282971245902
Epoch:  77


0.140108221286052 0.12255423835345677
Epoch    77: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  78


0.1412569699255196 0.12302039457218987
Epoch:  79


0.140147785479958 0.1227372373853411
Epoch:  80


0.14144598189237956 0.12282935529947281
Epoch:  81


0.14284440311225685 0.12230528146028519
Epoch:  82


0.14131049288285746 0.12274304351636342
Epoch:  83


0.14045577818477475 0.1229611486196518
Epoch    83: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  84
