In [1]:
# Parameters
# Index of the last MobileNetV2 feature block that keeps its pretrained
# weights: Task5Model re-initialises every block whose index is > until_x.
# The value is also embedded in the checkpoint file name written during training.
until_x = 13


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec',
                 n_frames=635,
                 n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column; all remaining columns are label columns.
        Every recording needs at least ``n_annotations`` rows (extra rows are
        ignored, fewer rows make the final concatenation fail).
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of "known"
        label columns that must be masked out whenever that unknown label is
        set (> 0.5). The unknown columns themselves are removed from the
        targets.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotations : int, optional
        Number of annotation rows per recording stacked along the last axis.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``, assuming every ``.npy`` file
        holds a 2-D array with at least ``n_frames`` columns.
    y : numpy.ndarray
        Shape ``(n_files, n_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where a target takes part in the loss, 0 where
        it is hidden by an "unknown" annotation.
    """
    # Number the annotation rows of each recording 1, 2, 3, ... so the k-th
    # annotation of every file can be selected with a single index value.
    # reset_index() returns a new frame, so the caller's df is not modified.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Separate the "unknown" indicator columns from the real targets.
    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from an all-ones mask and zero the known labels of every
    # annotation row whose corresponding unknown label is set.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first and sort, so that selecting slno == k
    # yields the recordings in the same (sorted) order for every k.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.concatenate(
        [df.loc[[k], :].values[..., np.newaxis] for k in annotation_ids],
        axis=2)
    y_mask = np.concatenate(
        [mask_df.loc[[k], :].values[..., np.newaxis] for k in annotation_ids],
        axis=2)

    # Load the arrays in the same order as the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :],
            axis=0)
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (constructed with its default arguments).
# AudioDataset.__getitem__ looks this name up at module level and applies it
# after the albumentations transforms.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with optional on-the-fly augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis. The augmentation branch indexes
        each sample as ``sample[:, i:, :]``, so a sample must be 3-D there.
    y : torch.Tensor
        Targets, indexed along the first axis.
    weights : torch.Tensor
        Per-target weights / masks, indexed along the first axis.
    transform : callable or None
        Called as ``transform(image=array)`` and expected to return a mapping
        with an ``'image'`` entry. When falsy, samples are returned untouched.

    Notes
    -----
    The augmentation branch relies on the module-level ``random_erasing``
    callable being defined.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converter used to hand samples to the image-based transforms.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting ``sample``
        when a transform is configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would otherwise be divided by zero and
                # push NaNs through the whole augmentation pipeline.
                value_range = 1.0
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on NumPy images,
            # hence the round trip through a PIL image)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code -- only open metadata
# files that come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the fine labels whose fine_id is 'X' (the "unknown"
# ones, going by the variable naming) and all the others.
taxonomy_df = metadata['taxonomy_df']
unknown_fine = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# Pair every unknown fine label with the known fine labels that share its
# coarse class; the merge suffixes the two 'fine' columns with _x / _y.
fine_pairs = pd.merge(unknown_fine, known_fine, on='coarse', how='inner')
fine_pairs = fine_pairs.drop(columns='coarse')
unknown_to_known = {
    unknown: list(known)
    for unknown, known in fine_pairs.groupby('fine_x')['fine_y']
}
known_labels = known_fine.fine.tolist()

# Coarse and fine annotations side by side, one table per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any annotation row whose values sum to 37 is zeroed out (presumably a row
# with every one of the 37 label columns set -- TODO confirm).
for labels_df in (train_df, valid_df):
    bad_rows = labels_df.sum(axis=1) == 37
    labels_df.loc[bad_rows, :] = 0

In [6]:
# Build the input, target and loss-mask arrays for both splits.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean and standard deviation are computed per channel (last axis) on the
# training set only and then applied to both splits.
# NOTE: this cell overwrites train_X / valid_X; re-running it without first
# re-running the data preparation cell normalises the data a second time.
n_channels = train_X.shape[-1]  # derived from the data instead of a hard-coded 128
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in order: a small random shift / scale / rotation, a grid
# distortion, and finally the conversion to a torch tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the same training set: the training
# loop zips them together to obtain the pairs of batches it mixes up.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` if it provides ``reset_parameters``.

    Objects without that attribute are left untouched, which makes the
    function safe to pass to ``Module.apply``.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        return
    reset()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor with a small input adapter and a
    two-layer classification head.

    The constructor reads the module-level ``until_x``: MobileNetV2 feature
    blocks with an index greater than ``until_x`` are re-initialised through
    ``weight_reset``, the earlier ones keep their pretrained weights.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1x1 convolutions turning the single input channel into the three
        # channels that are fed to the MobileNetV2 feature extractor.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classification head on top of the 1280 pooled feature channels.
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Keep the pretrained weights of blocks 0..until_x and reset the rest.
        for index, block in enumerate(self.mv2.features.children()):
            if index <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch of single-channel inputs."""
        features = self.mv2.features(self.bw2col(x))
        # Global max pooling: reduce the last axis, then the new last axis.
        pooled = features.max(dim=-1)[0].max(dim=-1)[0]
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 output logits -- presumably one per label column that prepare_data keeps
# (TODO confirm against train_y.shape[1]).
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# The training loop steps the scheduler with the epoch's validation loss.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    patience=5,
    verbose=True,
)
# reduction='none' keeps the element-wise losses so that the training loop
# can multiply them by the masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1               # Beta(alpha, alpha) parameter of the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked loss averaged over all annotation sets.

    ``labels`` and ``masks`` carry one slice per annotation set along their
    last axis. The element-wise ``criterion`` loss of ``outputs`` against each
    slice is weighted by the matching mask slice, summed, and divided by the
    total mask weight.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One mixing weight per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Log before the early-stopping check so the final epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.615020173626977 0.49450914348874775
Epoch:  1


0.28965021589317835 0.19518407114914485
Epoch:  2


0.192117042235426 0.1770170501300267
Epoch:  3


0.18537734368362943 0.18136226492268698
Epoch:  4


0.18463657595015862 0.17311965780598776
Epoch:  5


0.184344507552482 0.16885788525853837
Epoch:  6


0.1824534829403903 0.17213848658970424
Epoch:  7


0.18416390467334437 0.17756982999188559
Epoch:  8


0.1829594426058434 0.17796415942055838
Epoch:  9


0.18099244341656967 0.17212438796247756
Epoch:  10


0.18012176051333145 0.1683175883122853
Epoch:  11


0.1797209229018237 0.1670752934047154
Epoch:  12


0.17749726409847671 0.17316881460802896
Epoch:  13


0.17803314287920255 0.1612183983836855
Epoch:  14


0.17573757872388168 0.15807425762925828
Epoch:  15


0.17417176309469584 0.15703896113804408
Epoch:  16


0.17442894586034724 0.1572046194757734
Epoch:  17


0.1726866006045728 0.15353053276027953
Epoch:  18


0.17162136733531952 0.1641957951443536
Epoch:  19


0.16655686457414884 0.14626929376806533
Epoch:  20


0.164685863900829 0.14373102145535605
Epoch:  21


0.16395353626560522 0.17529331360544478
Epoch:  22


0.16333484045557073 0.1407268749816077
Epoch:  23


0.1614929087258674 0.1445963829755783
Epoch:  24


0.1583389881494883 0.13484524190425873
Epoch:  25


0.1581041031592601 0.13849833075489318
Epoch:  26


0.1566074200578638 0.13106560919966018
Epoch:  27


0.15655909196750536 0.1322087879691805
Epoch:  28


0.15564594880954638 0.13833026268652507
Epoch:  29


0.15482026459397497 0.1309710687824658
Epoch:  30


0.15374095899027748 0.130068234034947
Epoch:  31


0.1537903349947285 0.12686470470258168
Epoch:  32


0.15338917036314267 0.13321063028914587
Epoch:  33


0.15234548860305064 0.17149670634950911
Epoch:  34


0.15125262576180534 0.12731404602527618
Epoch:  35


0.15228524240287575 0.13009807573897497
Epoch:  36


0.15030433157005826 0.12875180797917501
Epoch:  37


0.15112775685013952 0.12875971517392568
Epoch    37: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  38


0.15056232423395724 0.1248797955257552
Epoch:  39


0.14995745146596753 0.12402555878673281
Epoch:  40


0.14827458641013583 0.1238683249269213
Epoch:  41


0.14809965483240178 0.12380025748695646
Epoch:  42


0.14890976330718478 0.12369836334671293
Epoch:  43


0.14790886278088028 0.1230570205620357
Epoch:  44


0.1474640349278579 0.12323627833809171
Epoch:  45


0.14766459610011126 0.12392978370189667
Epoch:  46


0.14655670241729632 0.122772430734975
Epoch:  47


0.1475597559600263 0.12322080880403519
Epoch:  48


0.1463564106741467 0.12328779910291944
Epoch:  49


0.14648169077731468 0.12332899229867118
Epoch:  50


0.1467036089381656 0.12317722397191185
Epoch:  51


0.14767003381574476 0.123350661780153
Epoch:  52


0.14700065553188324 0.12337743278060641
Epoch    52: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  53


0.14610326612317884 0.12318983141865049
Epoch:  54


0.1476119003585867 0.12299344156469617
Epoch:  55


0.1460819735720351 0.12272761123520988
Epoch:  56


0.14792351626061104 0.12308716028928757
Epoch:  57


0.14612644346984657 0.12293360701629094
Epoch:  58


0.14664438568256996 0.12291176510708672
Epoch:  59


0.14619185433194443 0.12295099028519221
Epoch:  60


0.1463110233480866 0.12237179385764259
Epoch:  61


0.1455483166752635 0.12293841583388192
Epoch:  62


0.14617411913098516 0.12289237124579293
Epoch:  63


0.1459446819247426 0.12317633841718946
Epoch:  64


0.14524680053865588 0.12273790261575154
Epoch:  65


0.14622124989290494 0.12285319396427699
Epoch:  66


0.14586590029097893 0.12309856712818146
Epoch    66: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  67


0.14602927420590375 0.12286150455474854
Epoch:  68


0.14572357970315056 0.12301839994532722
Epoch:  69


0.14570379136381922 0.12289239040442876
Epoch:  70


0.145417155446233 0.12280856498650142
Epoch:  71


0.14631160650704358 0.12303291686943599
Epoch:  72


0.14615424178742073 0.1226438922541482
Epoch    72: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  73


0.14538936115599968 0.12285795382090978
Epoch:  74


0.14558974834712776 0.12318607100418635
Epoch:  75


0.1458087370202348 0.12312656215259007
Epoch:  76


0.14660153719219002 0.12274894756930214
Epoch:  77


0.1463318048296748 0.12290484351771218
Epoch:  78


0.14661929655719447 0.12299681667770658
Epoch    78: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  79


0.1461646291049751 0.122909135052136
Epoch:  80


0.14587507215706078 0.1227884516119957
Epoch:  81


0.1457129712845828 0.1228704686675753
Epoch:  82


0.14570150101507032 0.12291823966162545
Epoch:  83


0.14636910767168612 0.12294187503201622
Epoch:  84


0.14643801426565325 0.12300595108951841
Epoch:  85
