In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, per-annotator labels and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table with one row per (audio file, annotation) and one
        column per label. ``df.reset_index()`` must produce an
        ``audio_filename`` column. The rows of each file are numbered
        1, 2, ... in order of appearance; every file needs rows numbered
        1..``n_annotators`` (further rows are ignored). ``df`` itself is not
        modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        it covers. The unknown columns are removed from the labels; wherever
        an unknown column is > 0.5, the mask of its known columns is set to 0
        for that row.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Maximum number of rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotation rows per file stacked into the outputs.

    Returns
    -------
    X : numpy.ndarray
        Stacked transposed arrays with a singleton axis inserted at
        position 1: ``(n_files, 1, frames, bins)``.
    y : numpy.ndarray
        Labels of shape ``(n_files, n_known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label counts towards the loss,
        0 where it was masked by an unknown label.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number each file's rows 1, 2, ... and index by (file, row number).
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start with every known label unmasked, then zero the known labels of
    # each row in which the corresponding unknown label is active.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the row number first so that .loc[[k]] selects the k-th annotation
    # of every file, with the files in the same sorted order for each k.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    slnos = range(1, n_annotators + 1)
    y = np.stack([df.loc[[k], :].values for k in slnos], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in slnos], axis=2)

    # One array per file, loaded in the same file order as the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = X[:, np.newaxis]

    return X, y, y_mask


# Shared random-erasing augmentation, called from AudioDataset.__getitem__
# whenever a transform is configured. It is built with the defaults of the
# project-local RandomErasing class, whose settings are not visible in this
# notebook.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, labels, weights)`` triples.

    Without a ``transform`` the stored sample is returned untouched. With a
    ``transform`` each sample is augmented on the fly:

    1. min-max scaling of the sample to [0, 1],
    2. a random circular shift along dimension 1,
    3. conversion to an image array and the ``transform`` callable
       (called as ``transform(image=array)``, result read from ``['image']``),
    4. random erasing,
    5. the inverse of the min-max scaling from step 1.

    Parameters
    ----------
    X : torch.Tensor
        Samples, indexed along dimension 0. Each sample is sliced as a 3-D
        tensor in the augmentation path.
    y, weights : indexable
        Per-sample targets and weights, indexed along dimension 0.
    transform : callable, optional
        Augmentation pipeline; ``None`` disables all augmentation.
    erasing : callable, optional
        Random-erasing callable applied after ``transform``. When ``None``
        the module-level ``random_erasing`` object is used.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of ``X`` along dimension 0)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the (optionally augmented) sample with its labels and weights."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            span = this_max - this_min
            # A constant sample has zero span; dividing by it would feed NaN
            # into the image conversion. Scale by 1 instead (the sample
            # becomes all zeros); the inverse step below still multiplies by
            # the true span, so such a sample comes back as its constant.
            safe_span = span if span > 0 else torch.ones_like(span)
            sample = (sample - this_min) / safe_span

            # randomly cycle the file
            # NOTE(review): this draws from NumPy's global RNG; if the
            # DataLoader ever uses worker processes, the workers may share
            # the same NumPy random state.
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            # NOTE(review): the round trip through a PIL image presumably
            # quantises the values to 8 bits — confirm if precision matters.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # Add a leading axis and swap the last two axes of the tensor
            # returned by the transform.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            erase = self.erasing if self.erasing is not None else random_erasing
            sample = erase(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * span) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load a metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map every "unknown" fine label (fine_id == 'X') to the list of known fine
# labels (fine_id != 'X') that share its coarse category. Both merge inputs
# carry a `fine` column, so after the merge the unknown names are in `fine_x`
# and the known names in `fine_y`.
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# Names of all fine labels that are not an 'X' entry.
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Put the coarse and fine annotation tables side by side (column-wise concat).
# The validation frame is built from the '*_test' entries of the metadata.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Any row whose columns sum to exactly 37 has all of its columns set to 0.
# NOTE(review): 37 is presumably the total number of label columns, i.e. a row
# with every label switched on — confirm.
train_df.loc[train_df.sum(axis=1).eq(37), :] = 0
valid_df.loc[valid_df.sum(axis=1).eq(37), :] = 0

In [5]:
# Build the inputs (X), per-annotator labels (y) and loss masks (y_mask) for
# both splits. prepare_data loads one .npy array per audio file from disk.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# One mean / standard deviation per position of the last axis, estimated on
# the training set only and reused unchanged for the validation set. The
# channel count is read from the data instead of being hard-coded to 128.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# This pipeline is passed to the training dataset only; the validation dataset
# is created with transform=None. AudioDataset calls it as
# transform(image=array) and reads the result from the 'image' key.
# NOTE(review): rotate_limit is presumably expressed in degrees — confirm
# against the installed albumentations version.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    *(torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask)),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    *(torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask)),
    transform=None)

# Two independently shuffled loaders over the same training set: the training
# loop zips them to pair every batch with a second batch for mixup.
val_loader = DataLoader(dataset=valid_dataset, batch_size=64, shuffle=False)
train_loader_1 = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is actually usable and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Call ``layer.reset_parameters()`` when the layer provides that method.

    Objects without a ``reset_parameters`` attribute are left untouched.
    Intended to be passed to ``Module.apply`` so that every sub-module that
    knows how to re-initialise itself does so.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        return
    reset()

In [11]:
# Placeholder default for the layer-reset depth. Task5Model reads this global
# to decide which leading MobileNetV2 feature blocks are re-initialised; the
# "Parameters" cell that follows assigns the value actually used.
until_x = None

In [12]:
# Parameters
# NOTE(review): this looks like an injected parameters cell from a notebook
# runner such as papermill — confirm.
# MobileNetV2 feature blocks with index <= until_x are re-initialised in
# Task5Model; the value is also embedded in the checkpoint file name.
until_x = 9


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor with a small classification head.

    The single-channel input is batch-normalised and mapped to 3 channels by
    two 1x1 convolutions (``bw2col``), passed through the feature blocks of a
    pretrained MobileNetV2, max-pooled over the last two dimensions and fed
    to a two-layer head that outputs ``num_classes`` logits.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_until : int, optional
        Feature blocks of MobileNetV2 with index ``<= reset_until`` get their
        pretrained weights re-initialised. When ``None`` the module-level
        ``until_x`` is used; if that is ``None`` as well, no block is reset
        (previously this case raised a ``TypeError`` in the comparison).
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # 1 -> 10 -> 3 channels with 1x1 convolutions, so the network
        # receives the 3-channel input the MobileNetV2 features expect.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        # Only `self.mv2.features` is used in forward(); the rest of the
        # MobileNetV2 module is kept as-is and stays part of the state dict.
        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2
        if reset_until is None:
            reset_until = until_x
        if reset_until is not None:
            for i, block in enumerate(self.mv2.features.children()):
                if i <= reset_until:
                    block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch of single-channel inputs."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last two dimensions one after another.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits. NOTE(review): presumably the number of label columns that
# prepare_data keeps after dropping the "unknown" ones — confirm.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad option over every parameter of the model.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped once per epoch with the validation loss in the training loop;
# verbose=True prints a message on every reduction (the recorded run shows
# tenfold reductions, e.g. 1e-3 -> 1e-4).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the label masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def _masked_annotator_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over all unmasked (label, annotator) entries.

    ``criterion`` must return element-wise losses. ``labels`` and ``masks``
    carry one slice per annotator along dimension 2; the same ``outputs`` are
    scored against every annotator's labels, each loss is weighted by its
    mask, and the total is divided by the sum of the masks.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs allowed without a new best validation loss
alpha = 1  # both parameters of the Beta distribution the mixup weights are drawn from
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently, so every batch
    # is paired with a second, differently ordered batch for mixup.
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One mixing weight per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.enable_grad():
            outputs = model(inputs)
            # calculate loss for each set of annotations
            loss = _masked_annotator_loss(criterion, outputs, labels, masks)
            loss.backward()
        optimizer.step()
        # .item() yields a plain Python float, so the running sum does not
        # depend on NumPy's scalar promotion rules.
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no mixup, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_annotator_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Printed before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the weights with the lowest validation loss seen so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6313351988792419 0.4889318517276219
Epoch:  1


0.3237071166167388 0.2031132983309882
Epoch:  2


0.19155254517052625 0.17982525484902517
Epoch:  3


0.1814498941640596 0.1766017015491213
Epoch:  4


0.1792060338162087 0.16649878450802394
Epoch:  5


0.17676981035116557 0.16619722758020675
Epoch:  6


0.17556750814656955 0.16021984602723802
Epoch:  7


0.17406746624289332 0.16185540173734939
Epoch:  8


0.17096810485865618 0.1637883824961526
Epoch:  9


0.1705604541946102 0.15983238177640097
Epoch:  10


0.1700943084181966 0.15955304035118648
Epoch:  11


0.16844600116884387 0.1737105952841895
Epoch:  12


0.16802098984653885 0.15219153463840485
Epoch:  13


0.16577941743103233 0.1497039943933487
Epoch:  14


0.16654194730359154 0.14854905435017177
Epoch:  15


0.16389478702802915 0.1414520868233272
Epoch:  16


0.16290838815070488 0.14349799709660666
Epoch:  17


0.16289669516924266 0.14295251561062677
Epoch:  18


0.16254637329965024 0.13989891644035066
Epoch:  19


0.16168359083098335 0.13747638357537134
Epoch:  20


0.1614876598925204 0.14032742700406484
Epoch:  21


0.1625235410960945 0.15195302665233612
Epoch:  22


0.1606371769228497 0.13940110696213587
Epoch:  23


0.15809670654503075 0.13705926069191524
Epoch:  24


0.15982204756221255 0.13613506938729966
Epoch:  25


0.15969405142036644 0.13406718628747122
Epoch:  26


0.15931407342085968 0.1385458399142538
Epoch:  27


0.15758599542282722 0.13502319157123566
Epoch:  28


0.15802137191231186 0.13548741702522551
Epoch:  29


0.15754413564462919 0.1337091475725174
Epoch:  30


0.15777251446569288 0.13602449532066072
Epoch:  31


0.1586062916227289 0.13769102628741944
Epoch:  32


0.1571920973223609 0.13540409186056682
Epoch:  33


0.15580243194425428 0.13243396473782404
Epoch:  34


0.15687865904859594 0.13418871377195632
Epoch:  35


0.15666039610231244 0.13642058415072306
Epoch:  36


0.15664924721460086 0.13169962061303003
Epoch:  37


0.1554069921777055 0.1307200597865241
Epoch:  38


0.15507178894571355 0.1323004737496376
Epoch:  39


0.15567306127097155 0.1320453520332064
Epoch:  40


0.15606804473980054 0.13584854985986436
Epoch:  41


0.15547712630516775 0.13644047507217952
Epoch:  42


0.15572859830147512 0.135879071695464
Epoch:  43


0.155822189273061 0.13345853132861002
Epoch    43: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  44


0.15380180043143196 0.12759238375084742
Epoch:  45


0.1518462034496101 0.12688273723636354
Epoch:  46


0.15201924700994748 0.12657133702720916
Epoch:  47


0.15210706881574682 0.1256487731422697
Epoch:  48


0.1515137778746115 0.12591440975666046
Epoch:  49


0.15277919737068382 0.12629329626049315
Epoch:  50


0.15123014192323428 0.1257492239986147
Epoch:  51


0.15218664423839465 0.12616501535688127
Epoch:  52


0.15159698072317485 0.12589635699987411
Epoch:  53


0.15152754815849098 0.12582625874451228
Epoch    53: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  54


0.14942354688773285 0.12593349175793783
Epoch:  55


0.15135389365054466 0.1257909174476351
Epoch:  56


0.1514549219125026 0.12569309451750346
Epoch:  57


0.15146689519688888 0.12593041679688863
Epoch:  58


0.1509289040758803 0.12586770632437297
Epoch:  59


0.15045652155940598 0.12543061482054846
Epoch:  60


0.15020655901045413 0.12537583708763123
Epoch:  61


0.15011913067585714 0.1253482582313674
Epoch:  62


0.15052380755140976 0.12537834261144912
Epoch:  63


0.15011136636540695 0.1257213375398091
Epoch:  64


0.14986701954055476 0.1251138714807374
Epoch:  65


0.1490183380004522 0.1256782157080514
Epoch:  66


0.1489197901777319 0.12502803547041758
Epoch:  67


0.15052955980236465 0.12592516945941107
Epoch:  68


0.1500954406486975 0.12489141949585505
Epoch:  69


0.15057082796418989 0.12611251963036402
Epoch:  70


0.14929686043713544 0.12545564132077353
Epoch:  71


0.15035990806850227 0.12503226633582795
Epoch:  72


0.14940426921522296 0.1254467304263796
Epoch:  73


0.15099868178367615 0.1253901177218982
Epoch:  74


0.15000436998702385 0.1251165196299553
Epoch    74: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  75


0.15074853156064008 0.12554444266217096
Epoch:  76


0.15077016039474592 0.1257984180535589
Epoch:  77


0.15140353948683352 0.12585431975977762
Epoch:  78


0.1500086872964292 0.1251669909272875
Epoch:  79


0.14960307567506223 0.1251720509358815
Epoch:  80


0.1503042715626794 0.12505532801151276
Epoch    80: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  81


0.15141614868834213 0.125178191278662
Epoch:  82


0.15145921384966052 0.1251057450260435
Epoch:  83


0.15075123189268885 0.12586962218795503
Epoch:  84


0.1506137835818368 0.12570910900831223
Epoch:  85


0.15097594543083295 0.12547427202974046
Epoch:  86


0.15041671894692085 0.1253003722855023
Epoch    86: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  87


0.15129793455471863 0.12518075747149332
Epoch:  88


0.14917617593262647 0.12510215597493307
Epoch:  89


0.15210205517910622 0.12555581544126784
Epoch:  90


0.1503661371566154 0.12546347826719284
Epoch:  91


0.15096388636408625 0.12578362439359939
Epoch:  92


0.15126791354772207 0.1254368456346648
Epoch:  93
