In [1]:
# Parameters
# `until_x` is an index into the children of `mv2.features`: Task5Model
# re-initializes every child whose position is greater than this value.
# The value is also embedded in the checkpoint filename written during training.
until_x = 10


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Turn an annotation table into model inputs, labels and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table whose index is named ``audio_filename``; the
        ``n_annotators`` rows sharing a filename are numbered 1..n in their
        order of appearance. Columns are label columns, including the
        "unknown" columns listed as keys of ``unknown_to_known``.
    unknown_to_known : dict
        Maps each "unknown" column name to the list of known label columns
        that must be masked out wherever that unknown column exceeds 0.5.
    data_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotation rows per file stacked along the last axis.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_features)``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the label is masked, 1 elsewhere.

    Notes
    -----
    The caller's ``df`` is not modified. Rows beyond ``n_annotators`` for a
    file are ignored, as in the original three-annotator implementation.
    """
    # Number the annotations of each file 1, 2, 3, ... in order of appearance.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns away from the real label columns.
    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from an all-ones mask and zero the known labels of every row whose
    # corresponding "unknown" indicator is set.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that each annotator slot can be
    # selected as one contiguous, filename-sorted group.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_slots = range(1, n_annotators + 1)
    y = np.concatenate(
        [df.loc[[slot], :].values[..., np.newaxis] for slot in annotation_slots],
        axis=2)
    y_mask = np.concatenate(
        [y_mask.loc[[slot], :].values[..., np.newaxis] for slot in annotation_slots],
        axis=2)

    # Load the arrays in the same filename order as the label rows.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :], axis=0)
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance built with its default arguments.
# AudioDataset.__getitem__ applies it to every sample when a transform is set.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, labels, weights)`` triples.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis; each ``X[idx]`` is treated as a
        rank-3 tensor whose axis 1 is cyclically shifted during augmentation.
    y : torch.Tensor
        Labels, indexed along the first axis in step with ``X``.
    weights : torch.Tensor
        Per-label weights/masks, indexed along the first axis in step with ``X``.
    transform : callable, optional
        Albumentations-style pipeline called as ``transform(image=array)`` and
        returning a dict with an ``'image'`` entry. When falsy, samples are
        returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample if configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # Min-max scale the sample to [0, 1] before the image transforms.
            # A constant sample has max == min; fall back to a unit range so the
            # division cannot produce NaNs that would poison the whole batch.
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # Randomly cycle the sample along axis 1: the element at position
            # `offset` becomes the first one and the head wraps around to the end.
            offset = np.random.randint(sample.shape[1])
            sample = torch.roll(sample, shifts=-offset, dims=1)

            # Apply the albumentations transforms on a NumPy image, then add a
            # leading axis and swap the last two axes of the returned tensor.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            sample = sample[None, :, :].permute(0, 2, 1)

            # Apply random erasing on a detached copy.
            sample = random_erasing(sample.clone().detach())

            # Revert the min-max scaling so values return to the input range.
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

taxonomy = metadata['taxonomy_df']
label_columns = ['fine', 'coarse']

# Rows with fine_id 'X' are paired with the other fine labels that share their
# coarse class, giving a mapping {fine label with id 'X': [other fine labels]}.
x_rows = taxonomy.loc[lambda x: x.fine_id == 'X', label_columns]
non_x_rows = taxonomy.loc[lambda x: x.fine_id != 'X', label_columns]
paired = pd.merge(x_rows, non_x_rows, on='coarse', how='inner').drop(columns='coarse')
unknown_to_known = (
    paired
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x))
    .to_dict())

# Fine labels whose fine_id is not 'X'.
known_labels = taxonomy.loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Coarse and fine annotations side by side, one frame per split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose values sum to exactly 37 is zeroed out in place
# (presumably a row with every label set; TODO confirm against the data).
for annotations in (train_df, valid_df):
    rows_to_clear = annotations.sum(axis=1) == 37
    annotations.loc[rows_to_clear, :] = 0

In [6]:
# Build the inputs (X), per-annotation labels (y) and loss masks (y_mask)
# for the training and validation splits.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed over the training inputs only, one value for each of
# the 128 entries of the last axis, and then applied to both splits.
# NOTE: this cell rebinds train_X / valid_X, so re-running it normalizes twice.
train_flat = train_X.reshape(-1, 128)
broadcast_shape = (1, 1, 1, -1)
channel_means = train_flat.mean(axis=0).reshape(broadcast_shape)
channel_stds = train_flat.std(axis=0).reshape(broadcast_shape)
train_X, valid_X = [
    (split - channel_means) / channel_stds
    for split in (train_X, valid_X)
]

In [8]:
# Define the data augmentation transformations
# The pipeline is applied by AudioDataset.__getitem__ to the training samples only
# (the validation dataset is created with transform=None).
albumentations_transform = Compose([
    # Shift/scale limits of 0.1 and a rotation limit of 0.5.
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # Grid distortion with the library's default settings.
    GridDistortion(),
    # Convert the augmented image to a tensor.
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Training samples are augmented; validation samples are returned unchanged.
train_dataset = AudioDataset(torch.Tensor(train_X),
                             torch.Tensor(train_y),
                             torch.Tensor(train_y_mask),
                             albumentations_transform)
valid_dataset = AudioDataset(torch.Tensor(valid_X),
                             torch.Tensor(valid_y),
                             torch.Tensor(valid_y_mask),
                             None)

# All loaders use a batch size of 96. Two independently shuffled loaders over the
# same training dataset are created; the training loop zips them to obtain
# pairs of batches for mixup.
val_loader = DataLoader(valid_dataset, 96, shuffle=False)
train_loader_1 = DataLoader(train_dataset, 96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, 96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU instead of failing
# when the model and batches are moved to the device.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialize `layer` in place if it exposes `reset_parameters`.

    Objects without a `reset_parameters` attribute are left untouched, which
    makes the function suitable as a callback for `Module.apply`.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel inputs.

    The network has three stages: a small 1x1-convolution stem that maps the
    single input channel to three channels, the feature extractor of a
    pretrained torchvision MobileNetV2, and a two-layer fully connected head
    producing `num_classes` raw logits (no activation is applied to them here).

    Relies on the module-level `until_x` and `weight_reset` being defined
    before instantiation.
    """

    def __init__(self, num_classes):
        """Build the model.

        Args:
            num_classes: Number of output logits of the final linear layer.
        """

        super().__init__()

        # Stem: batch-norm on the single channel, then 1x1 convolutions 1 -> 10 -> 3,
        # each followed by a ReLU.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        # Pretrained MobileNetV2; only its `features` sub-module is used in forward().
        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Head: 1280 pooled features -> 512 -> num_classes.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))
        
        # Reset after ith layer of mv2
        # Children of mv2.features with index greater than `until_x` have every
        # sub-module exposing `reset_parameters` re-initialized; children with
        # index <= until_x keep their pretrained weights.
        for i, x in enumerate(self.mv2.features.children()):
            if i > until_x:
                x.apply(weight_reset)

    def forward(self, x):
        """Return logits for a batch `x` of single-channel inputs."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: take the maximum over the last axis, twice,
        # collapsing the two trailing dimensions of the feature map.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [13]:
# Instantiate the model
# 31 output logits; presumably the number of label columns left after the
# "unknown" columns are dropped in prepare_data (TODO confirm).
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad variant, initial learning rate 1e-3, over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch in the training loop
# and lowers the learning rate after the loss plateaus (patience of 5 epochs).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the per-element losses so that the training loop can
# multiply them by the masks before reducing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def masked_bce_loss(outputs, labels, masks):
    """Masked loss averaged over all unmasked (label, annotation) entries.

    Args:
        outputs: Model logits, one row per sample and one column per label.
        labels: Targets with the annotation index on the last axis
            (indexed as ``labels[:, :, k]``).
        masks: Weights with the same layout as ``labels``.

    Returns:
        Scalar tensor: the sum of the element-wise `criterion` losses of every
        annotation, each weighted by its mask, divided by ``masks.sum()``.

    Uses the module-level `criterion`, which must return unreduced losses.
    """
    total = 0
    # One pass per set of annotations; the same logits are scored against each.
    for annotation in range(labels.shape[2]):
        per_element = criterion(outputs, labels[:, :, annotation])
        total = total + (per_element * masks[:, :, annotation]).sum()
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
alpha = 1  # Beta(alpha, alpha) parameter of the mixup coefficients
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One mixing coefficient per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Log before the early-stopping check so the final epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint only when the validation loss improves.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.652933840751648 0.5678707599639893
Epoch:  1


0.4328102517127991 0.29412625432014466
Epoch:  2


0.22398923754692077 0.170567986369133
Epoch:  3


0.17619595408439637 0.20377818048000335
Epoch:  4


0.16843807816505432 0.19117692112922668
Epoch:  5


0.16254132628440857 0.2765267938375473
Epoch:  6


0.159590026140213 0.18438678681850434
Epoch:  7


0.1588737803697586 0.13854212462902069
Epoch:  8


0.15762628078460694 0.1352420538663864
Epoch:  9


0.15636859476566314 0.13303709775209427
Epoch:  10


0.1561027455329895 0.7250181317329407
Epoch:  11


0.15563232719898223 0.1321301504969597
Epoch:  12


0.15368171155452728 0.1304759532213211
Epoch:  13


0.15278027355670928 0.12787943482398986
Epoch:  14


0.15253106594085694 0.13229403346776963
Epoch:  15


0.15218450725078583 0.13012757599353791
Epoch:  16


0.15285633623600006 0.13041455447673797
Epoch:  17


0.1522119563817978 0.12996017634868623
Epoch:  18


0.15035975396633147 0.13015670329332352
Epoch:  19


0.1501526004076004 0.13419768661260606
Epoch    19: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  20


0.1489493316411972 0.1252112716436386
Epoch:  21


0.1476941692829132 0.12399609684944153
Epoch:  22


0.148625904917717 0.12434019893407822
Epoch:  23


0.1482161045074463 0.1240425780415535
Epoch:  24


0.14675539910793303 0.12372710853815079
Epoch:  25


0.1475452184677124 0.12376667857170105
Epoch:  26


0.1456567144393921 0.12368785291910171
Epoch:  27


0.1471008425951004 0.12330656349658967
Epoch:  28


0.14766456663608551 0.12320807874202729
Epoch:  29


0.1471475499868393 0.12355474531650543
Epoch:  30


0.14659957587718964 0.12372515201568604
Epoch:  31


0.14535517930984498 0.12335140854120255
Epoch:  32


0.14648600280284882 0.12322305887937546
Epoch:  33


0.14559015691280364 0.12286836504936219
Epoch:  34


0.14576510787010194 0.1233107715845108
Epoch:  35


0.14516855239868165 0.12269956469535828
Epoch:  36


0.14574909150600435 0.12320418506860734
Epoch:  37


0.14724412381649019 0.12304857522249221
Epoch:  38


0.14605524480342866 0.1238656148314476
Epoch:  39


0.14497995972633362 0.12294232696294785
Epoch:  40


0.14462896645069123 0.12200094610452653
Epoch:  41


0.1450958865880966 0.12317503839731217
Epoch:  42


0.14541152596473694 0.12265082448720932
Epoch:  43


0.14528515458106994 0.12280433177947998
Epoch:  44


0.145257847905159 0.12234734296798706
Epoch:  45


0.14423177361488343 0.12245545983314514
Epoch:  46


0.1441459721326828 0.12246129959821701
Epoch    46: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  47


0.14559498488903044 0.12202643901109696
Epoch:  48


0.14399028718471527 0.12196512520313263
Epoch:  49


0.14521781027317046 0.12195520550012588
Epoch:  50


0.1441948276758194 0.12209385186433792
Epoch:  51


0.14487308859825135 0.12217875421047211
Epoch:  52


0.1445450085401535 0.12210257053375244
Epoch:  53


0.14443575263023375 0.12222146987915039
Epoch:  54


0.1453468370437622 0.1218696415424347
Epoch:  55


0.14321241319179534 0.12194669544696808
Epoch:  56


0.14533762753009796 0.12211599051952363
Epoch:  57


0.14489744186401368 0.12190572917461395
Epoch:  58


0.14491622805595397 0.12211730629205704
Epoch:  59


0.1442340701818466 0.12234941124916077
Epoch:  60


0.14471248745918275 0.12228719592094421
Epoch    60: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  61


0.14444311559200287 0.12224340438842773
Epoch:  62


0.14308735549449922 0.12217847108840943
Epoch:  63


0.1446623283624649 0.12227382510900497
Epoch:  64


0.14421548187732697 0.1221359834074974
Epoch:  65


0.1439756667613983 0.12237370610237122
Epoch:  66


0.14442903995513917 0.12220597863197327
Epoch    66: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  67


0.1436991101503372 0.12217498123645783
Epoch:  68


0.14459516763687133 0.12200532704591752
Epoch:  69


0.1442811393737793 0.12242523282766342
Epoch:  70


0.14352726578712463 0.12227731943130493
Epoch:  71


0.1446259707212448 0.12221871018409729
Epoch:  72


0.1447085267305374 0.12213148474693299
Epoch    72: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  73


0.1438236266374588 0.12227271795272827
Epoch:  74


0.1439890468120575 0.12229234129190444
Epoch:  75


0.1442284369468689 0.12237948030233384
Epoch:  76


0.1442301881313324 0.12196139246225357
Epoch:  77


0.1436234337091446 0.12208400517702103
Epoch:  78


0.1444735449552536 0.12228287011384964
Epoch:  79
