In [1]:
# Parameters
# until_x: index into the MobileNetV2 feature blocks. Task5Model re-initialises
# every feature block whose index is greater than this value, and the value is
# also embedded in the checkpoint filename written by the training loop.
until_x = 13


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, n_annotations=3,
                 spec_dir='../../data/logmelspec', n_frames=635):
    """Build model inputs, per-annotation targets and loss masks from a label table.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table with one row per annotation and one column per label.
        Resetting its index must yield an ``audio_filename`` column. The
        caller's frame is not modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be masked out whenever that unknown column is > 0.5. The
        unknown columns themselves are removed from the targets.
    n_annotations : int, optional
        Number of annotation slots kept per recording (default 3). Every
        recording needs at least this many rows, otherwise the stacking below
        fails; rows beyond this count are ignored.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed array (default 635).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``; recordings are ordered by
        sorted ``audio_filename``.
    y : numpy.ndarray
        Shape ``(n_files, n_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where a label is masked out, 1 elsewhere.
    """
    df = df.reset_index()
    # Number the annotations of every recording 1..k in row order.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols]
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels of every row
    # whose corresponding "unknown" column is set.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so each annotation slot is a contiguous
    # group of rows sorted by filename.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    # Load the arrays in the same (sorted) filename order as the targets.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames], axis=0)
    X = np.expand_dims(X, axis=1)

    return X, y, mask


# Module-level RandomErasing augmenter (default settings); it is applied by
# AudioDataset.__getitem__ to every sample when a transform is configured.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (input, target, weight) triples with optional augmentation.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When
    ``transform`` is truthy, each fetched input is augmented on the fly;
    otherwise it is returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.pil = transforms.ToPILImage()
        self.transform = transform
        self.weights = weights
        self.y = y
        self.X = X

    def __len__(self):
        """Number of samples, i.e. the size of the first axis of ``X``."""
        n_samples = self.X.shape[0]
        return n_samples

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for position ``idx``."""
        sample = self.X[idx, ...]
        target = self.y[idx, ...]
        weight = self.weights[idx, ...]

        # No augmentation configured: hand the stored tensors back as they are.
        if not self.transform:
            return sample, target, weight

        # Rescale to [0, 1]; the offset and range are remembered so the
        # original scale can be restored after augmenting.
        lowest = sample.min()
        highest = sample.max()
        value_range = highest - lowest
        sample = (sample - lowest) / value_range

        # Cyclic shift along axis 1 by a random offset (rows offset.. first,
        # then the rows before the offset).
        offset = np.random.randint(sample.shape[1])
        sample = torch.roll(sample, shifts=-int(offset), dims=1)

        # Round-trip through a PIL image so the albumentations pipeline
        # receives a numpy image; its output is stored under 'image'.
        as_image = np.array(self.pil(sample))
        augmented = self.transform(image=as_image)['image']
        sample = augmented[None, :, :].permute(0, 2, 1)

        # Random erasing operates on a detached copy.
        sample = random_erasing(sample.clone().detach())

        # Undo the [0, 1] rescaling.
        sample = (sample * value_range) + lowest

        return sample, target, weight


def weight_reset(layer):
    """Call ``layer.reset_parameters()`` when the layer provides it.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes this safe to pass to ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()


class Task5Model(nn.Module):
    """Single-channel front end + MobileNetV2 feature extractor + linear head.

    Only ``mv2.features`` is used in the forward pass; its output is max-pooled
    over the last two axes and fed to a two-layer classifier producing
    ``num_classes`` logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1x1 convolutions mapping the single input channel to the 3 channels
        # consumed by the MobileNetV2 feature extractor.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Re-initialise every feature block whose position is past `until_x`
        # (a notebook-level parameter); earlier blocks keep the weights
        # mobilenet_v2 was created with.
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Return the logits of shape ``(batch, num_classes)`` for input ``x``."""
        feature_maps = self.mv2.features(self.bw2col(x))
        # Global max pooling: reduce the last axis, then the (new) last axis.
        reduced_once, _ = feature_maps.max(dim=-1)
        pooled, _ = reduced_once.max(dim=-1)
        return self.final(pooled)

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only point this at a trusted file.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']

# Taxonomy rows split by whether their fine_id is the 'X' placeholder.
unknown_rows = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# Pair every 'X' fine label with the non-'X' fine labels sharing its coarse
# class (merge suffixes: fine_x = 'X' label, fine_y = known label).
unknown_known_pairs = pd.merge(
    unknown_rows, known_rows, on='coarse', how='inner'
).drop(columns='coarse')

# {'X' fine label: [known fine labels in the same coarse class]}
unknown_to_known = (
    unknown_known_pairs
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())

known_labels = taxonomy_df.loc[taxonomy_df.fine_id != 'X'].fine.tolist()

# Coarse and fine label tables side by side, one frame per split.
train_parts = [metadata['coarse_train'], metadata['fine_train']]
valid_parts = [metadata['coarse_test'], metadata['fine_test']]
train_df = pd.concat(train_parts, axis=1, sort=True)
valid_df = pd.concat(valid_parts, axis=1, sort=True)

In [5]:
# manual correction for one data point
# Rows whose labels sum to exactly 37 are zeroed out in place, in both splits.
for label_frame in (train_df, valid_df):
    row_sum_is_37 = label_frame.sum(axis=1) == 37
    label_frame.loc[row_sum_is_37, :] = 0

In [6]:
# Build the inputs (X), per-annotation targets (y) and loss masks (y_mask)
# for the training and validation splits.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed on the training inputs only, one value per entry of
# the last axis (128 entries), and reused unchanged for the validation inputs.
channel_means = np.mean(train_X.reshape(-1, 128), axis=0)[np.newaxis, np.newaxis, np.newaxis, :]
channel_stds = np.std(train_X.reshape(-1, 128), axis=0)[np.newaxis, np.newaxis, np.newaxis, :]

# NOTE: this overwrites train_X / valid_X, so re-running the cell would
# normalise the already-normalised arrays again.
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# This pipeline is handed to the training AudioDataset only; the validation
# dataset is created with transform=None and therefore skips augmentation.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
batch_size = 96

# Training samples are augmented on the fly; validation samples are not.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain the pairs of batches that are blended by mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook still runs on machines without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# 31 is passed as num_classes, i.e. the output width of the final linear layer.
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch in the
# training loop.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the label masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
epochs = 100
early_stopping_patience = 15
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE loss, averaged over all unmasked (label, annotation) entries.

    The same ``outputs`` are scored against every annotation slot found on the
    last axis of ``labels``; each element-wise loss is multiplied by the
    matching slice of ``masks`` and the grand total is divided by ``masks.sum()``.
    Assumes ``outputs`` is (batch, classes) and ``labels`` / ``masks`` are
    (batch, classes, annotations), as produced by the loaders in this notebook.
    """
    total = 0
    for slot in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, slot]) * masks[:, :, slot]).sum()
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One Beta(alpha, alpha) coefficient per sample blends the batch from
        # loader 1 with the independently shuffled batch from loader 2.
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Per-epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Log before the early-stopping check so the last epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss seen so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6500606918334961 0.49395876526832583
Epoch:  1


0.4198097312450409 0.2641228258609772
Epoch:  2


0.213970103263855 0.19594377875328065
Epoch:  3


0.17025273978710176 0.1462110012769699
Epoch:  4


0.1615547376871109 0.19010944664478302
Epoch:  5


0.1598819375038147 0.14384637475013734
Epoch:  6


0.1554397761821747 0.1587806075811386
Epoch:  7


0.154576376080513 0.1360856920480728
Epoch:  8


0.1537974578142166 0.13201879113912582
Epoch:  9


0.1517464315891266 0.12929459363222123
Epoch:  10


0.1520664417743683 0.13275909125804902
Epoch:  11


0.15168411493301392 0.13158257752656938
Epoch:  12


0.15101939916610718 0.12796063125133514
Epoch:  13


0.15040647983551025 0.12674696892499923
Epoch:  14


0.15030824780464172 0.12645529359579086
Epoch:  15


0.14908994257450103 0.12828727066516876
Epoch:  16


0.14826040744781493 0.13017411679029464
Epoch:  17


0.1474839073419571 0.1274416536092758
Epoch:  18


0.14856570541858674 0.12704505175352096
Epoch:  19


0.1481003612279892 0.13058093935251236
Epoch:  20


0.14726568162441253 0.12679870128631593
Epoch    20: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  21


0.14646096229553224 0.12312636822462082
Epoch:  22


0.14536852598190309 0.12262726426124573
Epoch:  23


0.14525123119354247 0.12224382311105728
Epoch:  24


0.14443651258945464 0.12208190709352493
Epoch:  25


0.14474228858947755 0.12227781862020493
Epoch:  26


0.14419278621673584 0.12201016247272492
Epoch:  27


0.14508259415626526 0.12186703085899353
Epoch:  28


0.1438637226819992 0.12128225117921829
Epoch:  29


0.1426908004283905 0.12168809920549392
Epoch:  30


0.1439245808124542 0.12131559103727341
Epoch:  31


0.1440806919336319 0.12115859240293503
Epoch:  32


0.1430835384130478 0.12154126018285752
Epoch:  33


0.143938068151474 0.12158804088830948
Epoch:  34


0.1432289558649063 0.12158692330121994
Epoch:  35


0.14314581751823424 0.12180556952953339
Epoch:  36


0.1434979557991028 0.12258572429418564
Epoch:  37


0.1432156443595886 0.12129493951797485
Epoch    37: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  38


0.1430964058637619 0.1212145984172821
Epoch:  39


0.1411467671394348 0.12147660255432129
Epoch:  40


0.14131770610809327 0.12125586569309235
Epoch:  41


0.14255111932754516 0.12116895914077759
Epoch:  42


0.1423189228773117 0.12101672142744065
Epoch:  43


0.14238463401794432 0.12121434211730957
Epoch:  44


0.14157565891742707 0.12110403180122375
Epoch:  45


0.1425214546918869 0.12100554555654526
Epoch:  46


0.14269960284233094 0.12106368094682693
Epoch:  47


0.14378181338310242 0.12122481018304825
Epoch:  48


0.14239315688610077 0.12110887616872787
Epoch    48: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  49


0.1420155555009842 0.12106596976518631
Epoch:  50


0.1414213627576828 0.12108524292707443
Epoch:  51


0.14281316101551056 0.12113327533006668
Epoch:  52


0.1421289849281311 0.12106157690286637
Epoch:  53


0.14198180854320527 0.12110100388526916
Epoch:  54


0.14263576984405518 0.12106372863054275
Epoch    54: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  55


0.14166168332099915 0.12097827196121216
Epoch:  56


0.143198322057724 0.12118331342935562
Epoch:  57


0.14186663329601287 0.12130647897720337
Epoch:  58


0.14100729644298554 0.1211940199136734
Epoch:  59


0.14211540937423706 0.12122565805912018
Epoch:  60


0.1432441747188568 0.12103679925203323
Epoch:  61


0.14251294314861299 0.12115868628025055
Epoch    61: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  62


0.14341846466064453 0.12097301334142685
Epoch:  63


0.14147164642810822 0.12102972120046615
Epoch:  64


0.14115179896354677 0.12103486806154251
Epoch:  65


0.14288983821868897 0.12091089636087418
Epoch:  66


0.14297563433647156 0.12100871205329895
Epoch:  67


0.14237824261188506 0.12130583971738815
Epoch:  68


0.14199836373329164 0.12113257050514221
Epoch:  69


0.14207805573940277 0.12107101678848267
Epoch:  70


0.14307428359985352 0.12112340778112411
Epoch:  71


0.14397672951221466 0.12097100615501404
Epoch:  72


0.14279823422431945 0.12106263786554336
Epoch:  73


0.14295883357524872 0.120977483689785
Epoch:  74


0.1424263083934784 0.12121875137090683
Epoch:  75


0.14251450061798096 0.12127135097980499
Epoch:  76


0.1425970596075058 0.12102690041065216
Epoch:  77


0.1426148122549057 0.12106917798519135
Epoch:  78


0.14266129791736604 0.12089024782180786
Epoch:  79


0.14283087909221648 0.12104450911283493
Epoch:  80


0.14266152322292328 0.12092382460832596
Epoch:  81


0.1424948924779892 0.12108231782913208
Epoch:  82


0.14305076777935027 0.12108130306005478
Epoch:  83


0.1420086169242859 0.1210097461938858
Epoch:  84


0.14208195924758912 0.12108815461397171
Epoch:  85


0.14215456545352936 0.12106023877859115
Epoch:  86


0.1429213362932205 0.12092786580324173
Epoch:  87


0.14203933537006377 0.12090294361114502
Epoch:  88


0.1425548642873764 0.12104396671056747
Epoch:  89


0.14283957600593566 0.12105254232883453
Epoch:  90


0.14250276386737823 0.121140156686306
Epoch:  91


0.14205977380275725 0.1211579293012619
Epoch:  92


0.14241408109664916 0.12130933701992035
Epoch:  93
