In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec',
                 n_frames=635,
                 annotator_ids=(1, 2, 3)):
    """Build model inputs, per-annotator targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table with one row per (file, annotation). After
        ``reset_index()`` it must expose an ``audio_filename`` column; every
        other column is treated as a label column. The caller's frame is not
        modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be masked out whenever that unknown label is active
        (value > 0.5) in the same row.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` feature file per clip.
    n_frames : int, optional
        Number of leading frames kept from each (transposed) feature array.
    annotator_ids : sequence of int, optional
        1-based positions of the annotation rows (per file, in order of
        appearance) that are stacked along the last axis of ``y``/``y_mask``.
        Every file must provide a row for each of these positions, otherwise
        the per-annotator slices have different lengths and stacking fails.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_features)``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, len(annotator_ids))``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the label is masked out, 1 elsewhere.
    """
    df = df.reset_index()
    # Number the rows of each file 1, 2, 3, ... in order of appearance.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that are covered
    # by an active "unknown" label in the same annotation row.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that one `.loc` call selects a whole
    # annotator; sorting makes the file order identical across annotators.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    def _stack_annotators(frame):
        # (n_files, n_labels) per annotator -> (n_files, n_labels, n_annotators)
        return np.concatenate(
            [frame.loc[[k], :].values[..., np.newaxis] for k in annotator_ids],
            axis=2)

    y = _stack_annotators(df)
    y_mask = _stack_annotators(y_mask)

    # File order is taken from the first annotator's rows, matching `y`.
    filenames = (df.loc[[annotator_ids[0]], :]
                 .reset_index(1).audio_filename.tolist())
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)  # add the single input-channel axis

    return X, y, y_mask


# Shared RandomErasing instance (default arguments); AudioDataset.__getitem__
# applies it to every augmented training sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of ``(sample, target, weight)`` triples with optional augmentation.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When
    ``transform`` is falsy the stored sample is returned untouched. Otherwise
    the sample is min-max scaled, cyclically shifted along axis 1 by a random
    offset, passed through ``transform`` (called as ``transform(image=...)``)
    and the module-level ``random_erasing``, and finally mapped back to its
    original value range.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        # Converter used to hand the sample to the image-based augmentations.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        n_items = self.X.shape[0]
        return n_items

    def __getitem__(self, idx):
        sample = self.X[idx, ...]

        # No augmentation configured: hand back the raw triple.
        if not self.transform:
            return sample, self.y[idx, ...], self.weights[idx, ...]

        # Scale to [0, 1]; keep the bounds so the scaling can be undone below.
        lower = sample.min()
        upper = sample.max()
        scaled = (sample - lower) / (upper - lower)

        # Rotate the clip around a random cut point along axis 1.
        cut = np.random.randint(scaled.shape[1])
        tail, head = scaled[:, cut:, :], scaled[:, :cut, :]
        rolled = torch.cat([tail, head], dim=1)

        # Image-space augmentations operate on a NumPy image.
        as_image = np.array(self.pil(rolled))
        augmented = self.transform(image=as_image)['image']
        # Add a leading axis and swap the last two axes of the transform output.
        augmented = augmented[None, :, :].permute(0, 2, 1)

        augmented = random_erasing(augmented.clone().detach())

        # Undo the min-max scaling.
        restored = (augmented * (upper - lower)) + lower

        return restored, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# point this at a metadata file you trust.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']

# Fine labels with fine_id 'X' are the "unknown" ones; pair each of them with
# every known fine label that shares its coarse class.
unknown_fine = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]
unknown_known_pairs = pd.merge(
    unknown_fine, known_fine, on='coarse', how='inner'
).drop(columns='coarse')

# {unknown fine label: [known fine labels of the same coarse class]}
unknown_to_known = (
    unknown_known_pairs
    .groupby('fine_x')['fine_y']
    .apply(lambda labels: list(labels))
    .to_dict())
known_labels = taxonomy_df.loc[taxonomy_df.fine_id != 'X'].fine.tolist()

# Coarse and fine annotation columns side by side for each split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point:
# any annotation row whose columns sum to exactly 37 is reset to all zeros.
for split_df in (train_df, valid_df):
    row_totals = split_df.sum(axis=1)
    split_df.loc[row_totals == 37, :] = 0

In [5]:
# Build the inputs (X), per-annotator targets (y) and loss masks (y_mask)
# for the training and validation splits.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics come from the training split only and are reused for validation.
train_flat = train_X.reshape(-1, 128)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
# Drop the temporary reference so the unnormalised array can be freed.
del train_flat

train_X, valid_X = [
    (split_X - channel_means) / channel_stds
    for split_X in (train_X, valid_X)
]

In [7]:
# Define the data augmentation transformations.
# AudioDataset.__getitem__ calls this pipeline as `transform(image=...)`, reads
# the 'image' entry of the result and then treats it as a torch tensor
# (`permute`, `clone`), so ToTensor() has to remain the last step.
albumentations_transform = Compose([
    # Small random shift / scale / rotation.
    # NOTE(review): rotate_limit is presumably in degrees — confirm against the
    # installed albumentations version.
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # Grid distortion with the library defaults.
    GridDistortion(),
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served as stored.
train_dataset = AudioDataset(
    *map(torch.Tensor, (train_X, train_y, train_y_mask)),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    *map(torch.Tensor, (valid_X, valid_y, valid_y_mask)),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the two batches that mixup blends together.
train_loader_1, train_loader_2 = (
    DataLoader(train_dataset, batch_size=64, shuffle=True) for _ in range(2))

In [9]:
# Define the device to be used.
# Use the first GPU when CUDA is usable and fall back to the CPU otherwise, so
# the later `.to(device)` calls also work on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it provides ``reset_parameters``.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function safe to hand to ``Module.apply`` for a whole sub-tree.
    Returns ``None``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Placeholder default for `until_x`, the index of the last `mv2.features` child
# that Task5Model re-initialises. It is expected to be overridden (see the
# "Parameters" cell that follows) before the model is built.
until_x = None

In [12]:
# Parameters
# Task5Model re-initialises every child of `mv2.features` whose index is
# <= until_x (here: children 0..11); the value is also part of the name of the
# checkpoint file written by the training loop.
until_x = 11


In [13]:
class Task5Model(nn.Module):
    """Single-channel spectrogram classifier built on a MobileNetV2 backbone.

    Pipeline: a learnable 1 -> 3 channel adapter (``bw2col``), the feature
    extractor of a pretrained torchvision MobileNetV2 (``mv2.features``),
    global max pooling over the two trailing axes, and a small fully connected
    head (``final``) that emits ``num_classes`` raw logits.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_until : int or None, optional
        Children of ``mv2.features`` with index ``<= reset_until`` have their
        parameters re-initialised via ``weight_reset`` (discarding the
        pretrained weights of those layers). When omitted, the module-level
        ``until_x`` is used, which keeps ``Task5Model(n)`` working as before.
        If the resolved value is ``None`` no layer is reset.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # BatchNorm on the raw input followed by 1x1 convolutions that turn
        # the single input channel into the 3 channels the backbone expects.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Head consuming the 1280-dimensional pooled backbone features.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Resolve the reset depth: explicit argument first, notebook-level
        # parameter second.
        if reset_until is None:
            reset_until = until_x

        # Reset until ith layer of mv2. A missing depth (None) means "keep all
        # pretrained weights" instead of failing on the `<=` comparison.
        if reset_until is not None:
            for i, x in enumerate(self.mv2.features.children()):
                if i <= reset_until:
                    x.apply(weight_reset)

    def forward(self, x):
        """Return raw (pre-sigmoid) class logits for a batch of inputs."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis, then the new last axis.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model and move it to the selected device.
# NOTE(review): 31 output logits — presumably one per known label column in
# train_y; confirm against the prepared data.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The training loop steps this scheduler with the epoch's validation loss; the
# learning rate is lowered after more than `patience` epochs without improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the label masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def _masked_multi_annotator_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss over all annotation sets.

    ``labels`` and ``masks`` carry one slice per annotation set on their last
    axis (``[:, :, k]``); the same ``outputs`` logits are scored against every
    slice with the module-level ``criterion`` (element-wise, unreduced). The
    masked element-wise losses are summed and divided by the total mask weight.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
alpha = 1                     # mixup coefficients are drawn from Beta(alpha, alpha)
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    # Two independently shuffled passes over the training set supply the
    # pairs of batches that mixup blends together.
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One coefficient per sample, broadcast over inputs, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, batch_a[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * batch_a[0]) + ((1 - lam) * batch_b[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * batch_a[1]) + ((1 - lam) * batch_b[1])
        masks = (lam * batch_a[2]) + ((1 - lam) * batch_b[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        # Explicitly enable autograd in case it was disabled globally.
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = _masked_multi_annotator_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_multi_annotator_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint on every new best validation loss; otherwise count towards
    # early stopping.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.63810668603794 0.5191043615341187
Epoch:  1


0.3351627028471715 0.2037735858133861
Epoch:  2


0.19151490040727565 0.1705287652356284
Epoch:  3


0.17908044119138974 0.17642017134598323
Epoch:  4


0.17824498221680923 0.16268547943660192
Epoch:  5


0.17359400117719495 0.16531509586742946
Epoch:  6


0.17277855567030004 0.16519824947629655
Epoch:  7


0.17268160589643428 0.15543702670506068
Epoch:  8


0.17088911621957212 0.1579155453613826
Epoch:  9


0.16916137289356542 0.14991371546472823
Epoch:  10


0.1685037560559608 0.15921747897352492
Epoch:  11


0.16812894714845195 0.1604879753930228
Epoch:  12


0.16595883707742434 0.14721133879252843
Epoch:  13


0.1649164476910153 0.1449035480618477
Epoch:  14


0.16425005849954244 0.14969154553753988
Epoch:  15


0.16430536073607369 0.14093887699501856
Epoch:  16


0.162234510521631 0.1481663350548063
Epoch:  17


0.16238886762309718 0.14154933712312154
Epoch:  18


0.16215067739422256 0.14270573960883276
Epoch:  19


0.16164115876764865 0.1378089530127389
Epoch:  20


0.1614359097706305 0.13651842836822783
Epoch:  21


0.1599676649312715 0.13930788742644445
Epoch:  22


0.15965114574174624 0.1327105537056923
Epoch:  23


0.1579573593429617 0.13141947984695435
Epoch:  24


0.15974880593854027 0.13614475407770701
Epoch:  25


0.15805321206917633 0.13498479553631373
Epoch:  26


0.15815661726771174 0.13808951739753997
Epoch:  27


0.1574564487547488 0.1335601732134819
Epoch:  28


0.1572094668407698 0.1330462938972882
Epoch:  29


0.15622665874055913 0.13506082551819937
Epoch    29: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  30


0.15524924566616882 0.12766762397118978
Epoch:  31


0.15465964336653013 0.12802408529179438
Epoch:  32


0.15387634208073486 0.12714403761284693
Epoch:  33


0.15272504091262817 0.12721080013683864
Epoch:  34


0.15311394269401962 0.1267069439802851
Epoch:  35


0.15383453103336128 0.12739890600953782
Epoch:  36


0.15349644301710902 0.1269113485302244
Epoch:  37


0.15382538494226095 0.1270094279732023
Epoch:  38


0.1530268772228344 0.12715544551610947
Epoch:  39


0.1539981409504607 0.12660244745867594
Epoch:  40


0.15439752106731003 0.12668411433696747
Epoch:  41


0.15284911966001666 0.1264575230223792
Epoch:  42


0.15315324148616274 0.12685901139463698
Epoch:  43


0.15288888481823174 0.12650444677897862
Epoch:  44


0.15225575461580948 0.12653283774852753
Epoch:  45


0.15324988759852745 0.12696597299405507
Epoch:  46


0.15229971143039497 0.1266043164900371
Epoch:  47


0.15420600167802861 0.12624197453260422
Epoch:  48


0.15283664819356557 0.12618702011448996
Epoch:  49


0.15168931113707051 0.1255894973874092
Epoch:  50


0.15229964054919579 0.12667009447302138
Epoch:  51


0.15205083747167844 0.1263810960309846
Epoch:  52


0.1524227285707319 0.12612641496317728
Epoch:  53


0.15185984848318873 0.1264857447573117
Epoch:  54


0.15142205599192027 0.12629010102578572
Epoch:  55


0.1520547754055745 0.12551950024706976
Epoch:  56


0.15165453263231227 0.12717916071414948
Epoch:  57


0.15102843658344164 0.12561230787209102
Epoch:  58


0.15237629131690875 0.12538712258849824
Epoch:  59


0.15188804228563565 0.12570941873959132
Epoch:  60


0.1514086916639998 0.12644127862794058
Epoch:  61


0.15069505129311536 0.12570036947727203
Epoch:  62


0.15138099684908585 0.12619242072105408
Epoch:  63


0.1511050535214914 0.12609999307564326
Epoch:  64


0.15158416451634588 0.1264029017516545
Epoch    64: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  65


0.1509174789125855 0.1251835205725261
Epoch:  66


0.14958791354218046 0.1254188162939889
Epoch:  67


0.15150162094348185 0.12517791028533662
Epoch:  68


0.15081517358083982 0.12503536684172495
Epoch:  69


0.1510526727180223 0.12531529473406927
Epoch:  70


0.15064381062984467 0.1250151821545192
Epoch:  71


0.15024506602738355 0.12512943148612976
Epoch:  72


0.14984994965630608 0.1250181794166565
Epoch:  73


0.15028734464903135 0.1250983742730958
Epoch:  74


0.15061994098328255 0.12490956165960856
Epoch:  75


0.15068584197276347 0.12516731023788452
Epoch:  76


0.15035927859512535 0.1252677866390773
Epoch:  77


0.15135226781303818 0.12516716229064123
Epoch:  78


0.15103932087485855 0.12532611617020198
Epoch:  79


0.15120503386935671 0.12518388139350073
Epoch:  80


0.1509204289397678 0.12537602654525212
Epoch    80: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  81


0.15052446964624766 0.12517474378858293
Epoch:  82


0.15145900080332886 0.12514603244406836
Epoch:  83


0.15021331447201805 0.12514267329658782
Epoch:  84


0.14992461293130308 0.1251129667673792
Epoch:  85


0.14907619840389974 0.12528132327965327
Epoch:  86


0.14985101690163483 0.1252064715538706
Epoch    86: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  87


0.15073772580237002 0.1251295336655208
Epoch:  88


0.14997518666692683 0.1253176714692797
Epoch:  89


0.15116810959738655 0.12549610329525812
Epoch:  90


0.14865997755849683 0.1252698004245758
Epoch:  91


0.15027458603317673 0.12514141201972961
Epoch:  92


0.15129112794592575 0.12521830733333314
Epoch    92: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  93


0.15007035071785385 0.12522962157215392
Epoch:  94


0.1508473467987937 0.12506995456559317
Epoch:  95


0.15198142544643298 0.12512791901826859
Epoch:  96


0.1506041445442148 0.12507222912141255
Epoch:  97


0.15046890442435806 0.12513299499239242
Epoch:  98


0.1504639721400029 0.1250923126935959
Epoch:  99
