In [1]:
# Parameters
# until_x: position within the MobileNetV2 feature blocks (mv2.features).
# Task5Model re-initialises every block whose position is greater than this
# value, and the training loop embeds the value in the checkpoint filename.
until_x = 4


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build spectrogram, label and mask arrays from an annotation frame.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column (i.e. the index is named
        ``audio_filename``); every other column is a label column. The
        caller's frame is not modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of label columns that
        must be masked out whenever that unknown column is > 0.5 in a row.
        The unknown columns themselves are removed from the labels.
    data_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotations : int, optional
        Number of annotations used per recording (the first
        ``n_annotations`` rows of each recording, in row order).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_recordings, 1, n_frames, n_features)``.
    y : numpy.ndarray
        Shape ``(n_recordings, n_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the entry was masked through
        ``unknown_to_known``, 1 elsewhere.

    Raises
    ------
    ValueError
        If some recording has fewer than ``n_annotations`` rows.
    """
    df = df.reset_index()
    # Running 1-based annotation number within each recording.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Every recording must provide all requested annotation slots, otherwise
    # the per-slot slices below would have different lengths and no longer
    # line up row by row.
    per_file = df.groupby(level='audio_filename').size()
    if (per_file < n_annotations).any():
        raise ValueError(
            'every audio_filename needs at least {} annotation rows'.format(n_annotations))

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols]
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels of every row in
    # which the corresponding unknown column was marked.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that .loc[[k]] selects the k-th
    # annotation of every recording, ordered by filename.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in slots], axis=2)

    # Load the arrays in the same filename order as the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (built with that class's default
# arguments); AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Indexable dataset of (sample, target, weight) triples.

    Parameters
    ----------
    X, y, weights :
        Indexable containers sharing their first axis; ``X`` must expose
        ``.shape``. When ``transform`` is set, each ``X[idx]`` must be a 3-D
        torch tensor (the code slices it as ``sample[:, i:, :]``) —
        presumably (channel, frame, feature) as produced by ``prepare_data``.
    transform : callable or None
        Augmentation called as ``transform(image=array)`` and expected to
        return a mapping with an ``'image'`` entry. When falsy, samples are
        returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converter used to hand the sample to the image-based transform.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples, i.e. the size of the first axis of ``X``."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample
        when a transform was supplied."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole sample into NaN. Divide by 1 in that case, which leaves an
            # all-zero scaled sample that maps back to the constant below.
            safe_range = torch.where(
                value_range > 0, value_range, torch.ones_like(value_range))
            sample = (sample - this_min) / safe_range

            # randomly cycle the file: rotate axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on an image array)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the last two axes of the result
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level instance)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the rows whose fine_id is 'X' and all the others,
# then pair the two groups through their shared coarse class.
taxonomy = metadata['taxonomy_df']
taxonomy_x = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
taxonomy_known = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]
paired = (
    pd.merge(taxonomy_x, taxonomy_known, on='coarse', how='inner')
    .drop(columns='coarse'))

# {fine label with fine_id 'X': [other fine labels of the same coarse class]}
unknown_to_known = paired.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = taxonomy_known.fine.tolist()

# Coarse and fine annotations side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Rows whose labels sum to 37 are zeroed out in both splits
# (presumably 37 is the total number of label columns, i.e. every label was
# marked — TODO confirm).
for frame in (train_df, valid_df):
    all_marked = frame.sum(axis=1) == 37
    frame.loc[all_marked, :] = 0

In [6]:
# Build the arrays for both splits. As returned by prepare_data: X holds the
# stacked spectrograms with a singleton axis inserted at position 1, y the
# labels with one slice per annotation on the last axis, and y_mask an array
# shaped like y that is 0 where an "unknown" column was marked and 1 elsewhere.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed per position of the last axis (128 entries) on the
# training data only and then applied to both splits.
flat_train = train_X.reshape(-1, 128)
channel_means = flat_train.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = flat_train.std(axis=0).reshape(1, 1, 1, -1)
train_X, valid_X = [
    (split - channel_means) / channel_stds
    for split in (train_X, valid_X)
]

In [8]:
# Define the data augmentation transformations
# Applied in list order by AudioDataset; the last step converts the augmented
# image back into a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=96, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that it mixes together.
train_loader_1 = DataLoader(train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is actually available and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU instead of
# failing at the first `.to(device)` call.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Call ``layer.reset_parameters()`` when the layer provides it.

    Written to be passed to ``Module.apply``; objects without a
    ``reset_parameters`` attribute are left untouched.
    """
    absent = object()
    reset = getattr(layer, 'reset_parameters', absent)
    if reset is absent:
        return
    reset()

In [12]:
class Task5Model(nn.Module):
    """Single-channel classifier on top of MobileNetV2 features.

    The input passes through ``bw2col`` (1x1 convolutions ending in three
    channels), the ``features`` part of a pretrained ``mobilenet_v2``, a max
    over the last two axes, and the ``final`` head that outputs
    ``num_classes`` raw scores. Feature blocks positioned after the
    module-level ``until_x`` are re-initialised with ``weight_reset``.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions
        colour_layers = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*colour_layers)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Head on the 1280-wide pooled vector (presumably the channel count
        # of mv2.features' output — TODO confirm)
        head_layers = [
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head_layers)

        # Keep the pretrained weights up to and including position until_x;
        # reset everything after it
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the raw class scores for a batch ``x``."""
        coloured = self.bw2col(x)
        feature_map = self.mv2.features(coloured)
        # Global max pooling: reduce the last axis, then the new last axis
        pooled = feature_map.max(dim=-1)[0]
        pooled = pooled.max(dim=-1)[0]
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 output scores: this has to match the size of axis 1 of the label arrays,
# because the training loop scores the outputs element-wise against
# labels[:, :, k].
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# Stepped with the validation loss at the end of each epoch in the training loop.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=5, verbose=True)
# Un-reduced loss: the training loop weights every element with its mask
# before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def masked_bce_loss(outputs, labels, masks):
    """Masked loss averaged over the total mask weight.

    ``labels`` and ``masks`` carry one slice per annotation set on their last
    axis. Each slice is scored against the same ``outputs`` with the
    module-level ``criterion`` (element-wise, un-reduced), weighted by the
    matching mask slice, and the grand total is divided by ``masks.sum()``.
    Works for any number of annotation slices.
    """
    total = 0
    for k in range(labels.shape[-1]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs tolerated without a new best validation loss
mixup_alpha = 1               # both parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass -------------------------------------------------
    model.train()
    this_epoch_train_loss = 0
    for batch_1, batch_2 in zip(train_loader_1, train_loader_2):

        # mixup: blend each item of batch_1 with the item at the same position
        # of the independently shuffled batch_2, using one coefficient per item
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * batch_1[0]) + ((1 - lam) * batch_2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * batch_1[1]) + ((1 - lam) * batch_2[1])
        masks = (lam * batch_1[2]) + ((1 - lam) * batch_2[2])
        # mixup ends

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
            this_epoch_train_loss += loss.detach().cpu().numpy()

    # ---- validation pass -----------------------------------------------
    model.eval()
    this_epoch_valid_loss = 0
    for inputs, labels, masks in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        with torch.set_grad_enabled(False):
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.detach().cpu().numpy()

    # Mean of the per-batch losses (every batch counts equally, including a
    # smaller final batch).
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so that the last epoch is logged
    # as well.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if this_epoch_valid_loss < lowest_val_loss:
        # New best model: checkpoint it and restart the patience counter.
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.653850519657135 0.585361123085022
Epoch:  1


0.4288037824630737 0.3450699210166931
Epoch:  2


0.2239256274700165 0.19856613874435425
Epoch:  3


0.17899786591529845 0.39118456840515137
Epoch:  4


0.17177840411663056 0.17885486781597137
Epoch:  5


0.16846726417541505 0.16149218678474425
Epoch:  6


0.1647987598180771 0.14936836063861847
Epoch:  7


0.1621914631128311 0.13989486992359162
Epoch:  8


0.16063408315181732 0.18668076097965242
Epoch:  9


0.1600942486524582 0.1512337625026703
Epoch:  10


0.15949684143066406 0.14010317623615265
Epoch:  11


0.15847938120365143 0.13274382054805756
Epoch:  12


0.15671684443950654 0.14903663396835326
Epoch:  13


0.15617213189601897 0.1324704721570015
Epoch:  14


0.15611681282520296 0.1338280200958252
Epoch:  15


0.15530873835086823 0.13201891928911208
Epoch:  16


0.15654786050319672 0.13668967336416243
Epoch:  17


0.1551394546031952 0.13635775744915007
Epoch:  18


0.15263509392738342 0.13748324066400527
Epoch:  19


0.15355128943920135 0.13739302456378938
Epoch:  20


0.15545937955379485 0.12850077003240584
Epoch:  21


0.15366830468177795 0.1305695578455925
Epoch:  22


0.1524984645843506 0.13002716153860092
Epoch:  23


0.1526203918457031 0.13532809168100357
Epoch:  24


0.15276255309581757 0.13617767095565797
Epoch:  25


0.15377183139324188 0.2740216702222824
Epoch:  26


0.1518973970413208 0.13182761371135712
Epoch    26: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  27


0.1508004856109619 0.12452836781740188
Epoch:  28


0.15006855905056 0.12469348013401031
Epoch:  29


0.15042248189449312 0.12394602298736572
Epoch:  30


0.14935690701007842 0.12410571873188019
Epoch:  31


0.14907811641693114 0.12403108775615693
Epoch:  32


0.14886242866516114 0.12380450367927551
Epoch:  33


0.14958449304103852 0.12432715743780136
Epoch:  34


0.14897414565086364 0.12449576407670974
Epoch:  35


0.1494343864917755 0.12411306202411651
Epoch:  36


0.1479106903076172 0.12381625026464463
Epoch:  37


0.1497119140625 0.12429295778274536
Epoch:  38


0.14862066745758057 0.1239520013332367
Epoch    38: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  39


0.14925233483314515 0.12395685464143753
Epoch:  40


0.14867523312568665 0.1238553524017334
Epoch:  41


0.1483219486474991 0.12373759150505066
Epoch:  42


0.14759283244609833 0.12369249910116195
Epoch:  43


0.14803692519664766 0.123643559217453
Epoch:  44


0.1481657999753952 0.12373783141374588
Epoch:  45


0.14732683181762696 0.12365145236253738
Epoch:  46


0.14792787075042724 0.12381552010774613
Epoch:  47


0.14794401466846466 0.12381547540426255
Epoch:  48


0.1483948200941086 0.12399228066205978
Epoch:  49


0.14769354104995727 0.12380296587944031
Epoch    49: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  50


0.148668874502182 0.1237364023923874
Epoch:  51


0.1499124699831009 0.1235743671655655
Epoch:  52


0.14845259249210357 0.12390296459197998
Epoch:  53


0.14902155935764314 0.12359759509563446
Epoch:  54


0.14810706973075866 0.12348006069660186
Epoch:  55


0.1473306769132614 0.12374835312366486
Epoch:  56


0.14796459436416626 0.12385542988777161
Epoch:  57


0.1481200075149536 0.12344386130571365
Epoch:  58


0.1483323496580124 0.1237367570400238
Epoch:  59


0.1490141248703003 0.12358857095241546
Epoch:  60


0.1489257913827896 0.12362003028392791
Epoch:  61


0.1469330930709839 0.12400861531496048
Epoch:  62


0.14865743100643158 0.12379949241876602
Epoch:  63


0.14831782937049864 0.12360007464885711
Epoch    63: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  64


0.14761961340904237 0.1238020807504654
Epoch:  65


0.147336984872818 0.12379524856805801
Epoch:  66


0.1490596926212311 0.12355473935604096
Epoch:  67


0.14800308108329774 0.12381079941987991
Epoch:  68


0.14888718366622924 0.12333572953939438
Epoch:  69


0.14770669043064116 0.12359302192926407
Epoch:  70


0.14798442959785463 0.1238169863820076
Epoch:  71


0.14748303294181825 0.12380491048097611
Epoch:  72


0.14791083574295044 0.1237290307879448
Epoch:  73


0.14659315586090088 0.12376239597797394
Epoch:  74


0.1491006428003311 0.12357758581638337
Epoch    74: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  75


0.1474146580696106 0.12365342080593109
Epoch:  76


0.14863528072834015 0.12362352758646011
Epoch:  77


0.1486202472448349 0.12380929291248322
Epoch:  78


0.14770301938056946 0.12381751537322998
Epoch:  79


0.14801931321620942 0.1236047938466072
Epoch:  80


0.1487485373020172 0.12369859516620636
Epoch:  81


0.147544487118721 0.12371816784143448
Epoch:  82


0.14794391214847566 0.1239153191447258
Epoch:  83


0.14762862801551818 0.12355400919914246
Epoch:  84


0.1490344899892807 0.12369434982538223
Epoch:  85


0.14706662714481353 0.12359657883644104
Epoch:  86


0.1483718228340149 0.12372323423624039
Epoch:  87


0.14804384171962737 0.12391646355390548
Epoch:  88


0.14865842163562776 0.12367230057716369
Epoch:  89


0.1478837925195694 0.12348672151565551
Epoch:  90


0.14744935274124146 0.12368467748165131
Epoch:  91


0.1485310798883438 0.12355136424303055
Epoch:  92


0.14888051211833953 0.12382432222366332
Epoch:  93
