In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec', n_frames=635):
    """Build model inputs, targets and loss masks from an annotation table.

    Rows of ``df`` that share an ``audio_filename`` are numbered 1, 2, 3, ...
    in order of appearance. The rows numbered 1, 2 and 3 are stacked along
    the last axis of the returned targets and masks; any further rows of a
    file are ignored. Files are returned sorted by ``audio_filename``.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table indexed by ``audio_filename`` (``reset_index()`` must
        produce an ``audio_filename`` column). All other columns are labels.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known
        label columns it makes unreliable. The unknown columns are removed
        from the targets; wherever an unknown column is > 0.5 the mask of
        the listed known columns is set to 0.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.

    Returns
    -------
    X : numpy.ndarray
        Stacked, transposed and cropped arrays with a singleton axis
        inserted at position 1: ``(n_files, 1, <=n_frames, n_bins)``.
    y : numpy.ndarray
        Label values, shape ``(n_files, n_known_labels, 3)``.
    y_mask : numpy.ndarray
        Array of the same shape as ``y`` holding 1 where the label counts
        towards the loss and 0 where it is masked out.
    """
    df = df.reset_index()
    # Number the annotation rows of every file 1, 2, 3, ... in row order.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" columns off; they only drive the mask.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that an active
    # "unknown" annotation makes unreliable.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so each slot can be selected as a
    # block whose rows are sorted by file name.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = (1, 2, 3)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([mask_df.loc[[slot], :].values for slot in slots], axis=2)

    # Load the arrays in the same file order as the rows of y / y_mask.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (default constructor arguments);
# AudioDataset.__getitem__ calls it on each augmented training sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (input, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs, indexed along the first axis. The augmentation path slices
        each sample as ``sample[:, i:, :]``, so samples must be 3-D.
    y : torch.Tensor
        Targets, indexed along the first axis.
    weights : torch.Tensor
        Per-target weights/masks, indexed along the first axis.
    transform : callable, optional
        Called as ``transform(image=ndarray)``; must return a mapping whose
        ``'image'`` entry is a 2-D tensor. When falsy, samples are returned
        unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    @staticmethod
    def _min_max_scale(sample):
        """Scale ``sample`` to [0, 1]; return ``(scaled, minimum, span)``.

        A constant sample has ``max - min == 0``; dividing by that would
        fill the sample with NaN, so the span is replaced by 1 in that case
        (the scaled sample is then all zeros and the inverse transform
        ``scaled * span + minimum`` still restores the original values).
        """
        lowest = sample.min()
        span = sample.max() - lowest
        if span == 0:
            span = torch.ones_like(span)
        return (sample - lowest) / span, lowest, span

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample
        when a transform was supplied."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation (guarded against constant samples)
            sample, this_min, this_span = self._min_max_scale(sample)

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on a numpy image)
            # NOTE(review): ToPILImage presumably quantises the [0, 1] floats
            # to 8 bit here - confirm this loss of precision is intended.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the two image axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * this_span) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE(review): pickle.load executes arbitrary code from the file - only use
# it on a metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the "unknown" fine labels (fine_id == 'X') and the
# known ones, keeping only the columns needed for the join.
taxonomy = metadata['taxonomy_df']
unknown_fine = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# For every unknown fine label, list the known fine labels that share its
# coarse class: {unknown fine label: [known fine labels]}.
unknown_known_pairs = unknown_fine.merge(known_fine, on='coarse', how='inner')
unknown_to_known = (
    unknown_known_pairs
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_fine.fine.tolist()

# One table per split, coarse and fine label columns side by side.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Any row whose label values sum to exactly 37 is zeroed out, in both splits.
for split_df in (train_df, valid_df):
    flagged_rows = split_df.sum(axis=1) == 37
    split_df.loc[flagged_rows, :] = 0

In [5]:
# Build inputs (X), targets (y) and loss masks (y_mask) for both splits,
# using the same unknown -> known label mapping for each.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# The last axis is treated as the channel axis; its size is read from the
# data instead of being hard-coded. Statistics come from the training set
# only and are applied unchanged to the validation set.
# NOTE: this cell overwrites train_X / valid_X, so running it twice
# normalises twice - re-run the data preparation cell first.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Applied in order to every training sample by AudioDataset (which calls the
# pipeline as transform(image=...) and reads the 'image' entry of the result).
albumentations_transform = Compose([
    # NOTE(review): limits are presumably fractions of the image size for
    # shift/scale and degrees for rotation - confirm in the albumentations docs.
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    # converts the augmented numpy image to a torch tensor
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain the pairs of batches that mixup blends.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first CUDA device when one is available and fall back to the CPU
# otherwise, instead of failing later at `.to(device)` on a machine without
# a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it exposes ``reset_parameters``.

    Objects without a ``reset_parameters`` attribute are left untouched, so
    the function can be handed to ``Module.apply`` and run over every
    sub-module.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default value of the `until_x` notebook parameter. Task5Model.__init__
# reads this global and resets the children of mv2.features whose index
# is <= until_x.
until_x = None

In [12]:
# Parameters
# NOTE(review): presumably injected by a parameterised notebook run (e.g.
# papermill) to override the default above - confirm with the run setup.
until_x = 2


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor on single-channel inputs with a small
    classification head.

    Parameters
    ----------
    num_classes : int
        Size of the output (logit) vector.
    reset_until : int, optional
        Children of ``mv2.features`` with index ``<= reset_until`` have their
        pretrained weights re-initialised. When omitted, the notebook-level
        ``until_x`` is used, which keeps existing ``Task5Model(n)`` calls
        working. If the resulting value is ``None`` nothing is reset
        (previously this raised ``TypeError`` on ``i <= None``).
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # Learned 1x1 convolutions that turn the single input channel into
        # the 3 channels MobileNetV2 expects.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Head applied to the pooled feature vector; the first Linear layer
        # expects 1280 input features.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Fall back to the notebook-level parameter when not given explicitly.
        if reset_until is None:
            reset_until = until_x

        # Reset the first layers of mv2, up to and including index
        # `reset_until`; later layers keep their pretrained weights.
        if reset_until is not None:
            for i, layer in enumerate(self.mv2.features.children()):
                if i <= reset_until:
                    layer.apply(weight_reset)

    def forward(self, x):
        """Return ``num_classes`` logits per sample for input batch ``x``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: maximum over the last two axes in turn.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 is passed as num_classes, i.e. the size of the model's output layer;
# the model is then moved to the selected device.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch in the
# training loop; patience=5 is the number of epochs it waits before reducing
# the learning rate.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
epochs = 100
early_stopping_patience = 25  # epochs allowed without a new best validation loss
alpha = 1                     # mixup coefficients are drawn from Beta(alpha, alpha)
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE loss averaged over all unmasked (label, annotation) entries.

    ``labels`` and ``masks`` hold one annotation set per index of their last
    axis. The element-wise ``criterion`` is evaluated against every set,
    multiplied by that set's mask, summed, and the total is divided by
    ``masks.sum()``.
    """
    per_set_totals = [
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2])]
    return sum(per_set_totals) / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One coefficient per sample blends the paired batches; the same
        # coefficient is applied to inputs, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])
        lam = torch.Tensor(mixup_vals)

        lam_inputs = lam.view(-1, 1, 1, 1)
        inputs = (lam_inputs * i1[0]) + ((1 - lam_inputs) * i2[0])

        lam_targets = lam.view(-1, 1, 1)
        labels = (lam_targets * i1[1]) + ((1 - lam_targets) * i2[1])
        masks = (lam_targets * i1[2]) + ((1 - lam_targets) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation (no augmentation, no mixup, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are means over batches.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6377757046673749 0.49145046302250456
Epoch:  1


0.3298776254460618 0.19726291298866272
Epoch:  2


0.18507148768450762 0.1597285270690918
Epoch:  3


0.17206596562991272 0.15211216998951776
Epoch:  4


0.16666093508939486 0.1499003725392478
Epoch:  5


0.16428497071201736 0.14495449300323213
Epoch:  6


0.15948092252821536 0.1389262324997357
Epoch:  7


0.15929960600427678 0.1412005073257855
Epoch:  8


0.15880606222797083 0.13534292791570937
Epoch:  9


0.15721593555566427 0.13723037179027284
Epoch:  10


0.15687582541156458 0.13127422971384867
Epoch:  11


0.1566718115999892 0.13328896250043595
Epoch:  12


0.15524189214448672 0.13597840070724487
Epoch:  13


0.15674019262597366 0.13055151594536646
Epoch:  14


0.1548834886099841 0.13000040182045527
Epoch:  15


0.15335336569193248 0.13059736788272858
Epoch:  16


0.15409824413222237 0.1277082934975624
Epoch:  17


0.15963962392227068 0.13526221151862824
Epoch:  18


0.1581893418286298 0.13372592947312764
Epoch:  19


0.1558748483657837 0.1332577413746289
Epoch:  20


0.15395525863041748 0.12991364406687872
Epoch:  21


0.15437179280293956 0.12903665325471333
Epoch:  22


0.15474902576691396 0.12675308542592184
Epoch:  23


0.15291013790143504 0.12815584987401962
Epoch:  24


0.15214628303373182 0.12711155308144434
Epoch:  25


0.15140865179332527 0.12732314957039698
Epoch:  26


0.15190734653859525 0.12637882360390254
Epoch:  27


0.15155236785476273 0.13008887746504375
Epoch:  28


0.15122224712694013 0.12852323055267334
Epoch:  29


0.15112307385818377 0.12859294137784413
Epoch:  30


0.15001162443612073 0.12876259641987936
Epoch:  31


0.14970010076020215 0.12660660701138632
Epoch:  32


0.14923923724406474 0.12511922419071198
Epoch:  33


0.14945302098184018 0.1250085766826357
Epoch:  34


0.1490868068224675 0.12614919138806208
Epoch:  35


0.14893967034043493 0.12822004514081137
Epoch:  36


0.1508961380333514 0.12787755791630065
Epoch:  37


0.14889930430296305 0.12671492674521037
Epoch:  38


0.14847163211654973 0.1267599825348173
Epoch:  39


0.14790247501553716 0.12665306138140814
Epoch    39: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  40


0.14764175624460787 0.12313102398599897
Epoch:  41


0.145798100410281 0.12235637647765023
Epoch:  42


0.14574308894776009 0.12183335423469543
Epoch:  43


0.14535533093117378 0.12174916373831886
Epoch:  44


0.14430766895010666 0.12120161737714495
Epoch:  45


0.14464814236035217 0.12116996837513787
Epoch:  46


0.14544130016017603 0.12119870313576289
Epoch:  47


0.1444084070421554 0.12118610739707947
Epoch:  48


0.1439348418970366 0.12167353928089142
Epoch:  49


0.1455315029298937 0.12088347226381302
Epoch:  50


0.14448734715178208 0.1208948524934905
Epoch:  51


0.14453412874324903 0.12130729428359441
Epoch:  52


0.14492843924341975 0.12115540568317686
Epoch:  53


0.14448345633777412 0.12058591736214501
Epoch:  54


0.14601994688446457 0.12162610569170543
Epoch:  55


0.1427370072216601 0.12171424818890435
Epoch:  56


0.14359295811202075 0.12170234216111046
Epoch:  57


0.14277506035727425 0.12138199486902781
Epoch:  58


0.14387164446147713 0.12126929845128741
Epoch:  59


0.14253553969634547 0.12158954995019096
Epoch    59: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  60


0.1441781424187325 0.12112516164779663
Epoch:  61


0.14204269043497136 0.12117735828672137
Epoch:  62


0.14394707333397222 0.1210748138172286
Epoch:  63


0.14242940174566732 0.12109197676181793
Epoch:  64


0.14372238033526652 0.12101584460054125
Epoch:  65


0.1430826537512444 0.12133078915732247
Epoch    65: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  66


0.14368501587494 0.1210708458508764
Epoch:  67


0.14181423428896311 0.12101056746074132
Epoch:  68


0.14287592148458636 0.12110290676355362
Epoch:  69


0.14274964421182065 0.12121865046875817
Epoch:  70


0.1436341433106242 0.12107256587062563
Epoch:  71


0.14153858697092211 0.12105757210935865
Epoch    71: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  72


0.14221098576043104 0.12106001057795115
Epoch:  73


0.14221102079829653 0.12101612027202334
Epoch:  74


0.1425994010390462 0.12099641135760716
Epoch:  75


0.14252805146011147 0.12149067223072052
Epoch:  76


0.14403011951897596 0.12126288563013077
Epoch:  77


0.14354328931988897 0.12125367032630104
Epoch    77: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  78
