In [1]:
# Parameters
# until_x: Task5Model re-initialises every child of `mv2.features` whose index is greater
# than this value (children 0..until_x keep their loaded weights). The training loop also
# embeds it in the checkpoint filename.
until_x = 10


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. The index must be named ``audio_filename``;
        the remaining columns are label columns, including the columns named
        by the keys of ``unknown_to_known``. The rows of each file are numbered
        1, 2, ... in their order of appearance, and every file needs at least
        ``n_annotations`` rows. The caller's frame is not modified.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known label
        columns it masks. The unknown columns are removed from the targets.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array (files are
        truncated, never padded).
    n_annotations : int, optional
        Number of annotation slots (rows numbered 1..n_annotations) stacked
        along the last axis of ``y`` and ``y_mask``.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``; files are ordered by sorted
        filename, matching the first axis of ``y`` and ``y_mask``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the annotation flagged the related
        unknown column with a value above 0.5, 1 everywhere else.

    Raises
    ------
    ValueError
        From numpy when the per-annotation slices or the loaded arrays do not
        share a common shape (or when there is nothing to stack).
    """
    df = df.reset_index()
    # Number the annotations of each file 1..k in their order of appearance.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that an annotation
    # made ambiguous by selecting the corresponding unknown label.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that `.loc[[k]]` yields one row per
    # file, sorted by filename, for annotation k.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in annotation_ids], axis=2)

    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    # Add the singleton channel axis expected by the convolutional front end.
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance built with its default arguments; AudioDataset.__getitem__
# calls it on each sample after the albumentations transforms.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with per-sample targets and loss weights.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis. Each ``X[idx]`` is presumably
        ``(1, frames, bins)``; the augmentation path rolls it along axis 1.
    y : torch.Tensor
        Targets, indexed along the first axis.
    weights : torch.Tensor
        Per-target weights (masks), indexed along the first axis.
    transform : callable or None
        Augmentation called as ``transform(image=array)`` that returns a
        mapping with an ``'image'`` entry. When falsy, samples are returned
        untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converter used to hand the sample to the image-based augmentations.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weights)`` for ``idx``, augmenting the sample if enabled."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has a zero range; dividing by it would turn the
            # whole sample into NaNs, so scale by 1 instead.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # Restore the leading channel axis and swap the last two axes back.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: unpickling can execute arbitrary code, so metadata.pkl must come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# For each fine label whose fine_id is 'X', collect the fine labels with any other fine_id
# that share its coarse value: {fine label with id 'X': [other fine labels, ...]}.
unknown_to_known = (
    metadata['taxonomy_df']
    .loc[lambda t: t.fine_id == 'X', ['fine', 'coarse']]
    .merge(
        metadata['taxonomy_df'].loc[lambda t: t.fine_id != 'X', ['fine', 'coarse']],
        on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict()
)
# Every fine label whose fine_id is not 'X'.
known_labels = metadata['taxonomy_df'].loc[lambda t: t.fine_id != 'X', 'fine'].tolist()

# Put the coarse and fine annotation tables of each split side by side.
train_df = pd.concat(
    [metadata[key] for key in ('coarse_train', 'fine_train')], axis=1, sort=True)
valid_df = pd.concat(
    [metadata[key] for key in ('coarse_test', 'fine_test')], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose columns sum to exactly 37 is zeroed out in place. Presumably 37 is the total
# number of label columns, i.e. the row has every label set at once — TODO confirm.
for annotations in (train_df, valid_df):
    rows_to_clear = annotations.sum(axis=1) == 37
    annotations.loc[rows_to_clear, :] = 0

In [6]:
# prepare_data returns X shaped (files, 1, frames, bins) plus y and y_mask shaped
# (files, labels, 3), with one slice per annotation of each file along the last axis.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean and standard deviation are computed per bin of the last axis, on the training set only,
# and the same statistics are applied to both splits. The bin count is read from the data
# instead of being hard-coded.
# NOTE: this cell overwrites train_X / valid_X, so it is not idempotent — re-run the
# prepare_data cell before executing it a second time.
channel_means = train_X.reshape(-1, train_X.shape[-1]).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, train_X.shape[-1]).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# AudioDataset calls this pipeline as `transform(image=array)` and reads the 'image' entry of
# the result (the array is presumably a 2-D 8-bit image — TODO confirm).
albumentations_transform = Compose([
    # Random shift / scale / rotation with limits 0.1 / 0.1 / 0.5 respectively.
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # Grid distortion with the library's default settings.
    GridDistortion(),
    # Converts the augmented array into a torch tensor for the dataset.
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training set is augmented; the validation set is served unchanged.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training loop zips them
# to obtain the pairs of batches that mixup blends together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# `cuda` states the preference. When no CUDA device is available the notebook falls back to
# the CPU instead of failing later at the first `.to(device)` call; the chosen device is
# printed so the fallback is visible.
cuda = True
device = torch.device('cuda:0' if cuda and torch.cuda.is_available() else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place when it knows how to.

    Meant to be passed to ``module.apply``: objects exposing a
    ``reset_parameters`` method have it called, anything else is left untouched.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Single-channel spectrogram classifier built on MobileNetV2 features.

    The input goes through a small 1x1-convolution front end that maps the one
    input channel to three, then through ``mv2.features``, a global max over
    the last two axes, and a two-layer head producing ``num_classes`` logits.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_after : int or None, optional
        Children of ``mv2.features`` with an index greater than this value are
        re-initialised; earlier children keep their pretrained weights. When
        ``None`` (the default) the notebook-level ``until_x`` parameter is
        used, which is the original behaviour.
    """

    def __init__(self, num_classes, reset_after=None):

        super().__init__()

        # Fall back to the notebook parameter so existing `Task5Model(n)` calls behave as before.
        if reset_after is None:
            reset_after = until_x

        # Normalise the single input channel and expand it to the 3 channels mv2 consumes.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        # NOTE(review): only `mv2.features` is used in forward().
        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # 1280 is presumably the channel count produced by `mv2.features` — TODO confirm.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset after ith layer of mv2
        for i, x in enumerate(self.mv2.features.children()):
            if i > reset_after:
                x.apply(weight_reset)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for a 4-D input batch."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis, then the new last axis.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [13]:
# Instantiate the model
# 31 output logits — presumably the number of label columns left in train_y once prepare_data
# has dropped the "unknown" columns (TODO confirm against train_y.shape[1]).
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam with amsgrad=True over every parameter of the model.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The training loop steps this scheduler once per epoch with the validation loss.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can multiply them by the
# annotation masks before reducing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss before stopping
mixup_alpha = 1               # both shape parameters of the Beta distribution used by mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Return the mask-weighted mean loss over every annotation set.

    ``outputs`` holds logits of shape (batch, classes); ``labels`` and ``masks``
    are (batch, classes, n_annotations). The element-wise ``criterion`` is
    evaluated against each annotation slice, weighted by the matching mask
    slice, summed, and divided by the total mask weight.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently drawn batches with one coefficient per sample
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam * batch_a[0]) + ((1 - lam) * batch_b[0])

        # Targets and masks are 3-D, so the coefficients need one axis less.
        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1))
        labels = (lam * batch_a[1]) + ((1 - lam) * batch_b[1])
        masks = (lam * batch_a[2]) + ((1 - lam) * batch_b[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        loss = masked_annotation_loss(model(inputs), labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation (no gradients, no optimizer interaction) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            loss = masked_annotation_loss(model(inputs), labels, masks)
            this_epoch_valid_loss += loss.item()

    # Average of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the final epoch is logged as well.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6096362753494365 0.47401002049446106
Epoch:  1


0.28020837379468455 0.20228927050318038
Epoch:  2


0.17908339605138107 0.14686971689973558
Epoch:  3


0.16744532214628682 0.14249284671885626
Epoch:  4


0.163797942770494 0.14169030317238399
Epoch:  5


0.16097367816680186 0.14654291101864406
Epoch:  6


0.15962079127092618 0.13489790580102376
Epoch:  7


0.15856017655617483 0.14522702459778106
Epoch:  8


0.15717297349427198 0.1337614038160869
Epoch:  9


0.15663526952266693 0.1343729825956481
Epoch:  10


0.15513762026219755 0.14593649016959326
Epoch:  11


0.15383710410143878 0.12853613815137319
Epoch:  12


0.153718930643958 0.12857905881745474
Epoch:  13


0.1530153135190139 0.13104582471506937
Epoch:  14


0.1527487655749192 0.13419953946556365
Epoch:  15


0.1539796171961604 0.13143703128610337
Epoch:  16


0.15016078707334157 0.1286001897283963
Epoch:  17


0.1515737627003644 0.12790153175592422
Epoch:  18


0.15117495285498128 0.1291368624993733
Epoch:  19


0.15162063853160754 0.12938056673322404
Epoch:  20


0.15137414634227753 0.12587056841169084
Epoch:  21


0.1505094303472622 0.12759305856057576
Epoch:  22


0.1499546234672134 0.12628213422639029
Epoch:  23


0.14912818090335742 0.12875566205808095
Epoch:  24


0.14947992965981766 0.12976925926549093
Epoch:  25


0.14924124930356 0.13040984847715922
Epoch:  26


0.1495871108931464 0.13298729487827846
Epoch    26: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  27


0.14762946238388885 0.12382687734706062
Epoch:  28


0.14667798014911446 0.12344108734812055
Epoch:  29


0.1460822969675064 0.12284397120986666
Epoch:  30


0.14561907705422994 0.12345807360751289
Epoch:  31


0.1458642611632476 0.122772963983672
Epoch:  32


0.14485558263353399 0.12279041005032403
Epoch:  33


0.1445409740145142 0.12279795110225677
Epoch:  34


0.14441417963118167 0.1227214549268995
Epoch:  35


0.1449159170324738 0.12258972227573395
Epoch:  36


0.1450899984385516 0.12316055702311653
Epoch:  37


0.14433364086859934 0.12252948858908244
Epoch:  38


0.14559445590586276 0.12253330967256001
Epoch:  39


0.145016284005062 0.1223072622503553
Epoch:  40


0.14343604888464953 0.12284074085099357
Epoch:  41


0.1453317879019557 0.12253406324556895
Epoch:  42


0.14611028497283524 0.1218650181378637
Epoch:  43


0.1447140695275487 0.12257369607686996
Epoch:  44


0.14404480119009275 0.12286752462387085
Epoch:  45


0.14403445092407433 0.12244609849793571
Epoch:  46


0.14354569485058655 0.12228250503540039
Epoch:  47


0.1447674563607654 0.12265987268515996
Epoch:  48


0.14322512576708923 0.1231830535190446
Epoch    48: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  49


0.14425569772720337 0.12220097439629692
Epoch:  50


0.14423931812917865 0.12217453335012708
Epoch:  51


0.14269733751142347 0.12230748044592994
Epoch:  52


0.1454828417784459 0.12190908512898854
Epoch:  53


0.14459210593958158 0.12242465998445239
Epoch:  54


0.14445106443521138 0.1221514003617423
Epoch    54: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  55


0.14455107337719686 0.12199403984206063
Epoch:  56


0.14216210270250165 0.1223868333867618
Epoch:  57


0.14235716695721085 0.12204474742923464
Epoch:  58


0.1436737131428074 0.12224391741412026
Epoch:  59


0.14346534134568395 0.12207452207803726
Epoch:  60


0.14341249135700432 0.12195159601313728
Epoch    60: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  61


0.1427652461303247 0.12233542331627437
Epoch:  62


0.14235124475247152 0.12203136405774526
Epoch:  63


0.1437156832701451 0.12215491597141538
Epoch:  64


0.14298713086424647 0.12205548052276884
Epoch:  65


0.14314817214334333 0.12225232911961419
Epoch:  66


0.14231118157103256 0.12225989252328873
Epoch    66: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  67
