In [1]:
# Parameters
# `until_x` controls how much of the pretrained MobileNetV2 backbone is kept:
# in Task5Model.__init__, every feature block whose index is greater than
# `until_x` has its parameters re-initialised. The value is also embedded in
# the checkpoint filename written by the training loop.
# NOTE(review): this looks like an externally injected parameters cell
# (papermill-style) — confirm before hard-coding a different value here.
until_x = 7


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Turn an annotation table into model inputs, targets and loss masks.

    Args:
        df: DataFrame whose index is named ``audio_filename`` and holds one
            row per annotation of a file; the columns are label indicators.
            Each file is expected to have at least ``n_annotations`` rows.
        unknown_to_known: dict mapping an "unknown" label column to the list
            of known label columns that must be masked out whenever that
            unknown label is set (value > 0.5) in an annotation.
        logmelspec_dir: directory holding one ``<audio_filename>.npy``
            spectrogram per file. Defaults to the original hard-coded path.
        n_frames: number of leading rows kept from each transposed
            spectrogram. Defaults to the original value of 635.
        n_annotations: number of annotation slots stacked on the last axis
            of ``y`` / ``y_mask``. Defaults to the original value of 3.

    Returns:
        Tuple ``(X, y, y_mask)``:
            X: array of shape (n_files, 1, n_frames, spectrogram_rows).
            y: array of shape (n_files, n_known_labels, n_annotations).
            y_mask: same shape as ``y``; 0 where the label must be ignored
                in the loss, 1 elsewhere.
    """
    # Number the annotations of each file 1, 2, 3, ... in row order.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns away from the real targets.
    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from an all-ones mask and zero the known labels of a group
    # wherever the annotator chose the group's "unknown" label instead.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so each slot can be sliced as a block
    # whose rows are sorted by filename.
    df = df.swaplevel(i=1, j=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0).sort_index()

    def _stack_annotations(frame):
        # One (n_files, n_labels) slab per annotation slot, stacked last.
        return np.stack(
            [frame.loc[[slot], :].values for slot in range(1, n_annotations + 1)],
            axis=2)

    y = _stack_annotations(df)
    y_mask = _stack_annotations(y_mask)

    # Load spectrograms in the same file order as the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :],
            axis=0)
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (constructed with its default settings);
# AudioDataset.__getitem__ looks this name up globally when a transform is set.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrograms with per-annotation targets and loss weights.

    Items are ``(sample, y[idx], weights[idx])``. When ``transform`` is
    falsy the sample is returned unchanged; otherwise it is augmented (see
    ``__getitem__``).
    """

    def __init__(self, X, y, weights, transform=None):
        """Store the tensors and the optional augmentation callable.

        Args:
            X: tensor indexed along its first axis, one entry per item.
            y: targets, indexed along the first axis like ``X``.
            weights: loss weights/masks, indexed like ``y``.
            transform: optional callable invoked as ``transform(image=...)``
                and expected to return a mapping with an ``'image'`` entry.
        """
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of items (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for item ``idx``.

        With a transform configured the sample is min-max scaled, cyclically
        shifted along axis 1 by a random offset, passed through the
        transform and the module-level ``random_erasing``, then scaled back
        to its original value range.
        """
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has a zero range; dividing by it would fill
            # the sample with NaNs. Using 1 keeps it at zero after scaling
            # and restores `this_min` on the way back.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            # NOTE(review): ToPILImage presumably quantises the scaled sample
            # to an 8-bit image — confirm this precision loss is acceptable.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # NOTE(review): the permute presumably undoes an axis swap done by
            # the tensor conversion on a 2-D image — confirm the final layout.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

taxonomy = metadata['taxonomy_df']
unknown_fine = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# For every fine label with fine_id 'X', collect the other fine labels that
# share its coarse category (the merge suffixes them as fine_x / fine_y).
unknown_to_known = (
    unknown_fine
    .merge(known_fine, on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_fine.fine.tolist()

# Coarse and fine annotations side by side, one frame per split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Rows whose label values sum to 37 are zeroed out in both splits.
for split_df in (train_df, valid_df):
    rows_summing_to_37 = split_df.sum(axis=1) == 37
    split_df.loc[rows_summing_to_37, :] = 0

In [6]:
# Build inputs, targets and loss masks for both splits.
# prepare_data returns X with a singleton axis inserted at position 1, and
# y / y_mask with the three annotation slots stacked on the last axis.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics come from the training split only and are shaped to broadcast
# against the last axis (128 entries) of both splits.
channel_means, channel_stds = (
    statistic(train_X.reshape(-1, 128), axis=0).reshape(1, 1, 1, -1)
    for statistic in (np.mean, np.std))
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in order: small shift/scale/rotate, grid distortion, then
# conversion to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
batch_size = 96

# Only the training set is augmented; validation samples pass through as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# Two independently shuffled views of the training set; the training loop
# zips them to obtain the pairs of batches it mixes together.
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [10]:
# Define the device to be used
# Use the first CUDA device when one is available and fall back to the CPU
# otherwise, so the notebook still runs on a machine without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` via its ``reset_parameters`` method, if any.

    Objects without a ``reset_parameters`` attribute are left untouched.
    Intended to be passed to ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Single-channel input -> 3-channel adapter -> MobileNetV2 features -> MLP head.

    Construction reads the module-level ``until_x``: MobileNetV2 feature
    blocks with an index greater than ``until_x`` are re-initialised, the
    earlier ones keep the weights loaded by ``pretrained=True``.
    """

    def __init__(self, num_classes):
        """Build the network.

        Args:
            num_classes: output width of the last linear layer, i.e. the
                number of logits produced per sample.
        """

        super().__init__()

        # Learned 1x1-convolution adapter from the single input channel to the
        # 3-channel tensor that is fed to the MobileNetV2 feature extractor.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        # Only `self.mv2.features` is used in forward(); the rest of the
        # torchvision model stays registered on the module but is not called.
        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classification head applied to the 1280 pooled feature channels.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))
        
        # Reset after ith layer of mv2
        # Every submodule of the selected blocks that exposes
        # `reset_parameters` is re-initialised through weight_reset.
        for i, x in enumerate(self.mv2.features.children()):
            if i > until_x:
                x.apply(weight_reset)

    def forward(self, x):
        """Return one row of ``num_classes`` logits per input sample."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last two axes, leaving one value per
        # feature channel.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [13]:
# Instantiate the model
# NOTE(review): 31 is presumably the number of label columns left in the
# targets after prepare_data drops the "unknown" columns — confirm against
# train_y.shape[1].
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch in the
# training loop.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the label masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def masked_bce_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over all annotation slots.

    Args:
        criterion: element-wise loss (built with ``reduction='none'``).
        outputs: model logits, one row per sample.
        labels: targets whose last axis indexes the annotation slot.
        masks: weights with the same shape as ``labels``.

    Returns:
        Scalar tensor: the sum of the masked element-wise losses over every
        annotation slot, divided by ``masks.sum()``.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1               # Beta(alpha, alpha) parameter of the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0.0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One mixing weight per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(criterion, outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is shown too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6511731719970704 0.6002566099166871
Epoch:  1


0.42979936361312865 0.2713890731334686
Epoch:  2


0.22327732563018798 0.16912069618701936
Epoch:  3


0.17901551783084868 0.15975319147109984
Epoch:  4


0.1721202951669693 0.15149368047714235
Epoch:  5


0.16794700741767885 0.15511710047721863
Epoch:  6


0.16576677978038787 0.14653074443340303
Epoch:  7


0.16321604669094086 0.15253648161888123
Epoch:  8


0.1624435794353485 0.14615636169910431
Epoch:  9


0.1606662553548813 0.13938708305358888
Epoch:  10


0.1590900057554245 0.14193617403507233
Epoch:  11


0.15923253297805787 0.13534167408943176
Epoch:  12


0.15763229966163636 0.13279148042201996
Epoch:  13


0.15698098301887511 0.13705393373966218
Epoch:  14


0.15723862051963805 0.13525261282920836
Epoch:  15


0.15614357471466064 0.13314179927110673
Epoch:  16


0.15526698231697084 0.13018011152744294
Epoch:  17


0.15484199643135071 0.13366137444972992
Epoch:  18


0.15473998308181763 0.12785518914461136
Epoch:  19


0.15307413399219513 0.13177944272756575
Epoch:  20


0.15487671673297881 0.13215725272893905
Epoch:  21


0.15318335235118866 0.1303679332137108
Epoch:  22


0.15140054106712342 0.13003917038440704
Epoch:  23


0.15213304102420808 0.13025219440460206
Epoch:  24


0.15164029240608215 0.13089981973171233
Epoch    24: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  25


0.15055329620838165 0.1260163202881813
Epoch:  26


0.14866327226161957 0.12454465478658676
Epoch:  27


0.149843190908432 0.12466762661933899
Epoch:  28


0.14947845458984374 0.12523933351039887
Epoch:  29


0.14985927522182466 0.12499327957630157
Epoch:  30


0.14924023866653444 0.12434121817350388
Epoch:  31


0.1492929822206497 0.12455441355705262
Epoch:  32


0.14911116302013397 0.12475568652153016
Epoch:  33


0.1484413594007492 0.12453830093145371
Epoch:  34


0.14756779193878175 0.12520836889743805
Epoch:  35


0.14760852694511414 0.12408793568611146
Epoch:  36


0.14708390951156616 0.1246408462524414
Epoch:  37


0.14879509449005127 0.12426949888467789
Epoch:  38


0.14781892597675322 0.12436014860868454
Epoch:  39


0.14780661702156067 0.12414888143539429
Epoch:  40


0.14785466969013214 0.12415147870779038
Epoch:  41


0.148026961684227 0.12417543828487396
Epoch    41: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  42


0.14834353268146516 0.12424341291189193
Epoch:  43


0.1475753492116928 0.12420083433389664
Epoch:  44


0.14862977385520934 0.12425009459257126
Epoch:  45


0.14606613397598267 0.12403706014156342
Epoch:  46


0.14745236933231354 0.12368759363889695
Epoch:  47


0.1460601681470871 0.12377889454364777
Epoch:  48


0.14654750168323516 0.12348983436822891
Epoch:  49


0.147212153673172 0.12372321337461471
Epoch:  50


0.1469333827495575 0.12381159067153931
Epoch:  51


0.14649743974208831 0.12399354130029679
Epoch:  52


0.14729553520679473 0.1238672375679016
Epoch:  53


0.14716798663139344 0.12372598797082901
Epoch:  54


0.1470862179994583 0.12369599789381028
Epoch    54: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  55


0.14806920707225799 0.12403740733861923
Epoch:  56


0.1478598165512085 0.1237386256456375
Epoch:  57


0.14629335045814515 0.1240051493048668
Epoch:  58


0.14772533476352692 0.12369554042816162
Epoch:  59


0.1468737852573395 0.12390898019075394
Epoch:  60


0.1455996471643448 0.12375659495592117
Epoch    60: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  61


0.146803577542305 0.12406717091798783
Epoch:  62


0.14747348368167879 0.12402300983667373
Epoch:  63


0.14687131464481354 0.12403426766395569
Epoch:  64


0.14794410288333892 0.12387604862451554
Epoch:  65


0.14748318910598754 0.12382148206233978
Epoch:  66


0.14684872329235077 0.12371810674667358
Epoch    66: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  67


0.1479356873035431 0.12392646223306655
Epoch:  68


0.1477985042333603 0.12369044125080109
Epoch:  69


0.1475512945652008 0.1237664669752121
Epoch:  70


0.14542557716369628 0.12401213347911835
Epoch:  71


0.14679715931415557 0.12372201085090637
Epoch:  72


0.1471080219745636 0.12376251071691513
Epoch:  73
