In [1]:
# Parameters
# until_x: index into the children of the MobileNetV2 ``features`` module.
# Task5Model re-initialises every child whose index is greater than this value,
# while children up to and including it keep their pretrained weights.
# The value is also used as the suffix of the saved checkpoint filename.
until_x = 15


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec',
                 n_frames=635,
                 n_annotations=3):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. The index (or one of its levels) must be named
        ``audio_filename``; the columns are label values and must include
        every key and every value of ``unknown_to_known``.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known label
        columns it makes unreliable.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed spectrogram.
    n_annotations : int, optional
        Number of annotations used per file (the first ``n_annotations`` in
        order of appearance). Extra annotations are ignored.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``; files are in sorted
        ``audio_filename`` order.
    y : numpy.ndarray
        Shape ``(n_files, n_known_columns, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the annotation's "unknown" column exceeds
        0.5 for the mapped known columns, 1 elsewhere.

    Raises
    ------
    ValueError
        If any file has fewer than ``n_annotations`` annotations.
    """
    import os

    df = df.reset_index()
    # Number the annotations of each file 1, 2, 3, ... in order of appearance.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Every file must contribute one row to each annotation slice, otherwise
    # the per-annotation arrays below cannot be stacked.
    per_file = df.groupby(level='audio_filename').size()
    if (per_file < n_annotations).any():
        short = per_file[per_file < n_annotations].index.tolist()
        raise ValueError(
            'expected at least {} annotations per file, too few for: {}'
            .format(n_annotations, short))

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known columns wherever the
    # corresponding "unknown" column was marked by that annotation.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that ``.loc[[k]]`` selects the k-th
    # annotation of every file, sorted by filename.
    df = df.swaplevel(0, 1).sort_index()
    y_mask = y_mask.swaplevel(0, 1).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack(
        [y_mask.loc[[k], :].values for k in annotation_ids], axis=2)

    filenames = (df.loc[[1], :]
                 .index.get_level_values('audio_filename').tolist())
    X = np.stack([
        np.load(os.path.join(logmelspec_dir, '{}.npy'.format(name)))
        .T[:n_frames, :]
        for name in filenames])
    # Add a singleton channel axis.
    X = X[:, np.newaxis]

    return X, y, y_mask


# Module-level RandomErasing augmenter, built with its default arguments.
# AudioDataset.__getitem__ applies it to each sample when a transform is set.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (input, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs, indexed along the first axis. The augmentation path cycles
        each sample along its second axis and expects three axes per sample.
    y : torch.Tensor
        Targets, indexed along the first axis.
    weights : torch.Tensor
        Per-target weights (used as loss masks by the training loop),
        indexed along the first axis.
    transform : callable or None
        Callable invoked as ``transform(image=array)`` that returns a mapping
        with an ``'image'`` entry. When falsy, samples are returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Tensor -> PIL image converter used before the array-based transform.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting if configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would otherwise be divided by zero and
                # come out as NaN; a unit range keeps it finite and makes the
                # revert step below restore the original constant.
                value_range = value_range + 1
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (via a PIL image -> numpy array)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # Restore a leading axis and swap the last two axes.
            # NOTE(review): presumably undoes an axis swap done by the
            # transform's tensor conversion -- confirm against albumentations.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (on a detached copy)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle can execute arbitrary code on load; only open a metadata file
# that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into "unknown" fine labels (fine_id == 'X') and known ones.
taxonomy_df = metadata['taxonomy_df']
unknown_fine = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# Pair each unknown fine label with every known fine label that shares its
# coarse category, then collect the known labels per unknown label.
fine_pairs = (
    pd.merge(unknown_fine, known_fine, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = (
    fine_pairs.groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_fine.fine.tolist()

# Place coarse and fine annotations side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis='columns', sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis='columns', sort=True)

In [5]:
# manual correction for one data point
# Zero out every row whose label values sum to exactly 37, in both splits.
for label_df in (train_df, valid_df):
    sums_to_37 = label_df.sum(axis=1) == 37
    label_df.loc[sums_to_37, :] = 0

In [6]:
# Build the model inputs (X), per-annotation targets (y) and loss masks
# (y_mask) for the training and validation splits; this loads one spectrogram
# file from disk per audio file.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed per entry of the last axis, over the training set
# only, and then applied to both splits. The channel count is read from the
# data instead of being hard-coded.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
# A zero-variance channel would divide by zero; leave such a channel unscaled.
channel_stds[channel_stds == 0] = 1
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# This pipeline is passed to the training AudioDataset only; the validation
# dataset is built with transform=None.
albumentations_transform = Compose([
    # Shift/scale/rotate with limits of 0.1, 0.1 and 0.5 respectively.
    # NOTE(review): units of rotate_limit are presumably degrees -- confirm
    # against the albumentations documentation.
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # Grid distortion with the library's default settings.
    GridDistortion(),
    # Final conversion step; AudioDataset.__getitem__ calls .permute() on the
    # resulting 'image' entry.
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

batch_size = 64
val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them and mixes each pair of batches (mixup).
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook still runs on a machine without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` via its ``reset_parameters`` method, if it has one.

    Objects without a ``reset_parameters`` attribute are left untouched.
    Intended to be passed to ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Single-channel classifier built on the MobileNetV2 feature extractor.

    A small stack of 1x1 convolutions maps the single input channel to three
    channels, the pretrained MobileNetV2 ``features`` module processes the
    result, the feature map is max-pooled over its last two axes and a
    two-layer head produces ``num_classes`` logits.

    Reads the module-level ``until_x``: every child of ``mv2.features`` whose
    index is greater than ``until_x`` is re-initialised with ``weight_reset``.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels through 1x1 convolutions.
        to_three_channels = [
            nn.BatchNorm2d(1),
            nn.Conv2d(in_channels=1, out_channels=10, kernel_size=1, padding=0),
            nn.ReLU(),
            nn.Conv2d(in_channels=10, out_channels=3, kernel_size=1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*to_three_channels)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classification head on the 1280 pooled features.
        head = [
            nn.Linear(in_features=1280, out_features=512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(in_features=512, out_features=num_classes),
        ]
        self.final = nn.Sequential(*head)

        # Reset after ith layer of mv2
        for block_idx, block in enumerate(self.mv2.features.children()):
            if block_idx <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the ``num_classes`` logits for a batch of inputs."""
        colour = self.bw2col(x)
        feature_map = self.mv2.features(colour)
        # Max over the last axis, then over the new last axis.
        pooled, _ = feature_map.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 output logits: this must equal the number of label columns in the
# targets, because the training loop compares the outputs element-wise with
# labels[:, :, k].
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam with amsgrad=True over all model parameters, initial learning rate 1e-3.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Plateau scheduler with patience=5; the training loop feeds it the epoch's
# validation loss through scheduler.step(). verbose=True prints a message on
# every learning-rate reduction.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the per-element losses so the training loop can
# multiply them by the annotation masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
# Training configuration and bookkeeping
epochs = 100
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked loss summed over all annotation sets, divided by the mask total.

    ``outputs`` is compared with each slice ``labels[:, :, k]`` using the
    module-level element-wise ``criterion``; every element is weighted by
    ``masks[:, :, k]`` and the grand total is divided by ``masks.sum()``.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    alpha = 1
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One Beta(alpha, alpha) mixing coefficient per sample, applied to the
        # inputs, the labels and the masks of the paired batches.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no gradients, no mixup) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the final epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping: give up after 25 epochs without a new best.
    if epochs_without_new_lowest >= 25:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6073506168417029 0.4456362085683005
Epoch:  1


0.2722289167545937 0.1487909502216748
Epoch:  2


0.1708798992472726 0.14682404377630778
Epoch:  3


0.16067354300537626 0.13881509325334004
Epoch:  4


0.15628709946129773 0.13417382112571172
Epoch:  5


0.15643948276300687 0.1338666815842901
Epoch:  6


0.15435360734527176 0.15042200897421157
Epoch:  7


0.15293022466672435 0.13252710976770946
Epoch:  8


0.1523188002206184 0.13093432358333043
Epoch:  9


0.15139296087058815 0.12970217955963953
Epoch:  10


0.15252438791700312 0.13000632396766118
Epoch:  11


0.15156618082845533 0.13062877633741923
Epoch:  12


0.1511036077061215 0.12828280350991658
Epoch:  13


0.15045211564850164 0.12883196026086807
Epoch:  14


0.1494287941101435 0.1297958248427936
Epoch:  15


0.14960433381634788 0.13651760135378158
Epoch:  16


0.14920968179767197 0.13159769560609544
Epoch:  17


0.1499263289812449 0.1274798246366637
Epoch:  18


0.14804872427437757 0.130231679550239
Epoch:  19


0.14815998198212804 0.12511513914380754
Epoch:  20


0.1487860087607358 0.12375355511903763
Epoch:  21


0.1480176315919773 0.12461153524262565
Epoch:  22


0.14705024941547498 0.12685990972178324
Epoch:  23


0.14730681841437882 0.1268826276063919
Epoch:  24


0.14629582335820068 0.13157956727913447
Epoch:  25


0.1480923709837166 0.12571800287280763
Epoch:  26


0.14611815157774333 0.12751388549804688
Epoch    26: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  27


0.14494608342647552 0.12291825988462993
Epoch:  28


0.14280546839172775 0.12354085913726262
Epoch:  29


0.1424709779990686 0.12238263125930514
Epoch:  30


0.14307115609581406 0.1221563730921064
Epoch:  31


0.1433705457964459 0.12117101039205279
Epoch:  32


0.14197086562981476 0.1213083490729332
Epoch:  33


0.14360469619970065 0.12205118898834501
Epoch:  34


0.14200811369999036 0.12201308459043503
Epoch:  35


0.14258732767523946 0.12128328957727977
Epoch:  36


0.14177114516496658 0.12149501804794584
Epoch:  37


0.14339207153062564 0.12066124379634857
Epoch:  38


0.1419062288226308 0.12102322067533221
Epoch:  39


0.1414928025490529 0.12108811523233141
Epoch:  40


0.14195745015466535 0.12159678765705653
Epoch:  41


0.1417716379101212 0.1216290390917233
Epoch:  42


0.14150855307643478 0.1215340473822185
Epoch:  43


0.14162709745200905 0.12124091386795044
Epoch    43: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  44


0.1413759432128958 0.12099750978606087
Epoch:  45


0.1407564089104936 0.12093799135514668
Epoch:  46


0.1397570419955898 0.12097471313817161
Epoch:  47


0.14146596072493373 0.12084368935653142
Epoch:  48


0.14006605905455513 0.12091734153883797
Epoch:  49


0.13971882978001157 0.12093215329306466
Epoch    49: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  50


0.14017265008108035 0.12067944875785283
Epoch:  51


0.13954424374812358 0.12057360687426158
Epoch:  52


0.1404846397203368 0.12060870549508504
Epoch:  53


0.1397544830232053 0.12081725256783622
Epoch:  54


0.1411762539599393 0.12060541659593582
Epoch:  55


0.13952705747372396 0.12068448215723038
Epoch:  56


0.14119072258472443 0.12066612924848284
Epoch:  57


0.1413946896791458 0.12090806769473213
Epoch    57: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  58


0.1397427247988211 0.12078579940966197
Epoch:  59


0.14045445782107277 0.12060704082250595
Epoch:  60


0.14085785440496496 0.12073238087551934
Epoch:  61


0.13976495451218374 0.12063578516244888
Epoch:  62


0.14079292440736615 0.12088683460439954
Epoch:  63


0.14122923121259018 0.12083776188748223
Epoch    63: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  64


0.14029140730161924 0.1207184014575822
Epoch:  65


0.14093553496373667 0.12085067267928805
Epoch:  66


0.141101290648048 0.12069055650915418
Epoch:  67


0.14034498503079285 0.12097835327897753
Epoch:  68


0.14034560644948804 0.12055192568472453
Epoch:  69


0.14190374516152046 0.12078644548143659
Epoch:  70


0.13979872497352394 0.12090194118874413
Epoch:  71


0.14160622092517647 0.12078164943626948
Epoch:  72


0.14022875677894903 0.12077152941908155
Epoch:  73


0.1401396248791669 0.12061706504651479
Epoch:  74


0.1408635728262566 0.12081065667527062
Epoch:  75


0.14107284996960615 0.12093974330595561
Epoch:  76


0.1404042026481113 0.12097127416304179
Epoch:  77


0.1401322798149006 0.12064943036862782
Epoch:  78


0.1404283288362864 0.12079387903213501
Epoch:  79


0.1401933882687543 0.12083523507629122
Epoch:  80


0.14081234545321078 0.1205826923251152
Epoch:  81


0.1407706278401452 0.12066869331257683
Epoch:  82


0.13927931237865138 0.12081999544586454
Epoch:  83


0.139883825505102 0.12111566854374749
Epoch:  84


0.14058715105056763 0.12058396850313459
Epoch:  85


0.13978112267481313 0.12089459385190691
Epoch:  86


0.14042344649095792 0.12067969675574984
Epoch:  87


0.14169928753698194 0.12078177609613963
Epoch:  88


0.14031678760373914 0.12064911744424275
Epoch:  89


0.14157993024265444 0.1206268583025251
Epoch:  90


0.1407703735135697 0.12062708075557436
Epoch:  91


0.14044584092256185 0.12090959080627986
Epoch:  92


0.140082425362355 0.12073737702199391
Epoch:  93
