In [1]:
# Parameters
# until_x: position within the children of MobileNetV2's `features` module.
# Task5Model re-initialises every child whose index is greater than this value,
# and the training loop embeds the value in the checkpoint filename.
until_x = 15


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, per-annotator targets and loss masks from a label table.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table. After ``reset_index()`` it must expose an
        ``audio_filename`` column, with one row per annotation of a file.
        NOTE(review): assumes every file has at least ``n_annotators`` rows and
        that all remaining columns are label columns -- confirm against caller.
    unknown_to_known : dict
        Maps the name of each "unknown" label column to the list of known label
        columns that must be masked out whenever that unknown label is set
        (value > 0.5).
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed spectrogram.
    n_annotators : int, optional
        Number of annotation slots (1..n_annotators) stacked on the last axis.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_channels)``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the entry is excluded from the loss, else 1.
    """
    # Number the annotations of each file 1, 2, 3, ... in their original order.
    df = df.reset_index()
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Separate the "unknown" columns from the labels that are actually predicted.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Mask starts at 1 everywhere; zero the known labels that an annotator left
    # ambiguous by ticking the corresponding unknown label.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation slot first so each slot can be selected as one slab,
    # with files in the same (sorted) order inside every slab.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotators + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([mask_df.loc[[slot], :].values for slot in slots], axis=2)

    # One spectrogram per file, in the same file order as the label slabs.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :], axis=0)
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance built with its default constructor arguments;
# AudioDataset.__getitem__ applies it to each sample after the albumentations
# transforms.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples, optionally augmented.

    Parameters
    ----------
    X : indexable with a ``.shape``; ``X[idx, ...]`` is one sample.
        NOTE(review): the augmentation path uses ``torch.cat`` and
        ``ToPILImage``, so it presumably expects a float tensor of shape
        ``(N, 1, time, channels)`` -- confirm against caller.
    y : indexable; ``y[idx, ...]`` is the target for sample ``idx``.
    weights : indexable; ``weights[idx, ...]`` is the loss mask/weight.
    transform : callable or None
        Called as ``transform(image=array)`` and expected to return a dict with
        an ``'image'`` entry. When falsy, samples are returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    @staticmethod
    def _value_range(sample):
        """Return ``(minimum, span)`` of ``sample`` with a zero span replaced by 1.

        A constant sample has ``max == min``; dividing by that zero span would
        fill the sample with NaNs, so the span falls back to 1 in that case.
        """
        low = sample.min()
        span = sample.max() - low
        if span == 0:
            span = span + 1
        return low, span

    def __getitem__(self, idx):
        """Return the (possibly augmented) sample with its target and weight."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation (guarded against constant samples)
            this_min, this_span = self._value_range(sample)
            sample = (sample - this_min) / this_span

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on a numpy image)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the leading singleton axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * this_span) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load executes arbitrary code from the file -- only use
# with a metadata.pkl from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# For every "unknown" fine label (fine_id == 'X'), collect the known fine labels
# that share its coarse class. The self-merge on 'coarse' yields the columns
# fine_x (unknown label) and fine_y (known label).
unknown_to_known = (
    metadata['taxonomy_df']
    .loc[lambda t: t.fine_id == 'X', ['fine', 'coarse']]
    .merge(
        metadata['taxonomy_df'].loc[lambda t: t.fine_id != 'X', ['fine', 'coarse']],
        on='coarse',
        how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())

# Names of all fine labels that are not "unknown".
known_labels = metadata['taxonomy_df'].loc[lambda t: t.fine_id != 'X', 'fine'].tolist()

# Coarse and fine annotations side by side, one label per column.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [5]:
# manual correction for one data point:
# any row whose labels sum to 37 is replaced by an all-zero row
train_df.loc[train_df.sum(axis=1).eq(37), :] = 0
valid_df.loc[valid_df.sum(axis=1).eq(37), :] = 0

In [6]:
# Build spectrogram inputs (X), per-annotator targets (y) and loss masks (y_mask)
# for the training and validation splits with the same unknown->known mapping.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean/std are computed per entry of the last axis, from the training split
# only, and then applied to both splits. The channel count is read from the
# array itself: a hard-coded width would silently produce wrong statistics
# whenever the last axis had a different size but the reshape still succeeded.
channel_means = train_X.reshape(-1, train_X.shape[-1]).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, train_X.shape[-1]).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# This pipeline is handed to the training AudioDataset only; the validation
# dataset is built with transform=None. AudioDataset calls it as
# transform(image=...) and reads the 'image' entry of the result, which it then
# treats as a tensor (hence the final ToTensor step).
# NOTE(review): the limits are passed straight to albumentations; presumably
# shift/scale are fractions and rotate_limit is in degrees -- confirm in its docs.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the same training set: the training loop
# zips them together to obtain the pairs of batches it mixes (mixup).
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is usable, otherwise fall back to the CPU so the
# notebook still runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` via its ``reset_parameters`` method, if it has one.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function safe to pass to ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel input.

    Pipeline: a small 1x1-conv stem turns the 1-channel input into 3 channels,
    the MobileNetV2 ``features`` stack processes it, the result is max-pooled
    over the two trailing axes, and a two-layer head emits ``num_classes``
    logits.

    Every child of ``mv2.features`` whose position is greater than the
    notebook-level ``until_x`` has its parameters re-initialised with
    ``weight_reset``; earlier children keep their pretrained weights.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions only
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # classification head on the 1280 pooled features
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Reset after ith layer of mv2
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch ``x``."""
        feature_maps = self.mv2.features(self.bw2col(x))
        # global max pooling: last axis first, then the remaining trailing axis
        pooled_last, _ = feature_maps.max(dim=-1)
        pooled, _ = pooled_last.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 is the number of output logits.
# NOTE(review): presumably equals the number of label columns left after
# prepare_data drops the "unknown" columns -- confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Every model parameter is optimised by Adam (AMSGrad variant).
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the epoch's validation loss in the training loop.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the label masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def _masked_bce_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss averaged over all unmasked entries.

    ``labels`` and ``masks`` carry one slice per annotator on their last axis;
    the same ``outputs`` are scored against every slice with the notebook-level
    ``criterion``, each element-wise loss is weighted by its mask, and the total
    is divided by the sum of the masks.
    """
    per_annotator = [
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2])]
    return sum(per_annotator) / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs tolerated without a new best validation loss
alpha = 1  # both shape parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # one mixing coefficient per example, shared by inputs, labels and masks
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = _masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # average of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Log before the early-stopping check so the last epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss seen so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.606626910937799 0.4100178267274584
Epoch:  1


0.27629579362031575 0.16461009638650076
Epoch:  2


0.17384500962656899 0.18002957744257791
Epoch:  3


0.1628050349048666 0.13694277618612563
Epoch:  4


0.15901173047117284 0.13702325842210225
Epoch:  5


0.15726475095426715 0.13282553851604462
Epoch:  6


0.15506442012013616 0.13461431435176305
Epoch:  7


0.1542446573038359 0.1338561890380723
Epoch:  8


0.1529688508929433 0.13551062451941626
Epoch:  9


0.15349317684366898 0.1302121803164482
Epoch:  10


0.15273847612174782 0.1271104695541518
Epoch:  11


0.15303023320597572 0.12961710670164653
Epoch:  12


0.15182827654722575 0.12961567299706594
Epoch:  13


0.1515267201372095 0.1282099751489503
Epoch:  14


0.15090525915493835 0.1283696730222021
Epoch:  15


0.149702451519064 0.12925776945693151
Epoch:  16


0.15033794214596619 0.1268181215439524
Epoch:  17


0.1507657182377738 0.1275082262498992
Epoch:  18


0.14907666597817396 0.13321167124169214
Epoch:  19


0.14851687123646606 0.12777050690991537
Epoch:  20


0.14925025363226194 0.128233545592853
Epoch:  21


0.1475624289061572 0.13101039507559367
Epoch:  22


0.1479494418646838 0.12533416705472128
Epoch:  23


0.14663170398892583 0.12412750614540917
Epoch:  24


0.14732554192478592 0.12433825326817376
Epoch:  25


0.14749713564241254 0.1282510065606662
Epoch:  26


0.1473213297289771 0.1271126280937876
Epoch:  27


0.146465362729253 0.12514101607458933
Epoch:  28


0.14773031666472153 0.1270258958850588
Epoch:  29


0.14556540750168465 0.12534313755375998
Epoch    29: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  30


0.14543059911276843 0.12114707167659487
Epoch:  31


0.1443609832911878 0.12087870389223099
Epoch:  32


0.14254481285005002 0.12088341691664287
Epoch:  33


0.14313733819368724 0.12029578856059484
Epoch:  34


0.14358276813416868 0.12003075437886375
Epoch:  35


0.14324654679040652 0.12026514112949371
Epoch:  36


0.14224827692315384 0.12008622288703918
Epoch:  37


0.1419259188948451 0.12013909859316689
Epoch:  38


0.14199652704032692 0.12043579454932894
Epoch:  39


0.14258144634800987 0.12009995856455394
Epoch:  40


0.1420634034517649 0.11988958716392517
Epoch:  41


0.14121512182661006 0.11991085005658013
Epoch:  42


0.14140393041275642 0.12008685405765261
Epoch:  43


0.14215429852137695 0.12059073895215988
Epoch:  44


0.14044104154045517 0.12022988072463445
Epoch:  45


0.14136910196897146 0.12145821217979703
Epoch:  46


0.14016284310334437 0.12098891926663262
Epoch    46: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  47


0.1400597888875652 0.12073175076927457
Epoch:  48


0.14129914987731623 0.12066229007073812
Epoch:  49


0.14036895737454697 0.12066391961915153
Epoch:  50


0.14002728260852196 0.12051504850387573
Epoch:  51


0.1404895109904779 0.12074867848839078
Epoch:  52


0.14126771507230965 0.12056306004524231
Epoch    52: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  53


0.14050935450437907 0.12040091944592339
Epoch:  54


0.14005347280888944 0.12065253513199943
Epoch:  55


0.14103932598152677 0.12052146026066371
Epoch:  56


0.14039548466334473 0.12051479092666081
Epoch:  57


0.14012932253850474 0.12057424655982427
Epoch:  58


0.14026828110218048 0.12051937516246523
Epoch    58: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  59


0.14090107059156573 0.12047347000667027
Epoch:  60


0.14014571623222247 0.120492123067379
Epoch:  61


0.14085764941331502 0.12058574387005397
Epoch:  62


0.13985228860700452 0.12046336595501218
Epoch:  63


0.14071560308739944 0.1206117325595447
Epoch:  64


0.1407886322688412 0.1204279756971768
Epoch    64: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  65
