In [1]:
# Parameters
# until_x: index into the MobileNetV2 feature layers. Task5Model re-initialises
# every feature layer whose index is greater than this value, and the value is
# also embedded in the checkpoint filename written by the training loop.
until_x = 0


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation and one column per label. The index is reset
        and the rows are then grouped on ``audio_filename``, so the frame is
        expected to be indexed by ``audio_filename``. The caller's frame is
        not modified.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known label
        columns it covers. The unknown columns are removed from the targets;
        wherever an annotation has an unknown column above 0.5, the mask of
        the corresponding known columns is set to 0 for that annotation.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotations : int, optional
        Number of annotation rows used per file (rows 1..n_annotations in
        their original order). Every file is expected to have at least this
        many rows; otherwise the per-annotation slices cannot be stacked.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target counts, 0 where it is masked.
    """
    df = df.reset_index()
    # Number the annotations of every file 1, 2, 3, ... in row order.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from an all-ones mask and zero out the known labels that an
    # "unknown" annotation makes unreliable.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that `.loc[[k]]` selects the k-th
    # annotation of every file, with files in sorted order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.concatenate(
        [df.loc[[k], :].values[..., np.newaxis] for k in annotation_ids],
        axis=2)
    y_mask = np.concatenate(
        [mask_df.loc[[k], :].values[..., np.newaxis] for k in annotation_ids],
        axis=2)

    # One spectrogram per file, in the same file order as y / y_mask.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (from the project-local RandomErasing module made
# importable by the sys.path tweak above); AudioDataset.__getitem__ applies it to
# every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with per-sample targets and loss weights.

    Without a ``transform`` the stored sample is returned untouched. With a
    ``transform`` every access produces a freshly augmented copy: the sample is
    min-max scaled, cyclically shifted along axis 1, passed through the
    albumentations pipeline and random erasing, and finally mapped back to its
    original value range.
    """

    def __init__(self, X, y, weights, transform=None):
        """Store the tensors and the optional augmentation pipeline.

        Parameters
        ----------
        X : torch.Tensor
            Samples stacked along the first axis. Each sample is indexed with
            three axes in ``__getitem__`` — presumably (1, time, mel); TODO
            confirm against the caller.
        y : torch.Tensor
            Targets, indexed by sample along the first axis.
        weights : torch.Tensor
            Loss weights / masks, indexed by sample along the first axis.
        transform : callable or None
            Called as ``transform(image=array)`` and expected to return a
            mapping with an ``'image'`` entry. ``None`` disables augmentation.
        """
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for index ``idx``."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would otherwise be scaled by 0/0 and turn
                # into NaNs; a unit range keeps it constant through the
                # scale / un-scale round trip.
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            offset = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, offset:, :],
                sample[:, :offset, :]],
                dim=1)

            # apply albumentations transforms (they operate on an image array)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the leading channel axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code; only open metadata
# files that come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the "unknown" placeholder rows (fine_id == 'X') and
# the rows of the known fine labels.
taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# Pair every unknown label with the known fine labels sharing its coarse
# class, then collect those known labels per unknown label.
label_pairs = (
    pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = {
    unknown_label: list(known_group)
    for unknown_label, known_group in label_pairs.groupby('fine_x')['fine_y']}
known_labels = taxonomy.loc[taxonomy.fine_id != 'X'].fine.tolist()

# Coarse and fine annotations side by side, one frame per split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any annotation row whose label values sum to 37 is zeroed out entirely,
# in both the training and the validation frame.
for split_df in (train_df, valid_df):
    all_labels_set = split_df.sum(axis=1) == 37
    split_df.loc[all_labels_set, :] = 0

In [6]:
# Build the inputs (X), targets (y) and loss masks (y_mask) for both splits.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# The statistics are computed per index of the last axis, on the training set
# only, and reused for the validation set. The channel count is read from the
# data instead of being hard-coded.
# NOTE: this cell overwrites train_X / valid_X, so running it twice normalises
# already-normalised data; re-run the prepare_data cell first.
n_channels = train_X.shape[-1]
train_flat = train_X.reshape(-1, n_channels)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Passed to the training AudioDataset only; the validation dataset is built
# with transform=None and therefore never augmented.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
batch_size = 64

# Training samples are augmented on access; validation samples are not.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain the pairs of batches that mixup blends.
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is actually usable and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it offers ``reset_parameters()``.

    Objects without that method are left untouched, which makes the function
    suitable for ``module.apply(weight_reset)``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Multi-label classifier built on the MobileNetV2 feature extractor.

    A small 1x1-convolution stem turns the single-channel input into three
    channels, the MobileNetV2 feature layers process it, the result is
    max-pooled over its last two axes, and a two-layer head produces one
    logit per class.
    """

    def __init__(self, num_classes, reset_after=None):
        """Build the network.

        Parameters
        ----------
        num_classes : int
            Number of output logits.
        reset_after : int or None, optional
            Feature layers of MobileNetV2 with an index greater than this
            value are re-initialised; layers up to and including it keep
            their pretrained weights. ``None`` (the default) uses the
            notebook-level ``until_x`` parameter, as before.
        """
        super().__init__()

        if reset_after is None:
            reset_after = until_x

        # 1 -> 10 -> 3 channels with 1x1 convolutions, so the pretrained
        # three-channel backbone can consume single-channel input.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset every feature layer after the `reset_after`-th one.
        for index, feature_layer in enumerate(self.mv2.features.children()):
            if index > reset_after:
                feature_layer.apply(weight_reset)

    def forward(self, x):
        """Return the class logits for a batch of single-channel inputs."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis, then the new last axis.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [13]:
# Instantiate the model
# 31 output logits — presumably one per known label column in train_y; TODO confirm.
model = Task5Model(31).to(device)

Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/checkpoints/mobilenet_v2-b0353104.pth


  0%|                                                                                                                                                                       | 0/14212972 [00:00<?, ?it/s]

  0%|▍                                                                                                                                                      | 40960/14212972 [00:00<00:48, 293951.85it/s]

  2%|██▌                                                                                                                                                   | 245760/14212972 [00:00<00:36, 384987.83it/s]

  7%|███████████▏                                                                                                                                         | 1064960/14212972 [00:00<00:24, 534073.63it/s]

 28%|█████████████████████████████████████████▊                                                                                                           | 3989504/14212972 [00:00<00:13, 755533.99it/s]

 46%|███████████████████████████████████████████████████████████████████▉                                                                                | 6529024/14212972 [00:00<00:07, 1065279.72it/s]

 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 11157504/14212972 [00:00<00:02, 1506960.64it/s]

 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 13885440/14212972 [00:00<00:00, 2086538.96it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14212972/14212972 [00:00<00:00, 15582241.14it/s]




In [14]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped with the validation loss at the end of every epoch of the training loop.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them by the label masks before reducing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss before stopping
mixup_alpha = 1  # both shape parameters of the Beta distribution the mixup weights are drawn from
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked loss of `outputs` against every annotation set in `labels`.

    `labels` and `masks` carry one slice per annotation set along their last
    axis. The element-wise criterion is evaluated against each slice, weighted
    by the matching mask slice, summed, and divided by the total mask weight.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup the inputs: blend two independently shuffled batches with one
        # random weight per sample
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * batch_a[0]) + ((1 - lam) * batch_b[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * batch_a[1]) + ((1 - lam) * batch_b[1])
        masks = (lam * batch_a[2]) + ((1 - lam) * batch_b[2])
        # mixup ends

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            this_epoch_valid_loss += masked_bce_loss(outputs, labels, masks).item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Keep the checkpoint with the lowest validation loss seen so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Report before the early-stopping check so that the losses of the final
    # epoch are printed as well.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.5831629110349191 63.84307207380022
Epoch:  1


0.2571403307689203 551.6229901994977
Epoch:  2


0.19404492346016136 353.31925310407365
Epoch:  3


0.1895472302630141 266.8137534005301
Epoch:  4


0.1886818723098652 41151.01060267857
Epoch:  5


0.18876365189616745 4193.09619140625
Epoch:  6


0.18825671640602318 2299.299577985491
Epoch     6: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  7


0.18799095862620585 1717.720982142857
Epoch:  8


0.18753168752064575 3062.3514578683034
Epoch:  9


0.18810408743652138 1113.091517857143
Epoch:  10


0.1866807152290602 1105.180428641183
Epoch:  11


0.18770975196683728 773.5536411830357
Epoch:  12


0.18787200023999084 1386.5740269252233
Epoch    12: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  13


0.18904922419303172 106.80447496686664
Epoch:  14


0.18793386220932007 303.14847237723217
Epoch:  15


0.18707078252289747 45.77316066196987
Epoch:  16


0.18731404397938703 192.94881984165735
Epoch:  17


0.18784142668182785 7.713644436427525
Epoch:  18


0.18778454492221008 15.889357839311872
Epoch:  19


0.18769186332419113 120.08527592250279
Epoch:  20


0.1871579740498517 10.113347462245397
Epoch:  21


0.18762539568785075 277.98585292271207
Epoch:  22


0.18750106442619013 101.30237688337054
Epoch:  23


0.18798459462217382 7.561912400381906
Epoch:  24


0.18770814827970556 9.0887268611363
Epoch:  25


0.18735570118233963 103.61642783028739
Epoch:  26


0.1882789195389361 120.17806352887835
Epoch:  27


0.18778250547679695 16.744652475629533
Epoch:  28


0.18739348811072273 86.24304635184151
Epoch:  29


0.18752734161712029 223.6339329310826
Epoch    29: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  30


0.18649192557141586 8.657527038029261
Epoch:  31


0.18758485123917862 7.024881635393415
Epoch:  32


0.1882485727200637 7.476493631090436
Epoch:  33


0.18741167598479502 17.434424672807967
Epoch:  34


0.18677735852228627 14.893981661115374
Epoch:  35


0.1875725553647892 6.251382214682443
Epoch:  36


0.1877885033955445 7.887150151388986
Epoch:  37


0.18769620278397123 12.089326858520508
Epoch:  38


0.1886890816527444 5.356546810695103
Epoch:  39


0.18743911425809603 7.997414248330252
Epoch:  40


0.18716467916965485 5.718304565974644
Epoch:  41


0.1873655709865931 12.644070761544365
Epoch:  42


0.1873717142923458 11.91541508265904
Epoch:  43


0.18747612592336294 45.18550109863281
Epoch:  44


0.1873292741743294 2.8290841238839284
Epoch:  45


0.18763128043832006 1.3999504021235876
Epoch:  46


0.18652665091527476 28.055421556745255
Epoch:  47


0.1876954730298068 8.935466630118233
Epoch:  48


0.18804578805291974 19.82874025617327
Epoch:  49


0.1885330495802132 8.269960948399135
Epoch:  50


0.1882461228886166 10.099183900015694
Epoch:  51


0.18741791594672846 7.861943926130023
Epoch    51: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  52


0.18757518481563878 12.786739349365234
Epoch:  53


0.1883284143499426 10.049558639526367
Epoch:  54


0.18787430226802826 11.229004042489189
Epoch:  55


0.18879303859697805 12.709126608712333
Epoch:  56


0.18757228794935588 12.533792223249163
Epoch:  57


0.18832942279609474 12.517405918666295
Epoch    57: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  58


0.1880762750232542 12.462032181876046
Epoch:  59


0.1868653667939676 12.402908188956124
Epoch:  60


0.18842988118932052 12.459996359688896
Epoch:  61


0.18780677906564763 12.512757846287318
Epoch:  62


0.18801925029303576 12.456884792872838
Epoch:  63


0.18793345383695653 12.470732280186244
Epoch:  64


0.18760944137702118 12.490167890276227
Epoch:  65


0.18731670403802717 12.450633866446358
Epoch:  66


0.18827450275421143 12.438731057303292
Epoch:  67


0.18713881035108823 12.50133582523891
Epoch:  68


0.18737198412418365 12.50813661302839
Epoch:  69


0.18830085404821345 12.494600704738072
Epoch:  70
