In [1]:
# Parameters
# `until_x` is an index into the MobileNetV2 feature blocks: when the model is
# built, every block at a position greater than `until_x` is re-initialised
# instead of keeping its pretrained weights. The value is also embedded in the
# checkpoint file name written by the training loop.
until_x = 14


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, n_annotations=3, n_frames=635,
                 spec_dir='../../data/logmelspec'):
    """Build model inputs, targets and loss masks from an annotation table.

    Every audio file is expected to contribute (at least) ``n_annotations``
    rows to ``df``; the k-th row of a file (in table order) is treated as its
    k-th annotation slot.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table whose index becomes a column named ``audio_filename``
        after ``reset_index()``. Contains both the "known" label columns and
        the "unknown" label columns listed in ``unknown_to_known``.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be ignored when that unknown label is set (> 0.5).
    n_annotations : int, optional
        Number of annotation slots stacked along the last axis of ``y`` and
        ``y_mask`` (default 3, the original hard-coded value).
    n_frames : int, optional
        Number of leading frames kept from every (transposed) spectrogram
        (default 635). Files with fewer frames produce arrays of different
        shapes and make the stacking fail.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per file.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the label must not contribute to the
        loss, 1 elsewhere.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the annotations of each file 1, 2, 3, ... in table order.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Separate the "unknown" indicator columns from the actual targets.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero out the known labels that an
    # annotator explicitly could not decide between.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation slot first so that `.loc[[k]]` selects slot k for all
    # files, with files in the same (sorted) order for every slot.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in slots], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in slots], axis=2)

    # Load the spectrograms in the same file order as the rows of `y`.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)  # add the channel axis

    return X, y, y_mask


# Single shared RandomErasing instance (constructed with its default
# arguments); AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with per-annotation targets and masks.

    Parameters
    ----------
    X : torch.Tensor
        Inputs; item ``idx`` is ``X[idx, ...]`` and is indexed as a 3-D tensor
        (channel, frames, bins) when augmentation is enabled.
    y : torch.Tensor
        Targets, indexed along the first axis like ``X``.
    weights : torch.Tensor
        Loss masks/weights, indexed along the first axis like ``X``.
    transform : callable, optional
        Augmentation pipeline called as ``transform(image=<ndarray>)`` and
        returning a dict with an ``'image'`` tensor. When falsy (e.g. ``None``)
        samples are returned untouched.

    Notes
    -----
    The augmentation path also uses the module-level ``random_erasing``
    callable.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for position ``idx``."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation so that values lie in [0, 1] for the
            # image-based augmentations
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would otherwise become 0 / 0 = NaN.
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along the frame axis (dim 1)
            # by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they take a NumPy image)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the (channel, frames, bins) layout
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code -- only open metadata
# files that come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
is_unknown_fine = taxonomy_df.fine_id == 'X'
is_known_fine = taxonomy_df.fine_id != 'X'

# For every "unknown" fine label (fine_id == 'X'), collect the known fine
# labels that share its coarse class.
unknown_fine = taxonomy_df.loc[is_unknown_fine, ['fine', 'coarse']]
known_fine = taxonomy_df.loc[is_known_fine, ['fine', 'coarse']]
unknown_known_pairs = pd.merge(unknown_fine, known_fine, on='coarse', how='inner')
unknown_known_pairs = unknown_known_pairs.drop(columns='coarse')
unknown_to_known = (
    unknown_known_pairs
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())

known_labels = taxonomy_df.loc[is_known_fine].fine.tolist()

# Coarse and fine annotations side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point:
# any row whose labels sum to exactly 37 is zeroed out in both splits
for labels_df in (train_df, valid_df):
    labels_df.loc[labels_df.sum(axis=1) == 37, :] = 0

In [6]:
# Build inputs, targets and loss masks for the training and validation splits.
(train_X, train_y, train_y_mask), (valid_X, valid_y, valid_y_mask) = (
    prepare_data(split_df, unknown_to_known)
    for split_df in (train_df, valid_df)
)

In [7]:
# Channel wise normalization
# Statistics are computed on the training set only (over all files and
# frames, per each of the 128 channels) and then applied to both splits.
channel_means, channel_stds = (
    statistic(train_X.reshape(-1, 128), axis=0).reshape(1, 1, 1, -1)
    for statistic in (np.mean, np.std)
)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
augmentation_steps = [
    # small random shift / zoom / rotation
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library defaults
    GridDistortion(),
    # hand the result back as a torch tensor
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
train_tensors = [torch.Tensor(array) for array in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(array) for array in (valid_X, valid_y, valid_y_mask)]

# Only the training set is augmented.
train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the training set; the training loop
# zips them together to form mixup pairs.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Set `prefer_cuda` to False to force CPU execution even when a GPU exists.
prefer_cuda = True
# Fall back to the CPU when no CUDA device is available instead of failing
# later at the first `.to(device)` call.
cuda = prefer_cuda and torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it offers ``reset_parameters()``.

    Objects without that method are left untouched. Intended to be used with
    ``nn.Module.apply`` so that every sub-module gets visited.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel inputs.

    Pipeline: a 1x1-convolution stem that maps 1 channel to 3, the
    ``features`` part of a pretrained torchvision MobileNetV2, a global max
    over the last two axes, and a two-layer fully connected head that emits
    ``num_classes`` logits.

    Feature blocks at positions greater than the module-level ``until_x`` are
    re-initialised with ``weight_reset`` right after construction.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, kernel_size=1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, kernel_size=1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # classification head on top of the 1280 pooled feature channels
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Reset after ith layer of mv2
        blocks_to_reset = (
            block
            for position, block in enumerate(self.mv2.features.children())
            if position > until_x
        )
        for block in blocks_to_reset:
            block.apply(weight_reset)

    def forward(self, x):
        """Return the raw logits for a batch of single-channel inputs."""
        feature_maps = self.mv2.features(self.bw2col(x))
        # global max pooling: reduce the last axis, then the new last axis
        pooled = feature_maps.max(dim=-1)[0].max(dim=-1)[0]
        return self.final(pooled)

In [13]:
# Instantiate the model (31 output logits) and move it to the chosen device
model = Task5Model(num_classes=31)
model = model.to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Element-wise loss (no reduction) so it can be masked before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# The learning rate is reduced once the monitored value stops improving for
# more than `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    patience=5,
    verbose=True,
)

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs allowed without a new best validation loss
mixup_alpha = 1               # both parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked BCE-with-logits, averaged over the total mask weight.

    `outputs` holds one logit per (sample, class); `labels` and `masks` carry
    an extra trailing axis with one entry per annotation slot. The same
    logits are scored against every slot, each element-wise loss is weighted
    by its mask, and the grand total is divided by `masks.sum()`.
    """
    total = 0
    for slot in range(labels.shape[2]):
        slot_loss = criterion(outputs, labels[:, :, slot]) * masks[:, :, slot]
        total = total + slot_loss.sum()
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup: blend every sample of batch_a with the sample at the same
        # position of the independently shuffled batch_b
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * batch_a[0]) + ((1 - lam) * batch_b[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * batch_a[1]) + ((1 - lam) * batch_b[1])
        masks = (lam * batch_a[2]) + ((1 - lam) * batch_b[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Report before the early-stopping check so that the final epoch's
    # losses are shown as well.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6084191541414004 0.38207134178706575
Epoch:  1


0.2772210900847976 0.17203808895179204
Epoch:  2


0.1772285450149227 0.16090747926916396
Epoch:  3


0.16698614085042798 0.1609639333827155
Epoch:  4


0.16263078878054749 0.14152585927929198
Epoch:  5


0.16061560205511144 0.1361191485609327
Epoch:  6


0.15733463458112767 0.13379088044166565
Epoch:  7


0.15680152580544754 0.13321742521865026
Epoch:  8


0.15557644214179064 0.13224486580916814
Epoch:  9


0.15278659479038134 0.13079657405614853
Epoch:  10


0.15427752201621597 0.13164112078292028
Epoch:  11


0.15248457483343175 0.13085389882326126
Epoch:  12


0.15322860592120402 0.13062263280153275
Epoch:  13


0.15223665374356346 0.12684680202177592
Epoch:  14


0.15277338752875458 0.13139716748680388
Epoch:  15


0.15159938222653158 0.1321664090667452
Epoch:  16


0.1500879139513583 0.13229716888495854
Epoch:  17


0.15057561848614667 0.12836874489273345
Epoch:  18


0.14987331628799438 0.1258450129202434
Epoch:  19


0.14823306855317708 0.12645865231752396
Epoch:  20


0.1492911440295142 0.126420120043414
Epoch:  21


0.15023227640100428 0.1284375467470714
Epoch:  22


0.1480976737834312 0.13049522361585073
Epoch:  23


0.14844063085478706 0.12660463260752813
Epoch:  24


0.14744392238758705 0.12417479817356382
Epoch:  25


0.1480390409360061 0.12897398535694396
Epoch:  26


0.14761688379017082 0.1291752510837146
Epoch:  27


0.14847125676838127 0.12428468146494456
Epoch:  28


0.14681248366832733 0.12576171117169516
Epoch:  29


0.14608656715702367 0.12587774332080567
Epoch:  30


0.14632939446616816 0.12888272000210627
Epoch    30: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  31


0.14468418786654602 0.12156542497021812
Epoch:  32


0.14401951190587636 0.12115043933902468
Epoch:  33


0.14350011904497403 0.12143230544669288
Epoch:  34


0.14288908202905912 0.12111245202166694
Epoch:  35


0.14346274773816806 0.1208742665393012
Epoch:  36


0.14274875537769213 0.12128343539578575
Epoch:  37


0.142856739662789 0.1210804432630539
Epoch:  38


0.14240506814943776 0.12120158757482256
Epoch:  39


0.1429748124367482 0.12093208410910197
Epoch:  40


0.14084790163748973 0.12027878952877862
Epoch:  41


0.1417277044541127 0.12079762773854393
Epoch:  42


0.14051028522285255 0.12099377704518181
Epoch:  43


0.14142766997620865 0.12082735449075699
Epoch:  44


0.14080247484348915 0.12107184742178236
Epoch:  45


0.14099726161441287 0.1207554702247892
Epoch:  46


0.14048681508850408 0.12110035121440887
Epoch    46: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  47


0.14118000502521927 0.12079245809997831
Epoch:  48


0.1416834149811719 0.12079010690961565
Epoch:  49


0.1418595579830376 0.1209907957485744
Epoch:  50


0.14114040700165 0.12045259454420634
Epoch:  51


0.14113078487885966 0.12064717497144427
Epoch:  52


0.14108822394061732 0.12064726118530546
Epoch    52: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  53


0.14144445633566058 0.1206645518541336
Epoch:  54


0.14176849457057747 0.12089870125055313
Epoch:  55


0.14041608008178505 0.1206850813967841
Epoch:  56


0.14105641922435244 0.12091606536081859
Epoch:  57


0.14038674090359662 0.12073253840208054
Epoch:  58


0.14100000423354073 0.12062089038746697
Epoch    58: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  59


0.14045682187015945 0.12074166962078639
Epoch:  60


0.14065580553299672 0.12073905340262822
Epoch:  61


0.14065223167071472 0.12080727091857366
Epoch:  62


0.14097364287118655 0.12077506418739047
Epoch:  63


0.1418079580809619 0.12076971999236516
Epoch:  64


0.1413408533946888 0.12073533236980438
Epoch    64: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  65
