In [1]:
# Parameters
# until_x: children of the MobileNetV2 `features` stack whose index is greater
# than this value are re-initialised in Task5Model; the value is also embedded
# in the checkpoint filename written by the training loop.
until_x = 5


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 spec_dir='../../data/logmelspec', n_frames=635, n_annotations=3):
    """Build model inputs, per-annotation targets and loss masks from a label table.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table with one row per annotation. After ``reset_index()`` it must
        expose an ``audio_filename`` column (i.e. the index is expected to be
        named ``audio_filename``). Every file needs at least ``n_annotations``
        rows; additional rows of a file are ignored.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be masked out whenever that unknown label is > 0.5.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed spectrogram.
    n_annotations : int, optional
        Number of annotation rows per file stacked along the last axis.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``; files sorted by name.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where a label must not contribute to the loss,
        1 elsewhere.
    """
    # Number the annotations of each file 1, 2, 3, ... in order of appearance.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split off the "unknown" columns; they only drive the mask, not the targets.
    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from an all-ones mask and zero the known labels covered by an
    # "unknown" annotation flagged in the same row.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so each annotation slot can be selected
    # as one block whose rows are sorted by file name.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    # Load the spectrograms in the same file order as the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing augmentation (constructed with its default arguments);
# AudioDataset.__getitem__ applies it to every sample that goes through the
# augmentation branch.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram samples with targets and loss weights.

    Parameters
    ----------
    X : torch.Tensor
        Samples, indexed along the first dimension. The augmentation branch
        concatenates along dim 1 of a single sample and converts it to a PIL
        image, so each sample is expected to be (1, frames, bins) — TODO confirm.
    y : indexable
        Targets, indexed along the first dimension.
    weights : indexable
        Per-target weights (loss masks), indexed along the first dimension.
    transform : callable or None
        Albumentations-style callable invoked as ``transform(image=array)`` and
        returning a dict with an ``'image'`` entry. When falsy, samples are
        returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first dimension of X)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for ``idx``, augmenting if configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would produce
            # NaNs that are then fed to the image conversion. Divide by one
            # instead — the sample simply maps to all zeros.
            if value_range > 0:
                safe_range = value_range
            else:
                safe_range = torch.ones_like(value_range)
            sample = (sample - this_min) / safe_range

            # randomly cycle the file: rotate along dim 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on a numpy image)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading dimension and swap the last two axes of the
            # transform's output
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level `random_erasing` instance)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation using the true range, so a
            # constant sample maps back to its original constant value
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]


def weight_reset(layer):
    """Re-initialise ``layer`` if it exposes ``reset_parameters``.

    Intended for ``Module.apply``: objects without a ``reset_parameters``
    attribute are left untouched.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()


class Task5Model(nn.Module):
    """Single-channel spectrogram classifier on top of a MobileNetV2 feature stack.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_after : int or None, optional
        Children of ``mv2.features`` with an index greater than this value are
        re-initialised; lower-indexed children keep their pretrained weights.
        ``None`` (the default) falls back to the notebook-level ``until_x``
        parameter, which is the original behaviour.
    """

    def __init__(self, num_classes, reset_after=None):

        super().__init__()

        # Batch-normalise the single input channel, then map it to the three
        # channels fed into the MobileNetV2 features using 1x1 convolutions.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Reset after ith layer of mv2
        if reset_after is None:
            reset_after = until_x
        self._reset_features_after(reset_after)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

    def _reset_features_after(self, last_kept):
        """Re-initialise every child of ``mv2.features`` with index > ``last_kept``."""
        for index, block in enumerate(self.mv2.features.children()):
            if index > last_kept:
                block.apply(weight_reset)

    def forward(self, x):
        """Return one logit per class for each item of the batch ``x``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last dimension, then the new last one.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only open a trusted metadata file.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
is_unknown = taxonomy_df.fine_id == 'X'
is_known = taxonomy_df.fine_id != 'X'

# Pair every "unknown" fine label (fine_id == 'X') with the known fine labels
# that share its coarse class: {unknown fine label: [known fine labels]}.
unknown_known_pairs = pd.merge(
    taxonomy_df.loc[is_unknown, ['fine', 'coarse']],
    taxonomy_df.loc[is_known, ['fine', 'coarse']],
    on='coarse', how='inner')
unknown_known_pairs = unknown_known_pairs.drop(columns='coarse')
unknown_to_known = (
    unknown_known_pairs
    .groupby('fine_x')['fine_y']
    .apply(lambda names: list(names))
    .to_dict())
known_labels = taxonomy_df.loc[is_known].fine.tolist()

# Put the coarse and fine annotation columns side by side for each split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']],
                     axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']],
                     axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose label values sum to 37 is zeroed out in both splits.
# NOTE(review): 37 presumably equals the total number of label columns
# (i.e. every label marked present) — confirm against the metadata.
for split_df in (train_df, valid_df):
    row_total_is_37 = split_df.sum(axis=1) == 37
    split_df.loc[row_total_is_37, :] = 0

In [6]:
# Build the spectrogram inputs, per-annotation targets and loss masks for
# both splits (see prepare_data for the layout of the returned arrays).
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed on the training inputs only and reused for the
# validation inputs. Every position along the last axis (128 bins) gets its
# own mean / std, reshaped so they broadcast over the other three axes.
train_rows = train_X.reshape(-1, 128)
broadcast_shape = (1, 1, 1, -1)
channel_means = train_rows.mean(axis=0).reshape(broadcast_shape)
channel_stds = train_rows.std(axis=0).reshape(broadcast_shape)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# The pipeline is handed to the training AudioDataset, which calls it as
# transform(image=...) on the min-max scaled sample and reads the 'image'
# entry of the result. Steps run in order: small random shift / scale /
# rotation, grid distortion, then conversion of the image to a tensor.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=96, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain the pairs of batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook still runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Instantiate the model
# NOTE(review): 31 is the number of output logits; the training loop compares
# the outputs with labels[:, :, k], so it has to match the label dimension of
# train_y / valid_y — confirm it equals train_y.shape[1].
model = Task5Model(31).to(device)

In [12]:
# Define optimizer, scheduler and loss criteria
# The scheduler is stepped with the validation loss once per epoch in the
# training loop. reduction='none' keeps the element-wise losses so the loop
# can multiply them by the label masks before averaging.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [13]:
def masked_bce_loss(outputs, labels, masks):
    """Masked loss averaged over all annotation sets.

    `outputs` holds one logit per label; `labels` and `masks` carry one slice
    per annotation set along their last dimension. The element-wise
    `criterion` loss of every slice is multiplied by its mask, summed, and the
    total is divided by the sum of all mask entries.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
early_stopping_patience = 15  # epochs allowed without a new best validation loss
mixup_alpha = 1               # parameter of the Beta(alpha, alpha) mixing distribution
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0.0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches with one random
        # coefficient per sample; inputs, labels and masks share the coefficient
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.view(-1, 1, 1, 1)
        lam_targets = lam.view(-1, 1, 1)

        inputs = (lam_inputs * batch_a[0]) + ((1 - lam_inputs) * batch_b[0])
        labels = (lam_targets * batch_a[1]) + ((1 - lam_targets) * batch_b[1])
        masks = (lam_targets * batch_a[2]) + ((1 - lam_targets) * batch_b[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6479861688613892 0.5857261538505554
Epoch:  1


0.42225800395011903 0.30971195101737975
Epoch:  2


0.22189926207065583 0.20552217662334443
Epoch:  3


0.18100889265537262 0.17918800413608552
Epoch:  4


0.17407806277275084 0.15321333706378937
Epoch:  5


0.17081126987934112 0.1506169080734253
Epoch:  6


0.16828177809715272 0.1635696053504944
Epoch:  7


0.16479565262794493 0.15263510048389434
Epoch:  8


0.1634282499551773 0.1428681045770645
Epoch:  9


0.16189670383930208 0.14047684669494628
Epoch:  10


0.15836522459983826 0.13590880930423738
Epoch:  11


0.15790004253387452 0.1361502915620804
Epoch:  12


0.1587691968679428 0.13694075793027877
Epoch:  13


0.1577102029323578 0.13235060274600982
Epoch:  14


0.15719619929790496 0.13419106006622314
Epoch:  15


0.15614086806774138 0.1317552551627159
Epoch:  16


0.15653065383434295 0.1297072246670723
Epoch:  17


0.1553116649389267 0.1304088443517685
Epoch:  18


0.1549883556365967 0.13735425770282744
Epoch:  19


0.1533546632528305 0.13147223144769668
Epoch:  20


0.15406088709831237 0.13012093007564546
Epoch:  21


0.15230069279670716 0.13234166353940963
Epoch:  22


0.15368835091590882 0.13060699999332429
Epoch    22: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  23


0.1517753791809082 0.1261948138475418
Epoch:  24


0.150630801320076 0.1262853115797043
Epoch:  25


0.150511434674263 0.12623063921928407
Epoch:  26


0.15133836805820466 0.12593544125556946
Epoch:  27


0.15002407550811767 0.12559937089681625
Epoch:  28


0.1496964681148529 0.125702603161335
Epoch:  29


0.14813349545001983 0.12568160593509675
Epoch:  30


0.14968105852603913 0.12571418434381484
Epoch:  31


0.15012841582298278 0.1254195675253868
Epoch:  32


0.14894905388355256 0.12511327266693115
Epoch:  33


0.14988261163234712 0.12516999691724778
Epoch:  34


0.14920146465301515 0.12503256946802138
Epoch:  35


0.15030622661113738 0.12517281025648117
Epoch:  36


0.1488976114988327 0.12456297129392624
Epoch:  37


0.1508578610420227 0.12511838972568512
Epoch:  38


0.1493328845500946 0.12436908334493638
Epoch:  39


0.14889744341373443 0.12489637732505798
Epoch:  40


0.14759678721427918 0.12467708289623261
Epoch:  41


0.14795769155025482 0.12462005764245987
Epoch:  42


0.1493579226732254 0.12442369908094406
Epoch:  43


0.1495848649740219 0.12471742033958436
Epoch:  44


0.14848875761032104 0.12466071993112564
Epoch    44: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  45


0.14874105632305146 0.12403883934020996
Epoch:  46


0.14910492777824402 0.12440517842769623
Epoch:  47


0.14866424322128297 0.12430122494697571
Epoch:  48


0.1484961348772049 0.12409253567457199
Epoch:  49


0.1472505486011505 0.12423990517854691
Epoch:  50


0.14741704404354095 0.12428531348705292
Epoch:  51


0.1483214670419693 0.12391575425863266
Epoch:  52


0.14877070665359496 0.12413939833641052
Epoch:  53


0.14815268397331238 0.12404310256242752
Epoch:  54


0.14750967025756836 0.1240580290555954
Epoch:  55


0.14913398146629334 0.12426597476005555
Epoch:  56


0.14807770371437073 0.12422892898321151
Epoch:  57


0.14836763739585876 0.12399979084730148
Epoch    57: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  58


0.148561315536499 0.12409494668245316
Epoch:  59


0.14827013731002808 0.12388352006673813
Epoch:  60


0.1491957587003708 0.12411468178033828
Epoch:  61


0.14814292907714843 0.12420123666524888
Epoch:  62


0.1467887908220291 0.1241816148161888
Epoch:  63


0.14826898217201234 0.12395634055137635
Epoch:  64


0.14724128425121308 0.12413421422243118
Epoch:  65


0.14803315699100494 0.12401920855045319
Epoch    65: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  66


0.14758426427841187 0.124123977124691
Epoch:  67


0.14756097853183747 0.12410131245851516
Epoch:  68


0.14819107234477996 0.12387593388557434
Epoch:  69


0.1485196226835251 0.1241452693939209
Epoch:  70


0.14795732855796814 0.12394188493490219
Epoch:  71


0.1489190125465393 0.12397350668907166
Epoch    71: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  72


0.1479814875125885 0.12403663843870164
Epoch:  73


0.1479694139957428 0.1240045502781868
Epoch:  74


0.14927838683128358 0.12400919497013092
Epoch:  75


0.14802710294723512 0.12390625774860382
Epoch:  76


0.1490772271156311 0.12399114072322845
Epoch:  77


0.14811659693717957 0.12419783771038055
Epoch:  78


0.14664296686649322 0.12425036430358886
Epoch:  79


0.14806326985359192 0.1239882230758667
Epoch:  80


0.14892515122890473 0.12392936199903488
Epoch:  81


0.14804914474487305 0.12421486675739288
Epoch:  82


0.1481146490573883 0.1242736741900444
Epoch:  83
