In [1]:
# Parameters
# Index into `mv2.features` consumed by Task5Model: feature blocks whose
# index is greater than this value get their weights re-initialised. The
# value is also embedded in the saved checkpoint's file name.
until_x = 18


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build the model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must contain an
        ``audio_filename`` column; every other column is a label column.
        Each file is expected to have at least ``n_annotations`` rows.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be masked out whenever that unknown column is > 0.5.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotations : int, optional
        Number of annotation rows per file stacked on the last axis.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the label must be ignored, 1 elsewhere.

    Notes
    -----
    The caller's ``df`` is not modified: ``reset_index`` returns a copy and
    all later steps rebind local names.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the annotation rows of each file 1, 2, 3, ... in row order.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" helper columns away from the real targets.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Mask starts as all ones; known labels covered by a positive "unknown"
    # column are zeroed for that annotation row.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) and sort so that every per-slno slice
    # lists the files in the same order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slnos = range(1, n_annotations + 1)
    y = np.concatenate(
        [df.loc[[k], :].values[..., np.newaxis] for k in slnos], axis=2)
    y_mask = np.concatenate(
        [y_mask.loc[[k], :].values[..., np.newaxis] for k in slnos], axis=2)

    # Load the arrays in the same file order as the first annotation slice.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :],
            axis=0)
        for name in filenames])
    X = np.expand_dims(X, axis=1)  # add the single channel axis

    return X, y, y_mask


# Shared RandomErasing instance (project-local module, constructed with its
# default settings); AudioDataset.__getitem__ applies it to every sample that
# goes through the augmentation path.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When a
    truthy ``transform`` is supplied the sample is augmented on the fly;
    otherwise it is returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        # Converter used to hand the sample to the image-based transform.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        spec = self.X[idx, ...]
        target = self.y[idx, ...]
        weight = self.weights[idx, ...]

        # No augmentation configured: hand back the stored sample as is.
        if not self.transform:
            return spec, target, weight

        # Rescale to [0, 1] and remember the range so it can be restored.
        lowest = spec.min()
        highest = spec.max()
        span = highest - lowest
        spec = (spec - lowest) / span

        # Cyclic shift along axis 1 by a random offset: the element at
        # position `offset` becomes the first one.
        offset = int(np.random.randint(spec.shape[1]))
        spec = torch.roll(spec, shifts=-offset, dims=1)

        # Run the albumentations pipeline on the PIL-converted sample.
        image = np.array(self.pil(spec))
        spec = self.transform(image=image)['image']
        spec = spec[None, :, :].permute(0, 2, 1)

        # Random erasing works on a detached copy.
        spec = random_erasing(spec.clone().detach())

        # Undo the min-max scaling.
        spec = (spec * span) + lowest

        return spec, target, weight

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code stored in the file; only use
# it on a metadata file you trust.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# Pair every "unknown" fine label (fine_id 'X') with the known fine labels
# that share its coarse class, then collect them as {unknown: [known, ...]}.
label_pairs = pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
label_pairs = label_pairs.drop(columns='coarse')
unknown_to_known = label_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()

known_labels = taxonomy.loc[taxonomy.fine_id != 'X'].fine.tolist()

# Coarse and fine annotations side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction: zero out every row whose columns sum to exactly 37
train_sum_is_37 = train_df.sum(axis=1) == 37
train_df.loc[train_sum_is_37, :] = 0

valid_sum_is_37 = valid_df.sum(axis=1) == 37
valid_df.loc[valid_sum_is_37, :] = 0

In [6]:
# Build numpy arrays for both splits. prepare_data returns the inputs with a
# singleton channel axis inserted at position 1, and targets / masks with the
# annotation sets stacked on the last axis.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean and standard deviation are computed per position of the last axis,
# using the training set only, and the same statistics are applied to the
# validation set. The channel count is read from the data instead of being
# hard-coded.
# NOTE: this cell overwrites train_X / valid_X, so running it twice
# normalises the data twice; re-run the prepare_data cell first.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# This pipeline is passed only to the training dataset; the validation
# dataset is built with transform=None. AudioDataset calls it as
# `transform(image=...)` and reads the 'image' entry of the result.
albumentations_transform = Compose([
    # Small random shift / scale / rotation (limits as given below;
    # rotate_limit is presumably in degrees — confirm in albumentations docs).
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # Grid distortion with the library's default settings.
    GridDistortion(),
    # Final conversion step; AudioDataset treats the output as a torch tensor.
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset receives the augmentation pipeline.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the same training set; the training
# loop zips them to obtain the pairs of batches it mixes together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when PyTorch can see one, otherwise fall back to the CPU
# so the notebook still runs on machines without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` if it exposes ``reset_parameters``.

    Objects without that attribute are left untouched, which makes the
    function suitable as a callback for ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Single-channel input -> pretrained MobileNetV2 features -> linear head.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_after : int, optional
        Blocks of ``mv2.features`` whose index is strictly greater than this
        value are re-initialised via ``weight_reset``. When omitted, the
        notebook-level ``until_x`` parameter is used, which keeps
        ``Task5Model(num_classes)`` behaving as before.
    """

    def __init__(self, num_classes, reset_after=None):

        super().__init__()

        # Learnable 1 -> 3 channel adapter in front of the 3-channel backbone.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classification head on the 1280 backbone feature channels.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Fall back to the notebook parameter only when no explicit cut-off
        # was given, so the class is usable without that global.
        if reset_after is None:
            reset_after = until_x

        # Re-initialise every feature block after index `reset_after`.
        for index, block in enumerate(self.mv2.features.children()):
            if index > reset_after:
                block.apply(weight_reset)

    def forward(self, x):
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling over the last two axes.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [13]:
# Instantiate the model
# 31 output logits — presumably one per known label column in the targets
# returned by prepare_data; TODO confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Lowers the learning rate when the value passed to scheduler.step() (the
# validation loss in the training loop) stops improving for more than
# `patience` epochs; verbose=True makes it print each reduction.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the label masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1               # Beta(alpha, alpha) parameter for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked loss averaged over all annotation sets.

    `labels` and `masks` carry one annotation set per index of their last
    axis. The element-wise `criterion` is evaluated against each set,
    multiplied by the matching mask, summed, and divided by the total mask
    weight.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training -------------------------------------------------------
    model.train()
    this_epoch_train_loss = 0.0
    for batch_1, batch_2 in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches with one
        # Beta-distributed coefficient per sample
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_1[0].shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_x = lam.reshape(-1, 1, 1, 1)  # broadcasts over the 4-D inputs
        lam_y = lam.reshape(-1, 1, 1)     # broadcasts over 3-D labels / masks

        inputs = (lam_x * batch_1[0]) + ((1 - lam_x) * batch_2[0])
        labels = (lam_y * batch_1[1]) + ((1 - lam_y) * batch_2[1])
        masks = (lam_y * batch_1[2]) + ((1 - lam_y) * batch_2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation -----------------------------------------------------
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            this_epoch_valid_loss += masked_bce_loss(outputs, labels, masks).item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Log before the early-stopping check so the final epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6095534651666075 0.42638717378888813
Epoch:  1


0.27584413217531667 0.1572898072855813
Epoch:  2


0.17220639739487623 0.1643246284552983
Epoch:  3


0.16241995788909294 0.1534143409558705
Epoch:  4


0.15941878951884605 0.14372228724615915
Epoch:  5


0.1581210252400991 0.1351365659918104
Epoch:  6


0.15574538143905434 0.13726997801235744
Epoch:  7


0.1552267449127661 0.13081256300210953
Epoch:  8


0.15504650529977437 0.13275532637323653
Epoch:  9


0.1532012449728476 0.1299230850168637
Epoch:  10


0.1526603606101629 0.13696764409542084
Epoch:  11


0.15196307486778982 0.12708027554409845
Epoch:  12


0.151649520606608 0.1273114915405001
Epoch:  13


0.1514721146306476 0.12864375220877783
Epoch:  14


0.14968932842886126 0.12917815148830414
Epoch:  15


0.15039347998193792 0.13150158524513245
Epoch:  16


0.15105467470916542 0.12988991396767752
Epoch:  17


0.14915431834555962 0.1266439322914396
Epoch:  18


0.1501404695414208 0.1292763991015298
Epoch:  19


0.14924846549291867 0.13162065190928324
Epoch:  20


0.1497126088754551 0.12928856377090728
Epoch:  21


0.14831371887310132 0.12931438322578157
Epoch:  22


0.1480122587165317 0.12514532144580567
Epoch:  23


0.1493755256807482 0.12579559534788132
Epoch:  24


0.14793894500345797 0.12629973143339157
Epoch:  25


0.1463006779148772 0.126618606703622
Epoch:  26


0.14658730577778173 0.1253012791275978
Epoch:  27


0.1468738140286626 0.1272184763635908
Epoch:  28


0.1458971689681749 0.1258364947778838
Epoch    28: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  29


0.1445743534210566 0.12307483383587428
Epoch:  30


0.145421191244512 0.12227024457284383
Epoch:  31


0.14474075267443787 0.12187944991247994
Epoch:  32


0.14366637210588198 0.12146719438689095
Epoch:  33


0.1432286710352511 0.1215011402964592
Epoch:  34


0.14352181634387454 0.12123620510101318
Epoch:  35


0.14385392053707227 0.12173103115388326
Epoch:  36


0.1425648565227921 0.1215522534080914
Epoch:  37


0.14223809137537674 0.12113101035356522
Epoch:  38


0.14357005784640442 0.12109676322766713
Epoch:  39


0.14338118603100647 0.12154188539300646
Epoch:  40


0.14227985449739405 0.12134218961000443
Epoch:  41


0.14196198775961594 0.12075209085430418
Epoch:  42


0.14231571394044 0.12090556855712618
Epoch:  43


0.1403164444742976 0.1218029015830585
Epoch:  44


0.14087488039119825 0.12145199733121055
Epoch:  45


0.1424598875077995 0.12149074992963246
Epoch:  46


0.1408886357739165 0.12103209963866643
Epoch:  47


0.14121071511023753 0.12103564611503057
Epoch    47: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  48


0.14002141557835243 0.12122959537165505
Epoch:  49


0.141332904229293 0.12114547618797847
Epoch:  50


0.14112989060782097 0.12118463537522725
Epoch:  51


0.14202740949553413 0.12133735099009105
Epoch:  52


0.1406900065170752 0.12130988708564214
Epoch:  53


0.14087881832509427 0.12108986292566572
Epoch    53: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  54


0.1405415434289623 0.1212663661156382
Epoch:  55


0.1409889822070663 0.121172054537705
Epoch:  56


0.14029801415430532 0.12125657711710248
Epoch:  57


0.1411124008732873 0.12123310778822217
Epoch:  58


0.14144195737065496 0.12120584824255534
Epoch:  59


0.14048606318396492 0.12129570437329155
Epoch    59: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  60


0.1406134975117606 0.12125566814626966
Epoch:  61


0.14128079929867307 0.12115991967064994
Epoch:  62


0.14066575225946065 0.12115862220525742
Epoch:  63


0.14128590636962168 0.1213421140398298
Epoch:  64


0.1412368847711666 0.12113505282572337
Epoch:  65


0.13943517167825956 0.12117881647178105
Epoch    65: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  66
