In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Turn an annotation table into spectrogram inputs, labels and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column (i.e. the index is expected to carry that
        name) plus one column per label. The caller's frame is not modified.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known label
        columns it makes uncertain. The unknown columns are removed from the
        targets; wherever an unknown column is > 0.5 the mask of the mapped
        known columns is set to 0 for that annotation row.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per
        recording. Defaults to the location the notebook has always used.
    n_frames : int, optional
        Number of leading rows kept from each transposed spectrogram.
    n_annotations : int, optional
        Number of annotation rows stacked per recording (rows numbered
        ``1..n_annotations`` within each file).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)`` where ``n_bins`` is whatever
        the stored spectrograms provide.
    y : numpy.ndarray
        Shape ``(n_files, n_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label counts towards the loss, 0
        where it is masked out.

    Notes
    -----
    Every recording must have at least ``n_annotations`` rows and every
    spectrogram at least ``n_frames`` frames, otherwise the stacking fails
    with a shape mismatch.
    """
    df = df.reset_index()
    # Number the annotation rows of every recording 1, 2, 3, ...
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start with every label counted, then switch off the known labels that
    # an active "unknown" annotation makes ambiguous.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that .loc[[k]] selects the k-th
    # annotation of every file, with files in sorted order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in slots], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in slots], axis=2)

    # File order follows the first annotation slot, matching the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance, built with the class defaults, that
# AudioDataset.__getitem__ applies as its last augmentation step.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with optional on-the-fly augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs, indexed along the first axis. Each ``X[idx]`` must be a 3-D
        tensor; the augmentation cycles it along axis 1 (assumed to be time —
        confirm against the caller).
    y : torch.Tensor
        Targets, indexed along the first axis and returned unchanged.
    weights : torch.Tensor
        Per-sample loss weights/masks, indexed along the first axis and
        returned unchanged.
    transform : callable or None
        Albumentations-style callable invoked as ``transform(image=array)``
        and expected to return a mapping with an ``'image'`` tensor. When
        falsy, samples are returned without any augmentation.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weights)`` for index ``idx``.

        With a transform configured the sample is min-max scaled, rolled by a
        random offset along axis 1, passed through the transform and the
        module-level ``random_erasing``, and finally mapped back to its
        original value range.
        """
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would fill the
            # sample with NaN, so scale by 1 instead (the sample becomes 0).
            scale = value_range if value_range > 0 else torch.ones_like(value_range)
            sample = (sample - this_min) / scale

            # randomly cycle the file
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            # NOTE(review): the round trip through a PIL image presumably
            # quantises the values — confirm if precision matters.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the leading channel axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation (uses the true range, so a
            # constant sample comes back as the same constant)
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code; only open metadata
# files that come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy = metadata['taxonomy_df']
label_cols = ['fine', 'coarse']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', label_cols]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', label_cols]

# For every "unknown" fine label (fine_id 'X'), collect the known fine labels
# that share its coarse class.
paired = pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
paired = paired.drop(columns='coarse')
unknown_to_known = paired.groupby('fine_x')['fine_y'].apply(list).to_dict()

known_labels = taxonomy.loc[taxonomy.fine_id != 'X', 'fine'].tolist()

# Coarse and fine annotation tables side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Any annotation row whose label values add up to exactly 37 is zeroed out,
# in place, in both splits.
for _frame in (train_df, valid_df):
    _suspect_rows = _frame.sum(axis=1) == 37
    _frame.loc[_suspect_rows, :] = 0

In [5]:
# Build the spectrogram inputs, the per-annotation label arrays and the loss
# masks for each split with prepare_data.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Per-channel mean/std over the last axis (128 channels) are estimated on the
# training split only and then applied to both splits.
# Note: the cell overwrites train_X / valid_X, so running it twice normalizes
# the data a second time.
_train_flat = train_X.reshape(-1, 128)
channel_means = _train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = _train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X, valid_X = [
    (split - channel_means) / channel_stds
    for split in (train_X, valid_X)
]

In [7]:
# Define the data augmentation transformations
# Pipeline handed to the training AudioDataset: a random shift/scale/rotate
# with the limits given below, a grid distortion with default settings, and a
# final conversion to a tensor (AudioDataset reads it from the 'image' key).
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    *(torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask)),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    *(torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask)),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training data; the
# training loop zips them to obtain the batch pairs it mixes together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is actually available and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it offers ``reset_parameters``.

    Objects without a ``reset_parameters`` attribute are left untouched.
    Intended to be passed to ``Module.apply`` so it visits every submodule.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default for the notebook parameter `until_x`; it stays None unless a later
# cell assigns a value. Task5Model reads this module-level name.
until_x = None

In [12]:
# Parameters
# until_x: index of the last child of mv2.features whose weights Task5Model
# re-initialises (children 0..until_x inclusive). Also used in the filename
# of the saved model.
until_x = 12


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor with a small classification head.

    The single-channel input is first mapped to three channels by a pair of
    1x1 convolutions so it can be fed to the MobileNetV2 feature stack. The
    resulting feature map is max-pooled over its last two axes and passed to
    a two-layer head that outputs ``num_classes`` logits.

    The module-level ``until_x`` controls how much of the pretrained
    backbone is kept: children ``0..until_x`` (inclusive) of
    ``mv2.features`` are re-initialised with ``weight_reset``. When
    ``until_x`` is ``None`` nothing is reset and all pretrained weights are
    kept.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    """

    def __init__(self, num_classes):

        super().__init__()

        # 1 -> 10 -> 3 channels with 1x1 convolutions
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # The head expects 1280 features per sample from the pooled backbone
        # output.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2
        # `until_x` may still be None (its default); comparing an int with
        # None raises TypeError, so skip the reset entirely in that case.
        if until_x is not None:
            for i, x in enumerate(self.mv2.features.children()):
                if i <= until_x:
                    x.apply(weight_reset)

    def forward(self, x):
        """Return the ``num_classes`` logits for a batch of inputs ``x``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # global max pooling: reduce the last axis, then the new last axis
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits — presumably the number of label columns in train_y;
# verify if the label set changes.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad option over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The training loop feeds the epoch's validation loss to scheduler.step();
# the learning rate is lowered when that loss plateaus (patience of 5).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the masks before summing and normalising.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
epochs = 100
alpha = 1  # mixup coefficients are drawn from Beta(alpha, alpha)
early_stopping_patience = 25  # epochs without a new best validation loss
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked loss averaged over all annotation sets.

    ``outputs`` holds one logit per sample and label; ``labels`` and
    ``masks`` carry an extra trailing axis with one slice per annotation
    set. The element-wise ``criterion`` is evaluated against every slice,
    weighted by the matching mask slice, summed, and divided by the total
    mask weight.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # one mixing coefficient per sample, shared by inputs, labels, masks
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])
        lam = torch.Tensor(mixup_vals)

        lam_inputs = lam.reshape(-1, 1, 1, 1)
        inputs = (lam_inputs * i1[0]) + ((1 - lam_inputs) * i2[0])

        lam_targets = lam.reshape(-1, 1, 1)
        labels = (lam_targets * i1[1]) + ((1 - lam_targets) * i2[1])
        masks = (lam_targets * i1[2]) + ((1 - lam_targets) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # mean of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the final epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if this_epoch_valid_loss < lowest_val_loss:
        # new best model: checkpoint it and restart the patience counter
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6400399191959484 0.4988540325845991
Epoch:  1


0.33435830715540293 0.20811938600880758
Epoch:  2


0.1919557077659143 0.17347814994198935
Epoch:  3


0.17965288339434443 0.16353532884802138
Epoch:  4


0.177974895045564 0.16221206103052413
Epoch:  5


0.17570950131158572 0.15882674711091177
Epoch:  6


0.1732342923815186 0.17061900241034372
Epoch:  7


0.17231996236620722 0.15369994725499833
Epoch:  8


0.17057432435654304 0.1747113232101713
Epoch:  9


0.16913859304544088 0.1546351249728884
Epoch:  10


0.16766639657922694 0.15489583143166133
Epoch:  11


0.16879175885303602 0.14714493921824864
Epoch:  12


0.16636468832557266 0.14968080818653107
Epoch:  13


0.16495671022582697 0.15199168452194758
Epoch:  14


0.16490710788481944 0.1476514009492738
Epoch:  15


0.1639959892711124 0.14593505433627538
Epoch:  16


0.16397735557040652 0.1447466633149556
Epoch:  17


0.1638087300029961 0.1454915063721793
Epoch:  18


0.16223044008822055 0.14134327002934047
Epoch:  19


0.16127085001082034 0.1429819698844637
Epoch:  20


0.16133681584048915 0.14054709992238454
Epoch:  21


0.16013259782984451 0.13927354450736726
Epoch:  22


0.16057559928378543 0.14024493736880167
Epoch:  23


0.15971765244329297 0.13625964735235488
Epoch:  24


0.15946452400168856 0.13948559015989304
Epoch:  25


0.158910917269217 0.13496648626668112
Epoch:  26


0.15880906863792524 0.136321202984878
Epoch:  27


0.1585013556319314 0.13739783210413797
Epoch:  28


0.15548883136865255 0.13452257748161042
Epoch:  29


0.1571103237770699 0.13746962802750723
Epoch:  30


0.15601606183760874 0.1398708830986704
Epoch:  31


0.15793482999543887 0.1407231997166361
Epoch:  32


0.15845900210174355 0.13493934380156652
Epoch:  33


0.15706841768445196 0.1333855954664094
Epoch:  34


0.15684003403057922 0.13242376382861817
Epoch:  35


0.15602838590338425 0.13633940794638225
Epoch:  36


0.15479352868892052 0.13542686722108296
Epoch:  37


0.15433493095475273 0.13340726494789124
Epoch:  38


0.155533949668343 0.1306648158601352
Epoch:  39


0.15419110655784607 0.13242126256227493
Epoch:  40


0.15556970359505834 0.12998962828091212
Epoch:  41


0.1539073739502881 0.13306476495095662
Epoch:  42


0.15302888726865924 0.13374593534639903
Epoch:  43


0.15460896773918256 0.13011912895100458
Epoch:  44


0.15481416800537626 0.13017821950571878
Epoch:  45


0.15308171470423002 0.13280844794852392
Epoch:  46


0.1541155826401066 0.1335736640862056
Epoch    46: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  47


0.15390609406136177 0.12772014737129211
Epoch:  48


0.1498970558514466 0.12644503159182413
Epoch:  49


0.15069425106048584 0.12681523604052408
Epoch:  50


0.15118814602091507 0.12770424676792963
Epoch:  51


0.1513599922528138 0.12628080802304403
Epoch:  52


0.1505438693471857 0.1271487729890006
Epoch:  53


0.14986992768339208 0.12701507870640075
Epoch:  54


0.1512832387879088 0.12494807690382004
Epoch:  55


0.15114269425740112 0.12626386540276663
Epoch:  56


0.1493558243319795 0.1267411517245429
Epoch:  57


0.14976036991621997 0.12608984857797623
Epoch:  58


0.15001568278750857 0.12670844899756567
Epoch:  59


0.1499505542420052 0.12631441227027349
Epoch:  60


0.15081586145065926 0.1271493679710797
Epoch    60: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  61


0.1495667269101014 0.1266181202871459
Epoch:  62


0.15080911686291565 0.12609891487019403
Epoch:  63


0.14923150837421417 0.1265073333467756
Epoch:  64


0.14959667381402608 0.1257944405078888
Epoch:  65


0.14913762985049067 0.1261381847517831
Epoch:  66


0.1491097932731783 0.1255440658756665
Epoch    66: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  67


0.15007747911118172 0.1262112142784255
Epoch:  68


0.14985356983300802 0.12698944445167268
Epoch:  69


0.14868057257420308 0.1262152705873762
Epoch:  70


0.1486437743580019 0.12592740889106477
Epoch:  71


0.14808855910558957 0.12630340031215123
Epoch:  72


0.149746461897283 0.12623182045561926
Epoch    72: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  73


0.14868490438203555 0.12650392310959951
Epoch:  74


0.15084243102653608 0.12671072568212235
Epoch:  75


0.1484090226727563 0.12627967872789927
Epoch:  76


0.14915232682550275 0.12646427963461196
Epoch:  77


0.1497059013392474 0.12563845940998622
Epoch:  78


0.14876856352831866 0.12611964025667735
Epoch    78: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  79
