In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation, indexed by ``audio_filename``. Every column
        is a label score. Each file is expected to contribute at least
        ``n_annotations`` rows; this is not checked here.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be masked out whenever that unknown label is set (> 0.5).
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each (transposed) array.
    n_annotations : int, optional
        Number of annotation slots (1..n) stacked along the last axis.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_features)``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the entry must be ignored, 1 elsewhere.
    """
    # reset_index returns a new frame, so the caller's df is never modified
    df = df.reset_index()
    # number the annotations of every file 1, 2, 3, ... in order of appearance
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # split the "unknown" indicator columns away from the real targets
    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # mask starts at 1 everywhere; zero the known labels an annotator could
    # not tell apart (signalled by the matching unknown column being set)
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # put the annotation slot first so one slot can be selected at a time;
    # sorting keeps the file order identical across slots
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    # file order is taken from slot 1, matching the row order of y / y_mask
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    # add the singleton channel axis
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared augmentation instance, built with RandomErasing's default arguments;
# AudioDataset.__getitem__ looks this name up as a module-level global.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples.

    When ``transform`` is given, every sample is augmented on access:
    min-max scaled, cyclically shifted along dim 1, run through the
    ``transform`` callable and the module-level ``random_erasing``, then
    scaled back to its original value range. Without a transform the stored
    sample is returned untouched.

    Parameters
    ----------
    X : tensor indexed along dim 0, one entry per example. The augmentation
        branch uses ``torch.cat`` on it, so it must be a torch tensor there;
        presumably shaped (n, 1, time, bins) — TODO confirm against caller.
    y : targets, indexed along dim 0 like ``X``.
    weights : per-target weights / masks, indexed along dim 0 like ``X``.
    transform : optional callable invoked as ``transform(image=array)`` and
        returning a mapping with an ``'image'`` entry.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # tensor -> PIL converter used before handing the sample to transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of examples (size of X along dim 0)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting if configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would give
            # 0/0 = NaN, which is then cast to uint8 by the PIL conversion.
            # Divide by 1 instead so the scaled sample is simply all zeros.
            if value_range == 0:
                divisor = torch.ones_like(value_range)
            else:
                divisor = value_range
            sample = (sample - this_min) / divisor

            # randomly cycle the file: rotate along dim 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the leading channel axis and swap the two last axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (on a detached copy)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation with the true range, so a
            # constant sample maps back to its constant value
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code; only open metadata
# files that come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the "unknown" entries (fine_id == 'X') and the rest
taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# For every unknown label, collect the known labels sharing its coarse class.
# The merge suffixes the two 'fine' columns as fine_x (unknown) / fine_y (known).
paired = pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
paired = paired.drop(columns='coarse')
unknown_to_known = paired.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_rows.fine.tolist()

# Coarse and fine annotations side by side, one column per label
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [4]:
# manual correction for one data point
# Rows whose label values add up to 37 are blanked out entirely.
# NOTE(review): 37 is presumably the total number of label columns, i.e. a
# row with every label set — confirm against the metadata.
for frame in (train_df, valid_df):
    suspicious_rows = frame.sum(axis=1) == 37
    frame.loc[suspicious_rows, :] = 0

In [5]:
# Turn both annotation tables into (inputs, targets, loss masks) arrays;
# the same unknown -> known label mapping is applied to each split.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics are taken per position along the last axis, from the training
# split only, and then applied to both splits.
# NOTE: this cell overwrites train_X / valid_X, so running it twice
# normalises already-normalised data; re-run the prepare_data cell first.
n_channels = train_X.shape[-1]  # derived from the data instead of a fixed 128
train_flat = train_X.reshape(-1, n_channels)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Applied in order: small shift / scale / rotation, grid distortion, and
# finally conversion of the image to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them so that each batch can be mixed with a second random batch.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when one is visible to PyTorch, otherwise fall back to the
# CPU so the notebook still runs on machines without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it knows how to.

    Objects without a ``reset_parameters`` method are left untouched, which
    makes this safe to pass to ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default for the index of the last MobileNetV2 feature block to re-initialise
# (read by Task5Model); it is overridden by the "Parameters" cell that follows.
until_x = None

In [12]:
# Parameters
# Overrides the default above: feature blocks 0..7 of MobileNetV2 are reset.
# NOTE(review): looks like an injected parameters cell (e.g. papermill) — confirm.
until_x = 7


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    Pipeline: a learned 1 -> 3 channel adapter (``bw2col``), the MobileNetV2
    feature stack, a global max over both spatial axes, and a small
    fully-connected head that emits one logit per class.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_until : int, optional
        Index of the last block of ``mv2.features`` whose weights are
        re-initialised (blocks ``0..reset_until`` inclusive). When omitted,
        the notebook-level ``until_x`` is used, as before. If that is also
        ``None`` no block is reset and all pretrained weights are kept.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # 1x1 convolutions that map the single input channel to the three
        # channels the MobileNetV2 feature stack takes as input
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # head applied to the 1280 pooled feature channels
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2. Fall back to the notebook global so
        # existing ``Task5Model(n)`` calls behave exactly as before.
        if reset_until is None:
            reset_until = until_x
        if reset_until is not None:
            for i, x in enumerate(self.mv2.features.children()):
                if i <= reset_until:
                    x.apply(weight_reset)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for input ``x``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # global max pooling: reduce the last axis, then the new last axis
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits — presumably one per label column kept by prepare_data;
# TODO confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch and lowers
# the learning rate after `patience` epochs without improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps one loss value per element so that the training loop
# can multiply by the mask before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
epochs = 100
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce(outputs, labels, masks):
    """Masked loss averaged over all annotation slots.

    ``criterion`` is evaluated once per slot on the last axis of ``labels``
    against the same ``outputs``; each element is weighted by the matching
    entry of ``masks`` and the grand total is divided by ``masks.sum()``.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---------------- training ----------------
    model.train()
    this_epoch_train_loss = 0
    # two independently shuffled passes over the training set supply the
    # pairs of batches that are mixed together
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # one mixing coefficient per example, drawn from Beta(alpha, alpha)
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        # reshaped to broadcast over the 4-D inputs ...
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        # ... and over the 3-D labels / masks
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.enable_grad():
            outputs = model(inputs)
            loss = masked_bce(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---------------- validation ----------------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # mean of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # checkpoint on every new best validation loss
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # report before the early-stopping check so the final epoch is logged too
    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

    # early stopping: give up after 25 epochs without a new best
    if epochs_without_new_lowest >= 25:
        break

Epoch:  0


0.6360199660868258 0.4975491336413792
Epoch:  1


0.328781316811974 0.21143984155995504
Epoch:  2


0.1931641750239037 0.17868795352322714
Epoch:  3


0.18207090408415408 0.1655644987310682
Epoch:  4


0.17808007952329274 0.16784940021378653
Epoch:  5


0.1766533618037765 0.16346548071929387
Epoch:  6


0.17431821774792028 0.15769017381327494
Epoch:  7


0.17372713701145068 0.17338807880878448
Epoch:  8


0.17186338071887558 0.166696412222726
Epoch:  9


0.1710748362380105 0.15464307793549129
Epoch:  10


0.17050989133280678 0.14970492039408004
Epoch:  11


0.16876157353053223 0.17300360543387278
Epoch:  12


0.16743112214513728 0.14964588305779866
Epoch:  13


0.1663495845891334 0.16036679702145712
Epoch:  14


0.1650481562356691 0.14770772201674326
Epoch:  15


0.163218915865228 0.14126968809536525
Epoch:  16


0.16360014797867956 0.14219065116984503
Epoch:  17


0.16094503249671008 0.14383813525949204
Epoch:  18


0.1617466549615602 0.14760314779622213
Epoch:  19


0.16014985458270922 0.13641983909266336
Epoch:  20


0.16014955051847407 0.13770101432289397
Epoch:  21


0.1593871048173389 0.14240558658327376
Epoch:  22


0.16018171004346898 0.13322967078004563
Epoch:  23


0.15958288754965808 0.1414336466363498
Epoch:  24


0.158597548668449 0.1518199826989855
Epoch:  25


0.15889796534100095 0.13413252362183162
Epoch:  26


0.15844014850822655 0.13462591809885843
Epoch:  27


0.1581646582564792 0.13598636644227163
Epoch:  28


0.15733922776338216 0.13999594748020172
Epoch    28: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  29


0.15667577893347354 0.1321083019886698
Epoch:  30


0.15446636845936645 0.1308144543852125
Epoch:  31


0.15539278653827873 0.13019965482609613
Epoch:  32


0.15426216697370684 0.12967796623706818
Epoch:  33


0.15376288343120265 0.12959440584693635
Epoch:  34


0.15404453309806618 0.1288865421499525
Epoch:  35


0.15332358471445134 0.12903713860682078
Epoch:  36


0.15418716419387507 0.12961381993123464
Epoch:  37


0.15360288724705978 0.1289640643766948
Epoch:  38


0.15367070686172796 0.128567673265934
Epoch:  39


0.15274138345911698 0.1287125125527382
Epoch:  40


0.1539593121489963 0.12889518163033895
Epoch:  41


0.15388076490647085 0.12839365324803761
Epoch:  42


0.15355887203603177 0.12877806169646128
Epoch:  43


0.15296633823497877 0.1284793359892709
Epoch:  44


0.15306951790242582 0.12831053350652968
Epoch:  45


0.15096567933623856 0.1281433967607362
Epoch:  46


0.15244822002745964 0.12856780737638474
Epoch:  47


0.15197070184591654 0.12756279855966568
Epoch:  48


0.15213232185389544 0.12759528628417424
Epoch:  49


0.15216607983047897 0.12792430179459707
Epoch:  50


0.15193608403205872 0.12835162026541574
Epoch:  51


0.15264784444022822 0.1281109462891306
Epoch:  52


0.15224986744893565 0.12819719846759522
Epoch:  53


0.15131483891525785 0.12801543729645865
Epoch    53: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  54


0.15158392327862816 0.12797276888574874
Epoch:  55


0.15116583414979884 0.1275476547224181
Epoch:  56


0.15290335988676226 0.12730553320475987
Epoch:  57


0.15163609466037234 0.12723576490368163
Epoch:  58


0.15156403949131836 0.12708330154418945
Epoch:  59


0.15216848495844248 0.1272608201418604
Epoch:  60


0.15070281117349058 0.12721180383648192
Epoch:  61


0.15039985526252436 0.12709931071315492
Epoch:  62


0.15061467040229487 0.1274793062891279
Epoch:  63


0.15041057926577492 0.12717447642769134
Epoch:  64


0.15091324013632698 0.12711606387581145
Epoch    64: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  65


0.15089042162573016 0.12755175679922104
Epoch:  66


0.15103783599428228 0.12734200060367584
Epoch:  67


0.15207086100771622 0.1275863562311445
Epoch:  68


0.15103581185276443 0.1269850464803832
Epoch:  69


0.15159091353416443 0.12743792789322989
Epoch:  70


0.1504133851141543 0.12701417612177984
Epoch:  71


0.1513437085055016 0.12716179766825267
Epoch:  72


0.15128864911762444 0.1270884679896491
Epoch:  73


0.15239670268587163 0.12705651564257486
Epoch:  74


0.151638901716954 0.12723558396100998
Epoch    74: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  75


0.15118392939503128 0.12737839136804854
Epoch:  76


0.15126180447436668 0.12732744216918945
Epoch:  77


0.15009727590792887 0.12728447360651834
Epoch:  78


0.15133672066636988 0.1271268767969949
Epoch:  79


0.1510868265822127 0.12756528705358505
Epoch:  80


0.15127063764108195 0.12719143714223588
Epoch    80: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  81


0.15141458849649173 0.12731300507272994
Epoch:  82


0.15142906517595858 0.12747095205954143
Epoch:  83


0.15173129254096263 0.12715065798589162
Epoch:  84


0.15102390824137507 0.12701360349144256
Epoch:  85


0.15159206132631045 0.12729622742959432
Epoch:  86


0.15157003096632055 0.12752717201198852
Epoch:  87


0.15204368290063497 0.12738151103258133
Epoch:  88


0.15193223550512985 0.12714464323861258
Epoch:  89


0.1523632830059206 0.1273525825568608
Epoch:  90


0.151222195174243 0.12712973994868143
Epoch:  91


0.1521903167705278 0.1271206544978278
Epoch:  92


0.1512838219468658 0.12735704226153238
Epoch:  93
