In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table with one row per annotation. After ``reset_index()`` it
        must expose an ``audio_filename`` column; every other column is
        treated as a label. The rows of one file are numbered 1, 2, ... in
        the order they appear. The caller's frame is not modified.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known
        label columns whose mask is set to 0 on every row where that unknown
        column is above 0.5. The unknown columns themselves are removed from
        the targets.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array. All files
        must yield arrays of the same shape after this crop.
    n_annotations : int, optional
        Number of annotations per file that are stacked along the last axis
        (annotation numbers ``1 .. n_annotations``).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)``.
    y : numpy.ndarray
        Shape ``(n_files, n_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target contributes to the loss,
        0 where it was masked out by an unknown label.
    """
    df = df.reset_index()
    # Running count of the annotations that belong to the same file.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from "everything counts" and switch off the known labels that
    # are ambiguous because the matching unknown label was annotated.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so each annotation can be selected as
    # one contiguous, filename-sorted slice.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack(
        [y_mask.loc[[k], :].values for k in annotation_ids], axis=2)

    # Files are loaded in the same (sorted) order as the rows of y / y_mask.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)  # add the singleton channel axis

    return X, y, y_mask


# Module-level RandomErasing instance (built with its default arguments) that
# AudioDataset.__getitem__ applies to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Indexable dataset of (sample, target, weight) triples.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When a
    ``transform`` is supplied, every sample goes through the augmentation
    pipeline in ``_augment`` before it is returned; otherwise the stored
    sample is returned as is.
    """

    def __init__(self, X, y, weights, transform=None):
        self.pil = transforms.ToPILImage()
        self.transform = transform
        self.weights = weights
        self.y = y
        self.X = X

    def __len__(self):
        n_samples = self.X.shape[0]
        return n_samples

    def __getitem__(self, idx):
        sample = self.X[idx, ...]
        if self.transform:
            sample = self._augment(sample)
        return sample, self.y[idx, ...], self.weights[idx, ...]

    def _augment(self, sample):
        """Return an augmented copy of ``sample`` on its original value scale."""
        # Rescale to [0, 1]; the bounds are kept to undo the scaling later.
        lower = sample.min()
        upper = sample.max()
        scaled = (sample - lower) / (upper - lower)

        # Cyclic shift along axis 1 by a random offset.
        offset = np.random.randint(scaled.shape[1])
        tail = scaled[:, offset:, :]
        head = scaled[:, :offset, :]
        rolled = torch.cat([tail, head], dim=1)

        # The albumentations pipeline is fed a numpy image obtained via PIL.
        image = np.array(self.pil(rolled))
        augmented = self.transform(image=image)['image']
        # Restore a leading singleton axis and swap the last two axes
        # (presumably the transform returns them transposed -- TODO confirm).
        augmented = augmented[None, :, :].permute(0, 2, 1)

        # Random erasing uses the module-level `random_erasing` instance.
        erased = random_erasing(augmented.clone().detach())

        # Map the values back from [0, 1] to the sample's original range.
        return (erased * (upper - lower)) + lower

In [3]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code -- only open metadata files
# that come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# Split the taxonomy into the "unknown" fine labels (fine_id == 'X') and the
# known ones, keeping only the columns needed to pair them up.
taxonomy_df = metadata['taxonomy_df']
unknown_fine = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# For every unknown fine label, collect the known fine labels that share its
# coarse class: {unknown fine label: [known fine labels, ...]}.
paired = unknown_fine.merge(known_fine, on='coarse', how='inner')
paired = paired.drop(columns='coarse')
unknown_to_known = paired.groupby('fine_x')['fine_y'].apply(list).to_dict()

known_labels = taxonomy_df.loc[taxonomy_df.fine_id != 'X'].fine.tolist()

# Coarse and fine annotation tables side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Any row whose label values add up to 37 is zeroed out in both splits
# (presumably a row with every label column set -- TODO confirm).
for labels_df in (train_df, valid_df):
    suspicious_rows = labels_df.sum(axis=1) == 37
    labels_df.loc[suspicious_rows, :] = 0

In [5]:
# Build inputs (X), targets (y) and loss masks (y_mask) for both splits using
# the same unknown -> known label mapping.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# One mean / standard deviation per position of the last axis, estimated on
# the training inputs only and applied unchanged to the validation inputs.
# The size of that axis is read from the data instead of being hard-coded.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# The steps run in list order; the final ToTensor step hands a torch tensor
# back to the dataset.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
# Only the training set receives the augmentation pipeline; with
# transform=None the validation samples are returned as stored.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it provides ``reset_parameters``.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function suitable for ``nn.Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Placeholder default for the index of the last MobileNetV2 feature block that
# Task5Model re-initialises. It is overridden by the parameters cell below;
# Task5Model compares `i <= until_x`, so it must not still be None when the
# model is built.
until_x = None

In [12]:
# Parameters
# Value used for this run (presumably injected by a notebook parameterisation
# tool -- TODO confirm): feature blocks 0..8 of MobileNetV2 are re-initialised.
until_x = 8


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2-based multi-label classifier for single-channel inputs.

    Pipeline: a small 1x1-convolution stem that maps the single input channel
    to the three channels MobileNetV2 expects, the MobileNetV2 feature
    extractor (loaded with pretrained weights), a global max over the last two
    axes of the feature map, and a two-layer classification head that outputs
    one logit per class.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_until : int, optional
        Index of the last block of ``mv2.features`` whose parameters are
        re-initialised (blocks ``0 .. reset_until`` inclusive lose their
        pretrained weights). When omitted, the notebook-level ``until_x`` is
        used, which keeps ``Task5Model(num_classes)`` working as before.

    Raises
    ------
    TypeError
        If neither ``reset_until`` nor ``until_x`` provides a value.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Fall back to the notebook global so existing calls keep working.
        if reset_until is None:
            reset_until = until_x
        if reset_until is None:
            raise TypeError(
                'reset_until (or the notebook-level until_x) must be an int, '
                'got None')

        # Reset until ith layer of mv2
        for i, block in enumerate(self.mv2.features.children()):
            if i <= reset_until:
                # apply() visits the block and all of its sub-modules.
                block.apply(weight_reset)

    def forward(self, x):
        """Return the class logits for a batch of single-channel inputs."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis, then the new last axis.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits (presumably one per label column kept by prepare_data --
# TODO confirm); the model is moved to the selected device.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)

In [16]:
epochs = 100
alpha = 1  # both shape parameters of the Beta distribution used for mixup
early_stopping_patience = 25  # epochs without a new best validation loss
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked binary cross-entropy averaged over all annotations.

    ``labels`` and ``masks`` carry one slice per annotation along their last
    axis. The element-wise loss of ``outputs`` against every slice is
    multiplied by the matching mask slice, summed, and the grand total is
    divided by the sum of the masks.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


# NOTE(review): in the recorded run the validation loss oscillates between
# roughly 0.13 and 0.67 after the first learning-rate reduction while the
# training loss stays flat. The cause is not established here; the BatchNorm
# running statistics are one thing worth checking.
for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One mixing coefficient per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping.
    if epochs_without_new_lowest >= early_stopping_patience:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6294814912048546 0.4758005865982601
Epoch:  1


0.320378825873942 0.20308069459029607
Epoch:  2


0.19061749569467595 0.17582128516265325
Epoch:  3


0.17824453840384613 0.18085396502699172
Epoch:  4


0.17775379201850375 0.18369354094777787
Epoch:  5


0.17780850022225766 0.16454155104500906
Epoch:  6


0.175978650917878 0.16096745431423187
Epoch:  7


0.1752600839009156 0.15847402385302953
Epoch:  8


0.17259123438113444 0.1570208775145667
Epoch:  9


0.17258853404908567 0.16377034783363342
Epoch:  10


0.17170155209463997 0.16996414746556962
Epoch:  11


0.17163614244074435 0.15689627400466374
Epoch:  12


0.17000578988242793 0.15133467529501235
Epoch:  13


0.167926064616925 0.1547621807881764
Epoch:  14


0.1679271451524786 0.14968678780964442
Epoch:  15


0.16743450833333506 0.15488020862851823
Epoch:  16


0.16521629973037824 0.15100773743220738
Epoch:  17


0.16569835350320145 0.15083751295294082
Epoch:  18


0.16412626770702568 0.14976135960647038
Epoch:  19


0.16283960761250676 0.14256823807954788
Epoch:  20


0.1632179323885892 0.14745989654745376
Epoch:  21


0.16186207693976326 0.14059879311493464
Epoch:  22


0.16118144586279587 0.14065341012818472
Epoch:  23


0.16106206057844935 0.1389835849404335
Epoch:  24


0.16009865620651761 0.13938289135694504
Epoch:  25


0.16047278771529327 0.13885335624217987
Epoch:  26


0.16026578561679736 0.1365745429481779
Epoch:  27


0.15977381572530075 0.1353489969457899
Epoch:  28


0.15920112745181933 0.13700982183218002
Epoch:  29


0.15889913769992622 0.13759365571396692
Epoch:  30


0.1593799329287297 0.1340396531990596
Epoch:  31


0.1567485823824599 0.13485002304826463
Epoch:  32


0.15769914114797437 0.1340621347938265
Epoch:  33


0.15784967227562055 0.13328950426408223
Epoch:  34


0.15614510307440888 0.13002419897488185
Epoch:  35


0.15525655528983553 0.13367733252899988
Epoch:  36


0.15744088267957843 0.1341846553342683
Epoch:  37


0.15585299037598274 0.13453172466584615
Epoch:  38


0.15560464198524887 0.1476669449891363
Epoch:  39


0.15611475947740916 0.1303889900445938
Epoch:  40


0.15466211897295876 0.14811228854315622
Epoch    40: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  41


0.1547308958865501 0.6688489030514445
Epoch:  42


0.15258268730060473 0.2197571716138295
Epoch:  43


0.15363801129766413 0.12760095404727118
Epoch:  44


0.1523693477785265 0.23365022987127304
Epoch:  45


0.1534586866965165 0.13034976486648833
Epoch:  46


0.15193535790250107 0.20890024304389954
Epoch:  47


0.1511623935924994 0.5619351693562099
Epoch:  48


0.15152949416959607 0.5544223955699376
Epoch:  49


0.15128311675948067 0.2828788438013622
Epoch    49: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  50


0.1512955866149954 0.5520581666912351
Epoch:  51


0.150971630135098 0.4846420543534415
Epoch:  52


0.1512324286473764 0.38568646567208426
Epoch:  53


0.15102361867556702 0.36398702008383615
Epoch:  54


0.15011998044477926 0.14280559654746736
Epoch:  55


0.15175316221005208 0.3516738350902285
Epoch    55: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  56


0.15127409954328794 0.12693022191524506
Epoch:  57


0.1520081186616743 0.23925711001668656
Epoch:  58


0.15081297465272853 0.5142629487173898
Epoch:  59


0.15275927210176313 0.34885417350700926
Epoch:  60


0.15068872675702377 0.40623670922858374
Epoch:  61


0.15162335738942428 0.45387537138802664
Epoch:  62


0.15184512895506783 0.40975136416299
Epoch    62: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  63


0.1512332917065234 0.3981132443462099
Epoch:  64


0.15117432620074298 0.293951578438282
Epoch:  65


0.15213108827938904 0.42265775374003817
Epoch:  66


0.15082940741165266 0.4271500302212579
Epoch:  67


0.15125145944389137 0.3667464234999248
Epoch:  68


0.1514949242810945 0.3829167942915644
Epoch    68: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  69


0.1498693086811014 0.3256689448441778
Epoch:  70


0.15252382811662313 0.35669766685792376
Epoch:  71


0.15120091389965368 0.17637317201920918
Epoch:  72


0.15154217384956978 0.37587440652506693
Epoch:  73


0.1513359659426921 0.44406640955380033
Epoch:  74


0.15052319459013036 0.12645098567008972
Epoch:  75


0.15188815263477531 0.2346638845545905
Epoch:  76


0.1510806856928645 0.4023417809179851
Epoch:  77


0.15034309029579163 0.32848276623657774
Epoch:  78


0.1512293167210914 0.23821478656360082
Epoch:  79


0.15120907208404025 0.5664086320570537
Epoch:  80


0.1519973342483108 0.47553340452057974
Epoch:  81


0.15200309978949056 0.13110945160899842
Epoch:  82


0.1505692709942122 0.31904841533729006
Epoch:  83


0.15089328909242475 0.3425449877977371
Epoch:  84


0.15077566133963094 0.47867391790662495
Epoch:  85


0.15117072656347946 0.43779837020805906
Epoch:  86


0.1515181439148413 0.12602965640170233
Epoch:  87


0.15184196829795837 0.14215678508792604
Epoch:  88


0.15105768192458796 0.4185379424265453
Epoch:  89


0.15167514979839325 0.12798180431127548
Epoch:  90


0.15216134166395343 0.2383463584950992
Epoch:  91


0.1510912131618809 0.42415650827544077
Epoch:  92


0.15125918307819883 0.5012934591088977
Epoch:  93


0.15101085844877604 0.33486352222306387
Epoch:  94


0.1503002937581088 0.15908702250037873
Epoch:  95


0.1511098694156956 0.1308171312723841
Epoch:  96


0.1519587736677479 0.27377272716590334
Epoch:  97


0.15203548685924426 0.4255981275013515
Epoch:  98


0.1509941671345685 0.2516107772077833
Epoch:  99


0.1512968137457564 0.34654094172375544
