In [1]:
# Parameters
# `until_x` is read by Task5Model: MobileNetV2 feature blocks whose index is
# greater than this value get their weights re-initialised. It is also part of
# the checkpoint filename written by the training loop.
until_x = 0


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 spec_path_template='../../data/logmelspec/{}.npy',
                 max_frames=635,
                 annotator_ids=(1, 2, 3)):
    """Build model inputs, per-annotator targets and loss masks from a label frame.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. Resetting its index must yield an
        ``audio_filename`` column; the remaining columns are label columns.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be masked out whenever that unknown label is set (> 0.5).
        The unknown columns themselves are removed from the targets.
    spec_path_template : str, optional
        ``str.format`` template that receives an audio filename and returns the
        path of its ``.npy`` spectrogram.
    max_frames : int, optional
        Number of leading rows kept from each transposed spectrogram.
    annotator_ids : sequence of int, optional
        1-based positions of the annotation rows (within each file) that are
        stacked along the last axis of ``y`` and ``y_mask``. Every file needs a
        row for each of these ids, otherwise the per-annotator arrays have
        different lengths and the concatenation fails.

    Returns
    -------
    X : numpy.ndarray, shape (n_files, 1, frames, bins)
    y : numpy.ndarray, shape (n_files, n_known_labels, len(annotator_ids))
    y_mask : numpy.ndarray, same shape as ``y``; 0 where the entry must be
        ignored by the loss, 1 elsewhere.
    """
    # Number the annotations of each file 1, 2, 3, ... in their original order.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns away from the real targets.
    unknown_labels = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_labels].copy()
    df = df.drop(columns=unknown_labels)

    # Start with everything unmasked, then zero the known labels that are
    # ambiguous because the matching unknown label was annotated.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so that one annotator can be selected
    # with a single .loc and the files come out in sorted order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    def stack_annotators(frame):
        # (n_files, n_labels) per annotator -> (n_files, n_labels, n_annotators)
        return np.concatenate(
            [frame.loc[[annotator], :].values[..., np.newaxis]
             for annotator in annotator_ids],
            axis=2)

    y = stack_annotators(df)
    y_mask = stack_annotators(y_mask)

    # Load the spectrograms in the same file order as the rows of y.
    filenames = (df.loc[[annotator_ids[0]], :]
                 .reset_index(1)
                 .audio_filename
                 .tolist())
    X = np.concatenate([
        np.expand_dims(
            np.load(spec_path_template.format(name)).T[:max_frames, :], axis=0)
        for name in filenames])
    X = np.expand_dims(X, axis=1)  # add the channel axis

    return X, y, y_mask


# Shared RandomErasing instance (constructed with its default arguments).
# AudioDataset.__getitem__ applies it to every sample when a transform is set.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (input, target, weight) triples with optional augmentation.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When
    ``transform`` is truthy, every fetched input goes through the augmentation
    chain in ``_augment``; otherwise the stored input is returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        n_items = self.X.shape[0]
        return n_items

    def _augment(self, sample):
        """Return an augmented copy of one 3-D input tensor."""
        # Rescale to [0, 1]; the original range is restored at the end.
        low = sample.min()
        high = sample.max()
        scaled = (sample - low) / (high - low)

        # Circular shift along axis 1 by a random offset.
        offset = np.random.randint(scaled.shape[1])
        rolled = torch.cat(
            (scaled[:, offset:, :], scaled[:, :offset, :]),
            dim=1)

        # The albumentations pipeline works on a numpy image and returns a
        # dict; its 'image' entry is given a leading axis and has its last
        # two axes swapped.
        as_image = np.array(self.pil(rolled))
        augmented = self.transform(image=as_image)['image']
        augmented = augmented[None, :, :].permute(0, 2, 1)

        # Random erasing uses the module-level `random_erasing` instance.
        erased = random_erasing(augmented.clone().detach())

        # Undo the min-max scaling.
        return (erased * (high - low)) + low

    def __getitem__(self, idx):
        sample = self.X[idx, ...]
        if not self.transform:
            return sample, self.y[idx, ...], self.weights[idx, ...]

        sample = self._augment(sample)
        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code, so only open a metadata file you trust.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the "unknown" entries (fine_id == 'X') and the rest.
taxonomy_df = metadata['taxonomy_df']
label_cols = ['fine', 'coarse']
unknown_rows = taxonomy_df.loc[taxonomy_df.fine_id == 'X', label_cols]
known_rows = taxonomy_df.loc[taxonomy_df.fine_id != 'X', label_cols]

# For every unknown label, collect the known labels sharing its coarse class.
same_coarse_pairs = pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
same_coarse_pairs = same_coarse_pairs.drop(columns='coarse')
unknown_to_known = (
    same_coarse_pairs
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_rows.fine.tolist()

# Coarse and fine label columns side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [5]:
# manual correction for one data point:
# any row whose label values sum to 37 is reset to all zeros, in both splits
for split_df in (train_df, valid_df):
    sums_to_37 = split_df.sum(axis=1) == 37
    split_df.loc[sums_to_37, :] = 0

In [6]:
# Build the inputs (X), the per-annotator targets (y) and the loss masks
# (y_mask) for the training and validation splits. This reads the .npy
# spectrogram files from disk.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics come from the training split only and are applied to both splits.
train_rows = train_X.reshape(-1, 128)  # one row per (file, frame), 128 columns
channel_means = train_rows.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_rows.std(axis=0).reshape(1, 1, 1, -1)

train_X, valid_X = (
    (split_X - channel_means) / channel_stds
    for split_X in (train_X, valid_X))

In [8]:
# Define the data augmentation transformations
# The steps run in list order; the last one converts the image to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
train_tensors = [torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask)]

# Only the training set is augmented.
train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size=96, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to get the pairs of batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=96, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=96, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU only when CUDA is actually available, so the notebook still
# runs (on the CPU) on machines without a usable GPU instead of failing on the
# first `.to(device)` call.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` if it exposes a ``reset_parameters`` method.

    Objects without that attribute are left untouched. Meant to be passed to
    ``nn.Module.apply`` so that it is called on every submodule.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """Single-channel input -> MobileNetV2 features -> ``num_classes`` logits.

    The network has three stages:

    * ``bw2col``: batch norm plus two 1x1 convolutions mapping the single input
      channel to three channels;
    * ``mv2.features``: the feature extractor of a pretrained MobileNetV2, where
      every feature block with index greater than the module-level ``until_x``
      is re-initialised through ``weight_reset``;
    * ``final``: a two-layer head applied to the 1280 features left after taking
      the maximum over the last two axes.
    """

    def __init__(self, num_classes):
        super().__init__()

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Keep the pretrained weights of blocks 0..until_x, reset the later ones.
        for block_idx, block in enumerate(self.mv2.features.children()):
            if block_idx > until_x:
                block.apply(weight_reset)

    def forward(self, x):
        three_channel = self.bw2col(x)
        feature_maps = self.mv2.features(three_channel)
        # Global max pooling: maximum over the last axis, then over the new last axis.
        max_over_last, _ = feature_maps.max(dim=-1)
        pooled, _ = max_over_last.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 is the number of output logits. It has to equal the size of axis 1 of
# train_y / valid_y, because the training loop compares the outputs with
# labels[:, :, k] elementwise.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch; the training
# log below shows the learning rate dropping tenfold each time it triggers.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the elementwise losses so that the training loop can
# multiply them by the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss before stopping
mixup_alpha = 1  # both parameters of the Beta distribution the mixup weights are drawn from
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked binary cross-entropy averaged over the unmasked entries.

    ``outputs`` holds one logit per (sample, label). ``labels`` and ``masks``
    carry an extra last axis with one slice per set of annotations. The same
    logits are scored against each slice with the module-level ``criterion``,
    the elementwise losses are multiplied by the matching mask slice, and the
    grand total is divided by the sum of all mask values.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training ----
    model.train()  # set once per epoch instead of once per batch
    this_epoch_train_loss = 0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup: one convex-combination weight per sample, shared by the
        # inputs, the labels and the masks of the two paired batches
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(len(mixup_vals), 1, 1, 1))
        inputs = (lam * batch_a[0]) + ((1 - lam) * batch_b[0])

        lam = torch.Tensor(mixup_vals.reshape(len(mixup_vals), 1, 1))
        labels = (lam * batch_a[1]) + ((1 - lam) * batch_b[1])
        masks = (lam * batch_a[2]) + ((1 - lam) * batch_b[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():  # no gradients (and no zero_grad) needed here
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Both epoch losses are means of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Log before the early-stopping check so the final epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6495292997360229 0.5417566299438477
Epoch:  1


0.42992380142211917 0.2809699237346649
Epoch:  2


0.22779927015304566 0.18023620843887328
Epoch:  3


0.18595250010490416 0.21927540004253387
Epoch:  4


0.17927402317523955 0.17048765420913697
Epoch:  5


0.17894121050834655 0.16122960448265075
Epoch:  6


0.17609158754348755 0.18255728781223296
Epoch:  7


0.17499767482280731 1.1290406584739685
Epoch:  8


0.17495605766773223 0.16927126348018645
Epoch:  9


0.1725289034843445 0.16411905288696288
Epoch:  10


0.17088813781738282 0.16203655004501344
Epoch:  11


0.1694609671831131 0.16084418296813965
Epoch:  12


0.16947054445743562 0.17218909561634063
Epoch:  13


0.1682624661922455 0.15577100813388825
Epoch:  14


0.1660328674316406 0.1548050820827484
Epoch:  15


0.1659462857246399 0.15834370851516724
Epoch:  16


0.16519391417503357 0.15629777312278748
Epoch:  17


0.16374197721481323 0.15061074495315552
Epoch:  18


0.164439537525177 0.14214247465133667
Epoch:  19


0.162274010181427 0.14006509780883789
Epoch:  20


0.16262178301811217 0.14293087124824524
Epoch:  21


0.16168421804904937 0.14252581298351288
Epoch:  22


0.16197241544723512 0.15045776665210725
Epoch:  23


0.16138612151145934 0.13871206045150758
Epoch:  24


0.1613065254688263 0.15122745633125306
Epoch:  25


0.15990424633026123 0.13741881847381593
Epoch:  26


0.1586475569009781 0.13896806836128234
Epoch:  27


0.15933184027671815 0.13244208991527556
Epoch:  28


0.159216348528862 0.14154973328113557
Epoch:  29


0.15952931582927704 0.13507821559906005
Epoch:  30


0.15795873165130614 0.13614677786827087
Epoch:  31


0.15956431329250337 0.13580629825592042
Epoch:  32


0.15815064013004304 0.1338705062866211
Epoch:  33


0.15659219086170195 0.13359982818365096
Epoch    33: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  34


0.15610244870185852 0.13101682662963868
Epoch:  35


0.15588215947151185 0.13008609265089036
Epoch:  36


0.15630802750587464 0.1306568905711174
Epoch:  37


0.15462659120559694 0.13016291558742524
Epoch:  38


0.15474285423755646 0.1292622819542885
Epoch:  39


0.15558235824108124 0.13037583231925964
Epoch:  40


0.15411987900733948 0.13017963916063308
Epoch:  41


0.15500251054763795 0.13008431643247603
Epoch:  42


0.15475252389907837 0.1300549626350403
Epoch:  43


0.154502757191658 0.13013555258512496
Epoch:  44


0.15356216132640838 0.12933223843574523
Epoch    44: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  45


0.15499161660671235 0.1291447564959526
Epoch:  46


0.1537906163930893 0.12936596125364302
Epoch:  47


0.15468294262886048 0.1297299697995186
Epoch:  48


0.153862122297287 0.12930111736059188
Epoch:  49


0.1528938388824463 0.12923517376184462
Epoch:  50


0.15326693654060364 0.12908103317022324
Epoch:  51


0.15211209952831267 0.12926654815673827
Epoch:  52


0.15436922073364256 0.12912098467350006
Epoch:  53


0.15349411070346833 0.1290848046541214
Epoch:  54


0.15487501621246338 0.12933744341135026
Epoch:  55


0.15420033693313598 0.12948835492134095
Epoch:  56


0.15419720768928527 0.12905860394239427
Epoch:  57


0.15362213551998138 0.128827203810215
Epoch:  58


0.155265474319458 0.1291850984096527
Epoch:  59


0.15417777836322785 0.12903586775064468
Epoch:  60


0.15551359355449676 0.12906089425086975
Epoch:  61


0.1544060903787613 0.12907592505216597
Epoch:  62


0.1535507023334503 0.12905571907758712
Epoch:  63


0.1546986734867096 0.1291959896683693
Epoch    63: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  64


0.15386040151119232 0.12947728037834166
Epoch:  65


0.15444448947906494 0.12937363237142563
Epoch:  66


0.15371909022331237 0.1291481763124466
Epoch:  67


0.1533063244819641 0.12923792898654937
Epoch:  68


0.15461976945400238 0.12932038605213164
Epoch:  69


0.15395926892757417 0.12904773354530336
Epoch    69: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  70


0.15555325865745545 0.12899924218654632
Epoch:  71


0.1553328639268875 0.1291392356157303
Epoch:  72


0.15482391953468322 0.12940562814474105
Epoch:  73


0.15429629862308503 0.12928638309240342
Epoch:  74


0.1538229513168335 0.1288843646645546
Epoch:  75


0.15377758741378783 0.1290151908993721
Epoch    75: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  76


0.15448227226734162 0.12903258502483367
Epoch:  77


0.1550234389305115 0.12943221777677535
Epoch:  78


0.15385398268699646 0.12908436059951783
Epoch:  79


0.15540686428546904 0.1289363458752632
Epoch:  80


0.15381733059883118 0.12911877036094666
Epoch:  81


0.15362727642059326 0.12922793179750441
Epoch:  82
