In [1]:
# Parameters
# Index into the MobileNetV2 feature blocks: Task5Model re-initialises every
# block whose position is greater than this value (earlier blocks keep their
# pretrained weights). The value is also embedded in the checkpoint file name.
until_x = 3


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, n_annotations=3, n_frames=635,
                 spec_dir='../../data/logmelspec'):
    """Build model inputs, targets and loss masks from an annotation table.

    Rows of ``df`` that share an ``audio_filename`` are numbered 1, 2, ... in
    their order of appearance; the first ``n_annotations`` of them become the
    annotation slots of that file.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table indexed by ``audio_filename`` (one row per annotation).
        It must contain every column named as a key of ``unknown_to_known``.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps an "unknown" label column to the list of known label columns
        that must be ignored whenever that unknown label is set (> 0.5).
    n_annotations : int, optional
        Number of annotation rows used per file (default 3).
    n_frames : int, optional
        Each spectrogram is transposed and cropped to at most this many rows
        along its first axis (default 635).
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per file.

    Returns
    -------
    X : numpy.ndarray
        Spectrograms, shape ``(n_files, 1, frames, bins)``.
    y : numpy.ndarray
        Known-label targets, shape ``(n_files, n_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the target must be ignored, 1 elsewhere.

    Raises
    ------
    ValueError
        If any file has fewer than ``n_annotations`` annotation rows.
    """
    df = df.reset_index()

    # Fail early with a readable message instead of a shape mismatch later on.
    rows_per_file = df.groupby('audio_filename').size()
    too_few = rows_per_file[rows_per_file < n_annotations]
    if len(too_few) > 0:
        raise ValueError(
            'expected at least {} annotation rows per file; too few for: {}'
            .format(n_annotations, too_few.index.tolist()[:5]))

    # Running annotation number (1-based) within each audio file.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns away from the real targets.
    unknown_cols = list(unknown_to_known)
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that are
    # ambiguous because the corresponding unknown label was selected.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that each slot can be selected as a
    # block whose rows are sorted by file name.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack(
        [mask_df.loc[[k], :].values for k in annotation_ids], axis=2)

    # Load the spectrograms in the same (sorted) file order as the targets.
    filenames = (df.loc[[1], :].index
                 .get_level_values('audio_filename').tolist())
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = X[:, np.newaxis]  # add the channel axis

    return X, y, y_mask


# Module-level RandomErasing instance built with its default arguments;
# AudioDataset.__getitem__ applies it to each sample when a transform is set.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with optional on-the-fly augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs; the first axis indexes samples. The augmentation path
        concatenates along axis 1 of a single sample, so each sample is
        expected to be at least 3-D (the notebook passes ``(1, time, mel)``).
    y : torch.Tensor
        Targets, indexed along the first axis like ``X``.
    weights : torch.Tensor
        Per-target weights / masks, indexed along the first axis like ``X``.
    transform : callable, optional
        Albumentations-style callable invoked as ``transform(image=array)``
        and returning a dict with an ``'image'`` entry. When falsy, samples
        are returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for ``idx``.

        With a transform set, the sample is min-max scaled, cyclically
        shifted by a random offset along axis 1, passed through the
        transform and the module-level ``random_erasing``, and finally
        scaled back to its original value range.
        """
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would fill the
            # sample with NaNs. Scaling by 1 maps it to all zeros instead and
            # the inverse transformation below restores the constant value.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on NumPy images)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the last two axes; presumably this
            # restores the (1, time, mel) layout -- TODO confirm
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load metadata.pkl from a trusted source.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# Split the taxonomy into the fine labels whose fine_id is 'X' and the rest.
taxonomy_df = metadata['taxonomy_df']
unknown_fine = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# For every 'X' fine label, collect the other fine labels that share its
# coarse class.
fine_pairs = pd.merge(unknown_fine, known_fine, on='coarse', how='inner')
fine_pairs = fine_pairs.drop(columns='coarse')
unknown_to_known = fine_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_fine['fine'].tolist()

# Coarse and fine annotations side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Rows whose labels sum to exactly 37 are zeroed out in both splits
# (presumably an annotation with every label ticked -- TODO confirm).
for annotations in (train_df, valid_df):
    rows_to_clear = annotations.sum(axis=1) == 37
    annotations.loc[rows_to_clear, :] = 0

In [6]:
# Build arrays for both splits. prepare_data returns X with a singleton
# channel axis in position 1, and y / y_mask with the annotation slots stacked
# along the last axis.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed on the training split only, one value per entry of
# the last axis (128 channels), and then applied to both splits.
train_flat = train_X.reshape(-1, 128)
broadcast_shape = (1, 1, 1, -1)
channel_means = train_flat.mean(axis=0).reshape(broadcast_shape)
channel_stds = train_flat.std(axis=0).reshape(broadcast_shape)
train_X, valid_X = [
    (split - channel_means) / channel_stds
    for split in (train_X, valid_X)
]

In [8]:
# Define the data augmentation transformations
augmentation_steps = [
    # geometric jitter bounded by the given shift / scale / rotate limits
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library defaults
    GridDistortion(),
    # convert the augmented image to a torch tensor
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
train_tensors = [torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask)]

# Only the training set is augmented.
train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that mixup blends.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU only when CUDA is actually available, so the notebook also
# runs (slowly) on CPU-only machines instead of failing when the model is
# moved to the device.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it offers ``reset_parameters``.

    Objects without a ``reset_parameters`` attribute are left untouched,
    which makes the function suitable for ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel inputs.

    The network consists of three stages:

    * ``bw2col``: batch norm followed by two 1x1 convolutions that map the
      single input channel to three channels.
    * ``mv2``: a pretrained torchvision MobileNetV2 of which only the
      ``features`` part is used. Feature blocks whose index is greater than
      the notebook-level ``until_x`` are re-initialised.
    * ``final``: a small fully connected head producing ``num_classes``
      outputs from the 1280 pooled features.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels with pointwise convolutions
        colour_layers = [
            nn.BatchNorm2d(1),
            nn.Conv2d(in_channels=1, out_channels=10, kernel_size=1, padding=0),
            nn.ReLU(),
            nn.Conv2d(in_channels=10, out_channels=3, kernel_size=1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*colour_layers)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        head_layers = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head_layers)

        # Reset after ith layer of mv2: blocks up to and including index
        # `until_x` keep their pretrained weights, later ones start fresh.
        for block_idx, block in enumerate(self.mv2.features.children()):
            if block_idx <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return one output per class for a batch of single-channel inputs."""
        colour = self.bw2col(x)
        feature_map = self.mv2.features(colour)
        # global max pooling over the two trailing axes
        pooled = feature_map.max(dim=-1).values.max(dim=-1).values
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 outputs -- presumably one per known label; TODO confirm against
# len(known_labels).
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch and lowers
# the learning rate after 5 epochs without improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop can
# multiply them by the masks before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def masked_annotation_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over all unmasked targets.

    ``criterion`` must return element-wise losses (``reduction='none'``).
    ``labels`` and ``masks`` carry one slice per annotation along their last
    axis; every slice is compared against the same ``outputs``. The summed
    masked loss is divided by ``masks.sum()``.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
alpha = 1  # Beta(alpha, alpha) parameter of the mixup coefficients
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # one mixing coefficient per sample, shared by inputs, labels and masks
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(criterion, outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # mean of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6023286766297108 0.45527189118521555
Epoch:  1


0.27351861708873026 0.18573129389967238
Epoch:  2


0.18427158892154694 0.2754064074584416
Epoch:  3


0.1777774542570114 0.1839624366589955
Epoch:  4


0.17593488983205846 0.15873457065650395
Epoch:  5


0.17233449985852112 0.20645062625408173
Epoch:  6


0.17115130255351196 0.1526899359055928
Epoch:  7


0.16946936982709007 0.15300991918359483
Epoch:  8


0.1683724498426592 0.14829629233905248
Epoch:  9


0.16642620072171493 0.14410515981061117
Epoch:  10


0.16639151081845566 0.15512436628341675
Epoch:  11


0.16454745909652194 0.14328830582754953
Epoch:  12


0.16250556103281072 0.1397122868469783
Epoch:  13


0.1621753682155867 0.1375233924814633
Epoch:  14


0.16096102224813924 0.1409057304263115
Epoch:  15


0.15948568928886103 0.13929054460355214
Epoch:  16


0.15930978752471306 0.14067375979253224
Epoch:  17


0.16022633136929693 0.13575020113161632
Epoch:  18


0.15983339660876505 0.13930573633738927
Epoch:  19


0.15715182512193113 0.13156613175358092
Epoch:  20


0.15726638202731674 0.13265720116240637
Epoch:  21


0.1574057884312965 0.13733694915260589
Epoch:  22


0.15773691277246218 16.146227427891322
Epoch:  23


0.15612233853017962 0.13536451011896133
Epoch:  24


0.1557255548399848 0.1368466019630432
Epoch:  25


0.1553485304922671 0.13241085516554968
Epoch    25: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  26


0.15441445321650119 0.1269840087209429
Epoch:  27


0.153966057139474 0.12678042692797525
Epoch:  28


0.15465596600158796 0.1268012449145317
Epoch:  29


0.15270183738824483 0.12598646964345658
Epoch:  30


0.15294094866997487 0.1268050383244242
Epoch:  31


0.15228131292639552 0.12688791751861572
Epoch:  32


0.15438584741708394 0.12634885524000442
Epoch:  33


0.1540072902634337 0.12701735006911413
Epoch:  34


0.1529669431415764 0.12604122183152608
Epoch:  35


0.1514496456932377 0.12650284064667566
Epoch    35: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  36


0.15235767694743904 0.12602676344769342
Epoch:  37


0.15171762817614787 0.12599081013883864
Epoch:  38


0.15124116878251773 0.12606730950730188
Epoch:  39


0.15290832640351476 0.12593870503561838
Epoch:  40


0.15274326463003415 0.12621481929506576
Epoch:  41


0.15184837821367625 0.12618908818278993
Epoch:  42


0.1515307305632411 0.12614247841494425
Epoch:  43


0.1513858699315303 0.12593884553228105
Epoch:  44


0.15299572533852346 0.12600390932389668
Epoch:  45


0.15303777359627388 0.1262472772172519
Epoch    45: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  46


0.15231798951690262 0.1259824154632432
Epoch:  47


0.1521839845824886 0.12614317131893976
Epoch:  48


0.15247139133311607 0.12622243059532984
Epoch:  49


0.1525967161397676 0.12594801826136454
Epoch:  50


0.15319873674495801 0.1261108166405133
Epoch:  51


0.1521955434535001 0.12597697334630148
Epoch    51: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  52


0.15109653690376798 0.1260087021759578
Epoch:  53


0.1534122180294346 0.125839963555336
Epoch:  54


0.15183465545241898 0.1263134554028511
Epoch:  55


0.15179782743389542 0.12595581156866892
Epoch:  56


0.15147778432111483 0.1263538013611521
Epoch:  57


0.1523065401895626 0.12601962792021887
Epoch:  58


0.1518290578513532 0.1258962026664189
Epoch:  59


0.15245812728598312 0.12615745195320674
Epoch    59: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  60


0.15285234878192078 0.12596759306532995
Epoch:  61


0.15187485395251094 0.12630479357072286
Epoch:  62


0.1518541993321599 0.12603335614715303
Epoch:  63


0.15194860664573875 0.12638460206133978
Epoch:  64


0.15225004505466772 0.12605123541184834
Epoch:  65


0.15165332604098963 0.12616737612656184
Epoch:  66


0.15237862234180038 0.12601916279111589
Epoch:  67


0.15317336933032885 0.12600060339484895
Epoch:  68


0.15204707835171674 0.12587054180247442
Epoch:  69


0.15215580084839384 0.1257887099470411
Epoch:  70


0.15156838257570523 0.12597434329135077
Epoch:  71


0.1523418281529401 0.12613104922430857
Epoch:  72


0.15247385123291532 0.1261489572269576
Epoch:  73


0.15255714509938215 0.12615959559168136
Epoch:  74


0.15198479351159688 0.12617946309702738
Epoch:  75


0.15188973376879822 0.12588227753128325
Epoch:  76


0.1504867322541572 0.12619685700961522
Epoch:  77


0.15356381519420728 0.1260069693837847
Epoch:  78


0.15321310267255112 0.1261907826576914
Epoch:  79


0.15215693333664457 0.1259770882981164
Epoch:  80


0.15248213950041178 0.12602885599647248
Epoch:  81


0.15256070647690748 0.12606044752257212
Epoch:  82


0.15331671004359787 0.1259848890560014
Epoch:  83


0.1520223412159327 0.12595869494335993
Epoch:  84


0.15296319650637136 0.12602635685886657
Epoch:  85


0.15244365262018666 0.12587189355066844
Epoch:  86


0.15187814308179393 0.12640233125005448
Epoch:  87


0.15330359460534276 0.12631346923964365
Epoch:  88


0.1521903586548728 0.12585684763533728
Epoch:  89


0.15246769181779912 0.12593115333999907
Epoch:  90


0.15239653716216217 0.12601769821984427
Epoch:  91


0.15353912477557724 0.12589857833726065
Epoch:  92


0.15127521753311157 0.12630923092365265
Epoch:  93


0.15322527088023521 0.12593871248619898
Epoch:  94
