In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known,
                 spec_dir='../../data/logmelspec', n_frames=635, n_annotators=3):
    """Turn an annotation table into model inputs, targets and target masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column; all other columns are label columns.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that must be masked out whenever that unknown label is set (> 0.5).
        The unknown columns themselves are removed from the targets.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotation rows used per file (slots 1..n_annotators, in
        the order the rows appear in ``df``). Extra rows are ignored.

    Returns
    -------
    X : numpy.ndarray, shape (n_files, 1, n_frames, n_features)
        Stacked arrays loaded from ``spec_dir``.
        NOTE(review): assumes each .npy file holds a 2-D array with at least
        ``n_frames`` entries along its last axis — confirm against the data.
    y : numpy.ndarray, shape (n_files, n_known_labels, n_annotators)
    y_mask : numpy.ndarray, same shape as ``y``
        1 where the target should contribute to the loss, 0 where it is
        hidden by an "unknown" annotation.

    Raises
    ------
    ValueError
        If any file has fewer than ``n_annotators`` annotation rows.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the annotations of each file 1, 2, 3, ... in order of appearance.
    annotations = df.reset_index()
    annotations['slno'] = annotations.groupby('audio_filename').cumcount() + 1

    per_file = annotations.groupby('audio_filename').size()
    if (per_file < n_annotators).any():
        raise ValueError(
            'every audio file needs at least {} annotation rows; offending files: {}'
            .format(n_annotators, per_file[per_file < n_annotators].index.tolist()))

    annotations = annotations.set_index(['audio_filename', 'slno'])

    df_unknown = annotations.loc[:, unknown_cols]
    labels = annotations.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels hidden by an unknown one.
    y_mask = labels.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so each annotation slot is a contiguous,
    # filename-sorted slice; this keeps the slots aligned with each other.
    labels = labels.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotators + 1)
    y = np.stack([labels.loc[[slot], :].values for slot in slots], axis=2)
    y_mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    # Load the inputs in the same filename order as the first annotation slot.
    filenames = labels.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (default settings); AudioDataset.__getitem__
# looks this name up globally and applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (input, target, weight) triples with optional augmentation.

    `X`, `y` and `weights` are indexed along their first axis. When a
    `transform` is supplied, each input is min-max scaled, cyclically shifted
    along axis 1, sent through the transform as an image, passed to the
    module-level `random_erasing`, and finally scaled back to its original
    value range. Without a transform the stored input is returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        # Converts a tensor to a PIL image before the image-based transform runs
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        target = self.y[idx, ...]
        weight = self.weights[idx, ...]
        spec = self.X[idx, ...]

        # No augmentation requested: hand back the stored sample as is
        if not self.transform:
            return spec, target, weight

        # Scale into [0, 1]; the bounds are kept to undo the scaling at the end
        lowest, highest = spec.min(), spec.max()
        span = highest - lowest
        spec = (spec - lowest) / span

        # Cyclic shift along axis 1 by a random offset
        offset = np.random.randint(spec.shape[1])
        tail, head = spec[:, offset:, :], spec[:, :offset, :]
        spec = torch.cat([tail, head], dim=1)

        # Image round-trip through the transform pipeline; the result's
        # 'image' entry is given a leading axis and its last two axes swapped
        image = np.array(self.pil(spec))
        augmented = self.transform(image=image)['image']
        spec = augmented[None, :, :].permute(0, 2, 1)

        # Random erasing works on a detached copy
        spec = random_erasing(spec.clone().detach())

        # Back to the original value range
        spec = (spec * span) + lowest

        return spec, target, weight

In [3]:
# Load and prepare data
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# Split the taxonomy into "unknown" fine labels (fine_id == 'X') and the rest
taxonomy = metadata['taxonomy_df']
is_unknown = taxonomy.fine_id == 'X'
unknown_rows = taxonomy.loc[is_unknown, ['fine', 'coarse']]
known_rows = taxonomy.loc[~is_unknown, ['fine', 'coarse']]

# For every unknown fine label, collect the known fine labels sharing its coarse class
same_coarse = unknown_rows.merge(known_rows, on='coarse', how='inner')
same_coarse = same_coarse.drop(columns='coarse')
unknown_to_known = same_coarse.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_rows.fine.tolist()

# Coarse and fine annotation tables side by side, one frame per split
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Rows whose label columns sum to exactly 37 are zeroed out in both splits
for annotation_table in (train_df, valid_df):
    row_sum_is_37 = annotation_table.sum(axis=1) == 37
    annotation_table.loc[row_sum_is_37, :] = 0

In [5]:
# Build inputs (X), per-annotation targets (y) and target masks (y_mask) for each split
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# One mean/std per entry of the last axis, estimated on the training set only
# and reused unchanged for the validation set.
n_channels = train_X.shape[-1]  # derived from the data instead of hard-coding 128
train_flat = train_X.reshape(-1, n_channels)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# (passed only to the training dataset; the validation dataset gets transform=None)
albumentations_transform = Compose([
    # random shift / scale / rotation, bounded by the limits given here
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library's default settings
    GridDistortion(),
    # last step: AudioDataset.__getitem__ reads the result's 'image' entry and
    # calls tensor methods on it
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
batch_size = 64

# Only the training set is augmented; validation samples are returned untouched
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain the pairs of batches it mixes together
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is usable, otherwise fall back to the CPU so the
# notebook still runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise `layer` via its `reset_parameters()` method, if it has one.

    Objects without a `reset_parameters` attribute are left untouched, which
    makes the function suitable as a callback for `Module.apply`.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default value of the `until_x` notebook parameter, which Task5Model reads when
# deciding which MobileNetV2 feature layers to re-initialise. The "Parameters"
# cell that follows overrides it.
until_x = None

In [12]:
# Parameters
# NOTE(review): looks like a papermill-style injected parameters cell — confirm.
# Task5Model re-initialises every child of mv2.features whose index is <= until_x.
until_x = 17


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    Pipeline: a small 1x1-convolution stem maps the single input channel to
    three channels, the MobileNetV2 `features` stack processes the result, a
    global max over the two trailing axes collapses the feature map, and a
    two-layer head produces `num_classes` logits.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_until : int or None, optional
        Children of `mv2.features` with index <= this value have their
        pretrained weights re-initialised. When omitted, the module-level
        `until_x` is used; if that is also None, no layer is reset.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # 1 -> 10 -> 3 channels with 1x1 convolutions
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Fall back to the notebook-level parameter when no explicit limit is given
        if reset_until is None:
            reset_until = until_x

        # Reset until ith layer of mv2. A None limit means "keep every pretrained
        # layer"; comparing an int with None would raise TypeError.
        if reset_until is not None:
            for i, block in enumerate(self.mv2.features.children()):
                if i <= reset_until:
                    block.apply(weight_reset)

    def forward(self, x):
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis twice, keeping values only
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits. NOTE(review): presumably the number of label columns kept
# in train_y after the unknown columns are dropped — confirm.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Element-wise loss (no reduction): the training loop applies the masks itself
criterion = nn.BCEWithLogitsLoss(reduction='none')

optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    amsgrad=True)

# Stepped once per epoch with the validation loss by the training loop
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=5,
    verbose=True)

In [16]:
def masked_bce_loss(outputs, labels, masks):
    """Masked loss accumulated over every annotation set.

    `labels` and `masks` carry one slice per annotation set on their last
    axis. Each slice is scored against the same `outputs` with the
    module-level `criterion` (element-wise, no reduction), multiplied by its
    mask and summed; the grand total is divided by the sum of all mask entries.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
alpha = 1                      # Beta(alpha, alpha) parameter of the mixup coefficients
early_stopping_patience = 25   # epochs without a new best validation loss before stopping
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One coefficient per sample, shared by inputs, labels and masks
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.reshape(-1, 1, 1, 1)
        lam_targets = lam.reshape(-1, 1, 1)

        inputs = (lam_inputs * i1[0]) + ((1 - lam_inputs) * i2[0])
        labels = (lam_targets * i1[1]) + ((1 - lam_targets) * i2[1])
        masks = (lam_targets * i1[2]) + ((1 - lam_targets) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.detach().cpu().numpy()

    # ---- validation pass (no gradients, no parameter updates) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.detach().cpu().numpy()

    # Mean of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before any early stop so the final epoch's losses are never lost
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.625396873499896 0.5368342484746661
Epoch:  1


0.31440169303803833 0.21292820572853088
Epoch:  2


0.19119746499770396 0.4357011020183563
Epoch:  3


0.18313166861598557 0.1673694338117327
Epoch:  4


0.17746855882374016 0.16363620970930373
Epoch:  5


0.17541568786711306 0.16609402426651546
Epoch:  6


0.175397636519896 0.16545592887060984
Epoch:  7


0.1747699657807479 0.16457717972142355
Epoch:  8


0.17403516576096817 0.1689433434179851
Epoch:  9


0.1718355594454585 0.16218159454209463
Epoch:  10


0.1723452053360037 0.16166183352470398
Epoch:  11


0.17054481441910202 0.16866172850131989
Epoch:  12


0.17154923927139593 0.15633271421704972
Epoch:  13


0.17059502690225034 0.1508812074150358
Epoch:  14


0.16832294737970507 0.1580567764384406
Epoch:  15


0.16843083661955757 0.1573769109589713
Epoch:  16


0.16698063587820208 0.15945706835814885
Epoch:  17


0.16701421222171267 0.15567598811217717
Epoch:  18


0.16590234234526352 0.14898498143468583
Epoch:  19


0.16515617878050418 0.14858670106955937
Epoch:  20


0.16458612519341545 0.155309653707913
Epoch:  21


0.16598811181815895 0.15532797362123216
Epoch:  22


0.16338163493452845 0.14070600590535573
Epoch:  23


0.16315090616007108 0.14798293794904435
Epoch:  24


0.1631442960855123 0.14035468761410033
Epoch:  25


0.1625598491849126 0.13981019066912787
Epoch:  26


0.1622234321123845 0.13661354886634008
Epoch:  27


0.16066078157038302 0.13881177880934306
Epoch:  28


0.16153861783646248 0.1403238603046962
Epoch:  29


0.16061040114712072 0.1507549211382866
Epoch:  30


0.1603461854361199 0.16397747823170253
Epoch:  31


0.15999556232143092 0.13507691025733948
Epoch:  32


0.15819208485049172 0.19812746345996857
Epoch:  33


0.1585125786227149 0.13378190568515233
Epoch:  34


0.15769102243152824 0.1415325669305665
Epoch:  35


0.15753895849794955 0.1366036076630865
Epoch:  36


0.1580274092989999 0.13349560967513494
Epoch:  37


0.15793644576459318 0.1385373047419957
Epoch:  38


0.15700494034870252 0.13660470715590886
Epoch:  39


0.15738677173047452 0.13658985389130457
Epoch:  40


0.15727053059113993 0.13647779609475816
Epoch:  41


0.15596319292042707 0.13300172239542007
Epoch:  42


0.15613480033101262 0.1342850604227611
Epoch:  43


0.15596077329403646 0.1313037606222289
Epoch:  44


0.15589142047070167 0.13148277359349386
Epoch:  45


0.15466392724900632 0.1328306570649147
Epoch:  46


0.1566918746039674 0.13328099995851517
Epoch:  47


0.15482706798089518 0.1302075833082199
Epoch:  48


0.15531094211178856 0.1332392767071724
Epoch:  49


0.1547210804513983 0.1285457589796611
Epoch:  50


0.1552330279672468 0.13161566534212657
Epoch:  51


0.15342103548952052 0.1343615416969572
Epoch:  52


0.15364904218428843 0.132321403494903
Epoch:  53


0.15435784531606211 0.1336826863033431
Epoch:  54


0.154694163316005 0.1327694888625826
Epoch:  55


0.15349384578498634 0.12962881262813294
Epoch    55: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  56


0.15299162147818385 0.12709803027766092
Epoch:  57


0.15227996618361087 0.12670136030231202
Epoch:  58


0.1522046601450121 0.12733112701347896
Epoch:  59


0.15102122965696696 0.12644285815102713
Epoch:  60


0.15116399647416295 0.12651381003005163
Epoch:  61


0.1500170452369226 0.12645712069102696
Epoch:  62


0.1504636447171907 0.12656683261905396
Epoch:  63


0.15085618640925433 0.1264155868973051
Epoch:  64


0.1501352795072504 0.12639379714216506
Epoch:  65


0.1518934199938903 0.1269956316266741
Epoch:  66


0.14977716996863083 0.12687850637095316
Epoch:  67


0.1494415442685823 0.1262613364628383
Epoch:  68


0.15007824994422295 0.12728129114423478
Epoch:  69


0.15005143832516027 0.12682724744081497
Epoch:  70


0.15098152974167386 0.1261416290487562
Epoch:  71


0.1504002132931271 0.12632186072213308
Epoch:  72


0.14994922038671132 0.1265433920281274
Epoch:  73


0.14979990591873993 0.12624628309692656
Epoch:  74


0.14952961617224925 0.12649775509323394
Epoch:  75


0.1494108502929275 0.12679190082209452
Epoch:  76


0.149590762080373 0.1269768700003624
Epoch    76: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  77


0.15087906009442098 0.12674192552055633
Epoch:  78


0.14939626769439593 0.1263836600950786
Epoch:  79


0.15031465565836108 0.12648729447807586
Epoch:  80


0.14873415473345164 0.12625166773796082
Epoch:  81


0.14934349623886314 0.12640795218093054
Epoch:  82


0.1499061854304494 0.12672545547996247
Epoch    82: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  83


0.14913359364947756 0.1265870417867388
Epoch:  84


0.15054073486779188 0.12616957617657526
Epoch:  85


0.1502662288981515 0.12608012131282262
Epoch:  86


0.14911592651057887 0.1262104426111494
Epoch:  87


0.14898952841758728 0.12614215378250396
Epoch:  88


0.14902469475527066 0.12631957339388983
Epoch:  89


0.14888334556205854 0.12634829325335367
Epoch:  90


0.1493384370932708 0.12615491769143514
Epoch:  91


0.14980485068785176 0.1262046577675002
Epoch    91: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  92


0.14904360513429385 0.126168535224029
Epoch:  93


0.14909509589543213 0.12645878110613143
Epoch:  94


0.14951738997085676 0.12637414038181305
Epoch:  95


0.14947655998371742 0.12631931049483164
Epoch:  96


0.14952115634003202 0.12627078060592925
Epoch:  97


0.14828423954345085 0.12632037379911967
Epoch    97: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  98


0.15014397534164223 0.1259604894689151
Epoch:  99


0.14934480069456874 0.12603557854890823
