In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. Its index must be named ``audio_filename``
        (it is reset and grouped under that name). Every file must have at
        least ``n_annotations`` rows, otherwise stacking fails; rows beyond
        the first ``n_annotations`` of a file are ignored. The caller's frame
        is not modified.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of "known" label columns
        it makes ambiguous. The unknown columns are removed from the targets.
    data_dir : str
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int
        Maximum number of rows kept from each transposed array.
    n_annotations : int
        Number of annotation rows per file stacked along the last axis.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, <=n_frames, n_features)``; files sorted by name.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the annotation's matching "unknown"
        column exceeds 0.5, 1 elsewhere.
    """
    df = df.reset_index()
    # Number the annotations of every file 1, 2, 3, ... in row order.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels of every
    # annotation that flagged the corresponding unknown label.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so that each annotation number selects
    # one row per file, with files in the same sorted order for every number.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in annotation_ids], axis=2)

    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(data_dir, name)).T[:n_frames, :]
        for name in filenames])
    # Insert a singleton axis after the file axis.
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (constructed with its default arguments) that
# AudioDataset.__getitem__ applies to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along dimension 0; each item is treated as a 3-D
        tensor (it is sliced as ``sample[:, i:, :]``).
    y : torch.Tensor
        Targets, indexed along dimension 0 in step with ``X``.
    weights : torch.Tensor
        Per-target weights/masks, indexed along dimension 0 in step with ``X``.
    transform : callable or None
        Augmentation called as ``transform(image=array)['image']``. When
        falsy, samples are returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converts the tensor sample to a PIL image before the array-based
        # augmentation pipeline is applied.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of ``X`` along dimension 0)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return sample ``idx``, augmented when a transform is configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would fill the
            # sample with NaN. Use a unit range so it maps to zeros and is
            # restored exactly by the inverse transform below.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along dimension 1 by a random
            # offset so the sample starts at a different position
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the last two axes of the result
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (uses the module-level `random_erasing`)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open a metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# Split the taxonomy into the "unknown" fine labels (fine_id == 'X') and the rest.
taxonomy_df = metadata['taxonomy_df']
unknown_rows = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# Pair every unknown fine label with the known fine labels that share its
# coarse class, then collect them as {unknown_label: [known_label, ...]}.
fine_pairs = (
    pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = fine_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_rows.fine.tolist()

# Put coarse and fine annotations side by side for each split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Any row whose columns sum to exactly 37 is zeroed out, in both splits.
for split_df in (train_df, valid_df):
    split_df.loc[split_df.sum(axis=1) == 37, :] = 0

In [5]:
# Build inputs, targets and loss masks for both splits with the same label mapping.
train_X, train_y, train_y_mask = prepare_data(
    df=train_df, unknown_to_known=unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(
    df=valid_df, unknown_to_known=unknown_to_known)

In [6]:
# Channel wise normalization
# One mean/std per position of the last axis, estimated on the training split
# only and reused for validation. The channel count is read from the data
# instead of being hard-coded.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Applied in order: small shift/scale/rotation, grid distortion, then
# conversion of the result to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
batch_size = 64

train_tensors = tuple(torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask))
valid_tensors = tuple(torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask))

# Only the training set is augmented.
train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to draw the pairs of batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when PyTorch can see one, otherwise fall back to the CPU
# so the notebook still runs on a machine without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Call ``layer.reset_parameters()`` if the layer defines it; otherwise do nothing.

    Written to be passed to ``Module.apply`` so that every sub-module exposing
    ``reset_parameters`` is re-initialised.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Placeholder default for the `until_x` notebook parameter; the "Parameters"
# cell below assigns the value actually used.
# NOTE(review): Task5Model compares `i <= until_x`, so leaving this as None
# raises a TypeError when the model is built.
until_x = None

In [12]:
# Parameters
# until_x: children of mv2.features with enumerate index <= until_x are
# re-initialised by Task5Model; the value is also part of the checkpoint filename.
# NOTE(review): presumably injected by a notebook-parameterisation tool — confirm.
until_x = 13


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    A small 1x1-convolution stem maps the single input channel to 3 channels,
    the MobileNetV2 ``features`` stack processes the result, the feature map
    is max-pooled over its last two dimensions, and a two-layer head emits
    ``num_classes`` logits.

    The leading children of ``mv2.features`` (enumerate index <= the
    notebook-level ``until_x``) are re-initialised instead of keeping their
    pretrained weights.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions.
        stem_layers = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*stem_layers)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        head_layers = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head_layers)

        # Reset until ith layer of mv2.
        # `until_x` is read from the enclosing namespace and must be
        # comparable with an int (None raises a TypeError here).
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                block.apply(weight_reset)

    def forward(self, x):
        """Return the logits computed from input batch ``x``."""
        coloured = self.bw2col(x)
        feature_map = self.mv2.features(coloured)
        # Max over the last dimension, then over the new last dimension.
        pooled = feature_map.max(dim=-1).values.max(dim=-1).values
        return self.final(pooled)

In [14]:
# Instantiate the model
# The network produces 31 logits per sample.
num_classes = 31
model = Task5Model(num_classes)
model = model.to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# The scheduler is stepped with the validation loss after every epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    patience=5,
    verbose=True,
)
# Element-wise losses are kept (no reduction) so they can be masked before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def masked_bce_loss(outputs, labels, masks):
    """Masked loss averaged over every annotation set.

    `criterion` (element-wise, no reduction) is evaluated between `outputs`
    and each slice `labels[:, :, k]`, weighted by `masks[:, :, k]`, summed
    over all slices and divided by `masks.sum()`.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1               # Beta(alpha, alpha) parameter of the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One mixing weight per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, i1[0].shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.view(-1, 1, 1, 1)
        lam_targets = lam.view(-1, 1, 1)

        inputs = (lam_inputs * i1[0]) + ((1 - lam_inputs) * i2[0])
        labels = (lam_targets * i1[1]) + ((1 - lam_targets) * i2[1])
        masks = (lam_targets * i1[2]) + ((1 - lam_targets) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        loss = masked_bce_loss(model(inputs), labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            loss = masked_bce_loss(model(inputs), labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6348149244849747 0.5074271730014256
Epoch:  1


0.3261410755080146 0.21467718056270055
Epoch:  2


0.19099872418352076 0.16844362659113749
Epoch:  3


0.1794001415774629 0.16673051246574946
Epoch:  4


0.17824202695408384 0.16400105612618582
Epoch:  5


0.17677022395907221 0.16486880396093642
Epoch:  6


0.1751593431910953 0.16151499109608786
Epoch:  7


0.17443918940183278 0.16623867835317338
Epoch:  8


0.17187765160122434 0.15926972670214518
Epoch:  9


0.17161364209007574 0.160632991365024
Epoch:  10


0.17254699202808174 0.15798045694828033
Epoch:  11


0.1698169120260187 0.15537547852311814
Epoch:  12


0.16971903757469073 0.15879796232495988
Epoch:  13


0.16922536572894534 0.1591700783797673
Epoch:  14


0.16817473116758708 0.15271754562854767
Epoch:  15


0.16862151429459854 0.15028979735715048
Epoch:  16


0.16624752331424403 0.14737151456730707
Epoch:  17


0.16627010018438906 0.14554002029555185
Epoch:  18


0.1644086821659191 0.14235893424068177
Epoch:  19


0.16457034848831795 0.1401932133095605
Epoch:  20


0.16489216121467384 0.14830162695475987
Epoch:  21


0.16362836876431028 0.14175221217530115
Epoch:  22


0.16305420245673205 0.14662457363946096
Epoch:  23


0.1627557877753232 0.13977890461683273
Epoch:  24


0.16265275107847676 0.1513437373297555
Epoch:  25


0.15997617188337687 0.13495523588997976
Epoch:  26


0.15970862918608897 0.13599678874015808
Epoch:  27


0.16052261276825056 0.1476525515317917
Epoch:  28


0.161463662981987 0.13635938933917455
Epoch:  29


0.15945680157558337 0.13974982074328832
Epoch:  30


0.15991441341670784 0.1410470147218023
Epoch:  31


0.1581908339584196 0.13368791341781616
Epoch:  32


0.15726063058182999 0.1333231542791639
Epoch:  33


0.15854315620821877 0.13337591184037073
Epoch:  34


0.15836499953592145 0.1386722390140806
Epoch:  35


0.15736466525374232 0.1327985610280718
Epoch:  36


0.1591264009475708 0.13433408737182617
Epoch:  37


0.15752500215092222 0.13366644723074778
Epoch:  38


0.1560425093850574 0.1345797394003187
Epoch:  39


0.15788873667652542 0.1347425218139376
Epoch:  40


0.15654706149487882 0.13419517342533385
Epoch:  41


0.15668917910472765 0.13367812122617448
Epoch    41: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  42


0.156413463321892 0.12752909000430787
Epoch:  43


0.1541742630101539 0.12783108864511764
Epoch:  44


0.1543919138006262 0.12782967516354152
Epoch:  45


0.15393086945688403 0.1281674685222762
Epoch:  46


0.15461629266674454 0.1282502082841737
Epoch:  47


0.15431794846380079 0.12808130575077875
Epoch:  48


0.1522130833284275 0.12815007673842566
Epoch    48: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  49


0.15302465292247566 0.12831133071865355
Epoch:  50


0.15371368905982455 0.12800192939383642
Epoch:  51


0.1530824488884694 0.12802090921572276
Epoch:  52


0.15307127382304217 0.12761706752436502
Epoch:  53


0.15253526536194054 0.12816386031252996
Epoch:  54


0.15327217933293935 0.12771821447781154
Epoch    54: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  55


0.15223320754798683 0.12774338041033065
Epoch:  56


0.15310743934399373 0.12791566870042256
Epoch:  57


0.15245758721957337 0.12796068510838918
Epoch:  58


0.15382928501915288 0.12790644381727492
Epoch:  59


0.15203289445993062 0.12803819349833898
Epoch:  60


0.15292485018034238 0.12765358814171382
Epoch    60: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  61


0.15336550570823052 0.12796926179102489
Epoch:  62


0.15326134618875142 0.12818448032651628
Epoch:  63


0.15285472813490275 0.127875060907432
Epoch:  64


0.15282451703741745 0.12790632035051072
Epoch:  65


0.15222584315248439 0.12750147708824702
Epoch:  66


0.15261846659956751 0.12764689539160048
Epoch:  67


0.15358885035321518 0.12748428859880992
Epoch:  68


0.15123141657661748 0.12784318625926971
Epoch:  69


0.15211096043522293 0.12781484105757304
Epoch:  70


0.1534246423759976 0.1276436767407826
Epoch:  71


0.15284886754847862 0.12807353798832213
Epoch:  72


0.15336682667603363 0.12781070811407907
Epoch:  73


0.15154421208678065 0.1276169600231307
Epoch    73: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  74


0.15312934848102364 0.12793392900909697
Epoch:  75


0.15319231796909022 0.12809014533247268
Epoch:  76


0.1524977720267064 0.12764026437486922
Epoch:  77


0.15248491353279836 0.12772323936223984
Epoch:  78


0.15238776078095306 0.12780489772558212
Epoch:  79


0.15219723775580124 0.12798402351992472
Epoch:  80


0.15361577514055613 0.12793173428092683
Epoch:  81


0.15262062364333384 0.12841974305255072
Epoch:  82


0.1539186436582256 0.12816061079502106
Epoch:  83


0.1525699588898066 0.1278463335973876
Epoch:  84


0.15324402580390106 0.12762401678732463
Epoch:  85


0.15301245571793737 0.12819562000887735
Epoch:  86


0.1529346563526102 0.1277089353118624
Epoch:  87


0.1538218175237243 0.12827551364898682
Epoch:  88


0.15255548140487155 0.12785180445228303
Epoch:  89


0.15364573291830114 0.12773243337869644
Epoch:  90


0.15269940567983165 0.12772198340722493
Epoch:  91


0.15307409497531685 0.12817920105797903
Epoch:  92
