In [1]:
# Parameters
# Index into `mobilenet_v2.features`: Task5Model keeps the pretrained weights of
# the blocks up to and including this index and re-initialises every later block.
# The value is also embedded in the checkpoint filename written by the training loop.
until_x = 13


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 logmelspec_dir='../../data/logmelspec', n_frames=635):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation, indexed by ``audio_filename`` (the index must
        carry that name). The first three rows of every file are used as its
        three annotation sets; any further rows are ignored.
    unknown_to_known : dict
        Maps the name of an "unknown" column to the list of "known" label
        columns it makes unreliable. Unknown columns are removed from the
        targets.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed spectrogram.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, frames, bands)`` where ``frames`` is at most
        ``n_frames``; every file must yield the same shape.
    y : numpy.ndarray
        Shape ``(n_files, n_labels, 3)``: label values of annotation 1, 2, 3.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 everywhere except 0 for the known labels of an
        annotation whose corresponding unknown column is above 0.5.

    The caller's ``df`` is not modified.
    """
    annotation_numbers = (1, 2, 3)

    # Number the annotations of each file 1, 2, 3, ... in row order.
    df = df.reset_index()
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns off from the real targets.
    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from an all-ones mask and switch off the labels an annotator
    # could not resolve.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so that one annotation number selects
    # one row per file, with files in the same sorted order every time.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    y = np.stack(
        [df.loc[[number], :].values for number in annotation_numbers], axis=2)
    y_mask = np.stack(
        [y_mask.loc[[number], :].values for number in annotation_numbers], axis=2)

    # Load the spectrograms in the same file order as the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared augmentation object (default arguments); AudioDataset.__getitem__
# looks this module-level name up and applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with per-sample targets and loss weights.

    Parameters
    ----------
    X : torch.Tensor
        Samples stacked along dimension 0. The augmentation path cycles each
        sample along its dimension 1, so a sample is expected to look like
        ``(1, time, band)`` (this is what ``prepare_data`` produces).
    y : torch.Tensor
        Targets, indexed along dimension 0 like ``X``.
    weights : torch.Tensor
        Loss weights / masks, indexed along dimension 0 like ``X``.
    transform : callable or None
        Albumentations-style callable invoked as ``transform(image=array)`` and
        returning a dict with an ``'image'`` entry. When falsy, samples are
        returned untouched.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Tensor -> PIL converter used to hand samples to the transform.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples."""
        return self.X.shape[0]

    @staticmethod
    def _min_max_scale(sample):
        """Scale ``sample`` to [0, 1]; return ``(scaled, minimum, value_range)``.

        A constant sample has a value range of zero. Dividing by it would fill
        the sample with NaN, so the divisor is replaced by one in that case and
        the scaled sample is all zeros. ``scaled * value_range + minimum``
        restores the original values either way.
        """
        minimum = sample.min()
        value_range = sample.max() - minimum
        divisor = torch.where(
            value_range > 0, value_range, torch.ones_like(value_range))
        return (sample - minimum) / divisor, minimum, value_range

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for ``idx``, augmented if enabled."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation (undone at the end of the branch)
            sample, this_min, this_range = self._min_max_scale(sample)

            # randomly cycle the file: rotate along dimension 1 by a random offset
            offset = np.random.randint(sample.shape[1])
            sample = torch.cat(
                [sample[:, offset:, :], sample[:, :offset, :]], dim=1)

            # apply albumentations transforms on a NumPy image
            image = np.array(self.pil(sample))
            sample = self.transform(image=image)['image']
            # restore the leading channel dimension and swap the last two axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level `random_erasing` object)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * this_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only ever point this at a metadata.pkl produced by a trusted pipeline.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map every fine label whose fine_id is 'X' to the list of fine labels with a
# different fine_id that share its coarse category. The self-merge on 'coarse'
# yields the columns 'fine_x' (the 'X' label) and 'fine_y' (its siblings) via
# pandas' default suffixes.
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# Fine labels whose fine_id is not 'X'.
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Put the coarse and fine annotation columns side by side for each split
# (metadata's '*_test' tables are used as the validation split here).
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# Manual correction for one data point: any row whose values sum to exactly 37
# is overwritten with zeros, in place, in both splits.
for annotations in (train_df, valid_df):
    row_sums_to_37 = annotations.sum(axis=1) == 37
    annotations.loc[row_sums_to_37, :] = 0

In [6]:
# Build the spectrogram inputs (X), the per-annotation targets (y) and the
# matching loss masks (y_mask) for the training and validation splits.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# One mean / standard deviation per position on the last axis, estimated on the
# training inputs only and reused for validation so both splits share a scale.
# The channel count is read from the data instead of being hard-coded.
# NOTE: this cell overwrites train_X / valid_X; re-running it without re-running
# the prepare_data cell normalises the data a second time.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in order to each training sample by AudioDataset.__getitem__:
#   1. ShiftScaleRotate - random shift / scale / rotation within the given limits
#      (rotate_limit=0.5 keeps rotation tiny; units are degrees per the
#      albumentations docs - TODO confirm for the installed version).
#   2. GridDistortion - library defaults.
#   3. ToTensor - converts the augmented image to a torch tensor.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset receives the augmentation pipeline; validation
# samples are served unchanged.
train_tensors = [torch.Tensor(array) for array in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(array) for array in (valid_X, valid_y, valid_y_mask)]

train_dataset = AudioDataset(*train_tensors, albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, None)

batch_size = 64
val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# Two independently shuffled views of the same training set; the training loop
# zips them to obtain the two batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is usable and fall back to the CPU otherwise, so
# the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` when it offers a ``reset_parameters`` method.

    Objects without that attribute are left untouched, which makes the
    function safe to hand to ``nn.Module.apply``.
    """
    absent = object()
    reset = getattr(layer, 'reset_parameters', absent)
    if reset is absent:
        return
    reset()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    The network is: a small 1x1-convolution adapter (1 -> 10 -> 3 channels),
    the ``features`` part of a pretrained torchvision MobileNetV2, a global
    max over the two trailing dimensions, and a two-layer classifier head that
    outputs ``num_classes`` raw scores (no sigmoid is applied here).

    Reads the module-level ``until_x`` and ``weight_reset`` at construction.
    """

    def __init__(self, num_classes):
        """Build the layers; ``num_classes`` is the size of the output layer."""

        super().__init__()

        # Adapter from 1 input channel to 3 channels using 1x1 convolutions
        # (presumably because the pretrained backbone expects 3-channel input).
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        # Pretrained backbone; only its `.features` part is used in forward().
        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classifier head on the 1280 pooled feature channels.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))
        
        # Reset after ith layer of mv2
        # Children of mv2.features with index > until_x get weight_reset applied
        # (via Module.apply, i.e. to the child and all of its submodules), so
        # only the earlier blocks keep their pretrained weights.
        for i, x in enumerate(self.mv2.features.children()):
            if i > until_x:
                x.apply(weight_reset)

    def forward(self, x):
        """Return one row of ``num_classes`` scores per input sample."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: max over the last dimension, then over the next.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [13]:
# Instantiate the model
# 31 output scores - presumably one per target column left after the "unknown"
# columns are dropped; TODO confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over every model parameter.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The training loop steps this scheduler with the validation loss; the learning
# rate is reduced once that loss has stopped improving for `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps one loss value per element so the training loop can
# multiply by the masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1               # both parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(outputs, labels, masks):
    """Masked loss averaged over all annotation sets.

    ``outputs`` holds one row of scores per sample; ``labels`` and ``masks``
    carry an extra trailing dimension with one slice per annotation set. The
    element-wise ``criterion`` is evaluated against every slice, weighted by
    the matching mask slice, summed, and divided by the total mask weight.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for batch_1, batch_2 in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently drawn batches with one coefficient
        # per sample, applied identically to inputs, labels and masks
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, batch_1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam * batch_1[0]) + ((1 - lam) * batch_2[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1))
        labels = (lam * batch_1[1]) + ((1 - lam) * batch_2[1])
        masks = (lam * batch_1[2]) + ((1 - lam) * batch_2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            this_epoch_valid_loss += masked_bce_loss(outputs, labels, masks).item()

    # mean of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the final epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6059019621965047 0.4541343535695757
Epoch:  1


0.27502335769099157 0.17153444247586386
Epoch:  2


0.17454575445200946 0.14996140130928584
Epoch:  3


0.16448476387036814 0.17063543839114054
Epoch:  4


0.15950212043684883 0.14020215294190816
Epoch:  5


0.15801358786789146 0.13556247843163355
Epoch:  6


0.15701778112231074 0.1381280975682395
Epoch:  7


0.15542936486166878 0.13351037353277206
Epoch:  8


0.15587851485690554 0.12831378195966994
Epoch:  9


0.15405620191548322 0.13545321140970504
Epoch:  10


0.1536578926685694 0.13332107024533407
Epoch:  11


0.1533365986637167 0.1299674894128527
Epoch:  12


0.1518620458003637 0.12931975296565465
Epoch:  13


0.1511766185631623 0.1328465502176966
Epoch:  14


0.1514815039731361 0.13648286781140737
Epoch    14: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  15


0.14884790778160095 0.12515547020094736
Epoch:  16


0.1485971800378851 0.12422718214137214
Epoch:  17


0.14762806248020482 0.12409233514751707
Epoch:  18


0.1476761267797367 0.12379961035081319
Epoch:  19


0.14643326764171188 0.1233145147562027
Epoch:  20


0.14740088783405922 0.1239924430847168
Epoch:  21


0.14734542812850024 0.12291497098548072
Epoch:  22


0.1464516045273961 0.12322245006050382
Epoch:  23


0.14679880842969223 0.1228846822466169
Epoch:  24


0.14611424344616966 0.12285863714558738
Epoch:  25


0.1461131443848481 0.12388030226741518
Epoch:  26


0.1465690651455441 0.12279039940663747
Epoch:  27


0.14578244613634572 0.12276969637189593
Epoch:  28


0.14505640598567757 0.122239031961986
Epoch:  29


0.14508785186587153 0.12253225062574659
Epoch:  30


0.14520225291316574 0.12213392875024251
Epoch:  31


0.1455722418991295 0.12286548635789327
Epoch:  32


0.1458986566678898 0.12220763521535057
Epoch:  33


0.14560201039185394 0.1230673183287893
Epoch:  34


0.1465829349047429 0.12207803768771035
Epoch:  35


0.14410564303398132 0.12328519352844783
Epoch:  36


0.14526843621924118 0.12186684033700398
Epoch:  37


0.14381629993786682 0.12182910314628057
Epoch:  38


0.1447624604444246 0.12234483659267426
Epoch:  39


0.14337476886607506 0.12169237754174642
Epoch:  40


0.14450153426544085 0.1218455114534923
Epoch:  41


0.14351182088658615 0.12185767399413246
Epoch:  42


0.1434857647966694 0.12175503905330386
Epoch:  43


0.14342423991577044 0.1238041775567191
Epoch:  44


0.14429127324271845 0.12168664378779274
Epoch:  45


0.14351405807443568 0.12219056487083435
Epoch    45: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  46


0.14313961686314763 0.12158623763493129
Epoch:  47


0.14366276642760714 0.12159333271639687
Epoch:  48


0.14338626249416456 0.12158599176577159
Epoch:  49


0.14304186001017288 0.12138080916234426
Epoch:  50


0.14279559697653796 0.12142708258969444
Epoch:  51


0.1427843292822709 0.12154374484504972
Epoch:  52


0.14335506469816775 0.12161296393190112
Epoch:  53


0.14243743790162577 0.12133614080292839
Epoch:  54


0.14355181278409185 0.12128067016601562
Epoch:  55


0.1433808582054602 0.12129165977239609
Epoch:  56


0.1426544036414172 0.1212223938533238
Epoch:  57


0.14225003364923838 0.12129511684179306
Epoch:  58


0.1417825693214262 0.12142830129180636
Epoch:  59


0.1421134399401175 0.12162817376000541
Epoch:  60


0.14396516052452293 0.12096403752054487
Epoch:  61


0.1419262805500546 0.12115181982517242
Epoch:  62


0.14317388389561628 0.12128113635948726
Epoch:  63


0.1426612928106978 0.12121472401278359
Epoch:  64


0.14199864824075956 0.12133979690926415
Epoch:  65


0.1427577560817873 0.12122493450130735
Epoch:  66


0.14402006324884054 0.1211302535874503
Epoch    66: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  67


0.1430338516428664 0.12106756546667644
Epoch:  68


0.14249383396393545 0.12112654426268168
Epoch:  69


0.1421132454195538 0.12138418214661735
Epoch:  70


0.14258649341157964 0.12142844498157501
Epoch:  71


0.1426459397818591 0.1213337310722896
Epoch:  72


0.1414786468486528 0.12132007096494947
Epoch    72: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  73


0.14227561974847638 0.1211475527712277
Epoch:  74


0.143242611272915 0.12131479276078087
Epoch:  75


0.14210377633571625 0.12129912099667958
Epoch:  76


0.14256872277002078 0.12124329486063548
Epoch:  77


0.14063119324477943 0.12125873246363231
Epoch:  78


0.14260141430674372 0.12122957408428192
Epoch    78: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  79


0.14321737595506617 0.12107919582298823
Epoch:  80


0.1423168355548704 0.12120700840439115
Epoch:  81


0.14103495390028567 0.12118018524987358
Epoch:  82


0.14208485427740458 0.12116613558360509
Epoch:  83


0.1417969682732144 0.1213882544210979
Epoch:  84


0.14123392548110034 0.12124389729329518
Epoch:  85
