In [1]:
# Parameters
# Position in the MobileNetV2 feature stack used by Task5Model: every child of
# `mv2.features` whose index is greater than this value is re-initialised.
# The value is also embedded in the checkpoint file name written during training.
until_x = 16


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, data_dir='../../data/logmelspec',
                 max_frames=635, n_annotators=3):
    """Build model inputs, per-annotator targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table whose index is named ``audio_filename`` and holds one
        row per (file, annotation). Rows of the same file are numbered
        1, 2, ... in their order of appearance. Columns are label columns,
        including the "unknown" columns named in ``unknown_to_known``.
        The frame passed in is not modified.
    unknown_to_known : dict
        Maps an "unknown" column name to the list of label columns whose mask
        is set to 0 on rows where that unknown column is greater than 0.5.
    data_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    max_frames : int, optional
        The transposed array of each file is truncated to this many rows.
    n_annotators : int, optional
        Number of annotation rows per file that are stacked along the last
        axis of ``y`` and ``y_mask`` (rows numbered 1..n_annotators).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, frames, features)``; files are in sorted
        ``audio_filename`` order.
    y : numpy.ndarray
        Shape ``(n_files, n_labels, n_annotators)``; the unknown columns are
        not part of ``n_labels``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label takes part in the loss, 0 where
        it was masked through ``unknown_to_known``.

    Raises
    ------
    ValueError
        If any file has fewer than ``n_annotators`` annotation rows.
    """
    df = df.reset_index()
    # running annotation number (1, 2, 3, ...) within each audio file
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Fail early with a readable message instead of a shape mismatch inside
    # np.concatenate further down.
    rows_per_file = df.groupby(level='audio_filename').size()
    if (rows_per_file < n_annotators).any():
        raise ValueError(
            'every audio file needs at least {} annotation rows; '
            '{} file(s) have fewer'.format(
                n_annotators, int((rows_per_file < n_annotators).sum())))

    # Start with every label contributing to the loss, then switch off the
    # known labels on rows where the matching "unknown" column is set.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[
            df_unknown[unknown] > 0.5,
            known
        ] = 0

    # Put the annotation number first and sort, so that selecting one
    # annotation number yields the files in the same (sorted) order each time.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_numbers = range(1, n_annotators + 1)

    y = np.concatenate([
        df.loc[[k], :].values[..., np.newaxis]
        for k in annotation_numbers
    ], axis=2)

    y_mask = np.concatenate([
        y_mask.loc[[k], :].values[..., np.newaxis]
        for k in annotation_numbers
    ], axis=2)

    # One array per file, loaded in the same order as the rows of y
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(data_dir, x)).T[:max_frames, :], axis=0)
        for x in df.loc[[1], :].reset_index(1).audio_filename.tolist()])
    # add the single-channel axis
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Single RandomErasing instance (project-local module, default arguments).
# AudioDataset.__getitem__ reads this module-level name when a transform is set.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When a
    ``transform`` is supplied the sample goes through an augmentation
    pipeline; otherwise it is returned exactly as stored.
    """

    def __init__(self, X, y, weights, transform=None):
        """Store the arrays, the optional transform and a tensor-to-PIL converter."""
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples, i.e. the size of the first axis of ``X``."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the (possibly augmented) sample ``idx`` with its target and weight."""
        spec = self.X[idx, ...]
        target = self.y[idx, ...]
        weight = self.weights[idx, ...]

        # No augmentation requested: hand the stored sample back untouched.
        if not self.transform:
            return spec, target, weight

        # Rescale to [0, 1]; the range is kept so the scaling can be undone
        # once the augmentations have run.
        low = spec.min()
        high = spec.max()
        spec = (spec - low) / (high - low)

        # Rotate the sample along axis 1 by a random offset: the part from
        # the offset onwards comes first, the part before it is appended.
        offset = np.random.randint(spec.shape[1])
        tail = spec[:, offset:, :]
        head = spec[:, :offset, :]
        spec = torch.cat((tail, head), dim=1)

        # The transform is called with a NumPy image, obtained via PIL.
        image = np.array(self.pil(spec))
        augmented = self.transform(image=image)['image']
        # Add a leading axis and swap the last two axes of the result.
        augmented = augmented[None, :, :].permute(0, 2, 1)

        # Random erasing works on a detached copy.
        augmented = random_erasing(augmented.clone().detach())

        # Undo the [0, 1] rescaling.
        restored = (augmented * (high - low)) + low

        return restored, target, weight

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open a metadata file that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map each fine label whose fine_id is 'X' to the list of the other fine labels
# that share its coarse label. The merge suffixes the two 'fine' columns as
# 'fine_x' (left, fine_id == 'X') and 'fine_y' (right, fine_id != 'X').
unknown_to_known = (
    pd.merge(
        left=metadata['taxonomy_df'].loc[lambda t: t.fine_id == 'X', ['fine', 'coarse']],
        right=metadata['taxonomy_df'].loc[lambda t: t.fine_id != 'X', ['fine', 'coarse']],
        on='coarse',
        how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())

# Fine labels whose fine_id is not 'X'
known_labels = metadata['taxonomy_df'].loc[lambda t: t.fine_id != 'X', 'fine'].tolist()

# Coarse and fine annotation tables side by side, one frame per split
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose columns sum to exactly 37 is set to 0 across all columns, in
# both splits.
# NOTE(review): 37 is presumably the total number of label columns, i.e. a row
# with every label marked present — confirm.
for _split_df in (train_df, valid_df):
    _split_df.loc[_split_df.sum(axis=1) == 37, :] = 0

In [6]:
# Turn each annotation frame into (inputs, targets, loss masks) arrays.
# prepare_data loads one spectrogram file per audio file from disk.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean and std are computed per position of the last axis of train_X (its size
# is read from the array rather than hard-coded), on the training split only;
# the same statistics are then applied to the validation split.
# NOTE: train_X / valid_X are overwritten here, so re-running this cell without
# re-running the prepare_data cell normalizes the data a second time.
channel_means = train_X.reshape(-1, train_X.shape[-1]).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, train_X.shape[-1]).std(axis=0).reshape(1, 1, 1, -1)
# A channel that is constant over the whole training split has zero std;
# divide by 1 there instead of producing NaN/inf.
channel_stds = np.where(channel_stds > 0, channel_stds, 1.0)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# This pipeline is passed to the training AudioDataset only; the validation
# dataset is created with transform=None.
albumentations_transform = Compose([
    # random shift / scale / rotation, each bounded by the limit given here
    # NOTE(review): rotate_limit=0.5 is presumably in degrees — confirm
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library's default arguments
    GridDistortion(),
    # final conversion step; AudioDataset indexes and permutes the result as a tensor
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop draws one batch from each and mixes them (mixup).
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when PyTorch can see one and fall back to the CPU
# otherwise, so the notebook also runs on a machine without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Call ``layer.reset_parameters()`` when the layer provides that method.

    Objects without a ``reset_parameters`` attribute are left untouched.
    Nothing is returned.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    The input passes through ``bw2col`` (batch norm plus two 1x1 convolutions
    mapping 1 channel to 3), then through ``mv2.features``. The resulting
    feature map is reduced with a maximum over its last two axes and fed to a
    two-layer classification head that outputs ``num_classes`` logits.

    Parameters
    ----------
    num_classes : int
        Size of the output layer.
    reset_after : int, optional
        Children of ``mv2.features`` with an index strictly greater than this
        value are re-initialised (their pretrained weights are discarded);
        children up to and including this index keep their pretrained weights.
        When omitted, the notebook-level parameter ``until_x`` is used.
    """

    def __init__(self, num_classes, reset_after=None):

        super().__init__()

        # Fall back to the notebook parameter so existing calls behave as before.
        if reset_after is None:
            reset_after = until_x

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        # Only `mv2.features` is used in forward(); the classifier that comes
        # with the pretrained network stays registered but is never called.
        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Re-initialise every feature block that comes after index `reset_after`.
        # Module.apply visits each block's submodules recursively.
        for i, x in enumerate(self.mv2.features.children()):
            if i > reset_after:
                x.apply(weight_reset)

    def forward(self, x):
        """Return ``num_classes`` logits per input sample."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # maximum over the last axis, then over the (new) last axis
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [13]:
# Instantiate the model
# 31 output logits: the training loop compares the model output element-wise
# with labels[:, :, k], so this must equal the number of label columns.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam (AMSGrad variant) over every model parameter
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Lowers the learning rate when the value passed to scheduler.step() (the
# validation loss in the training loop) has not improved for `patience` epochs;
# verbose=True prints a message on each reduction.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps one loss value per element so the training loop can
# multiply by the mask before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def masked_bce_loss(outputs, labels, masks):
    """Masked loss averaged over the unmasked entries.

    `outputs` is compared with each slice ``labels[:, :, k]`` using the
    notebook-level `criterion` (which must return one value per element), the
    result is weighted by ``masks[:, :, k]``, and the sum over all slices is
    divided by ``masks.sum()``.
    """
    total = 0
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # stop after this many epochs without a new best validation loss
alpha = 1                     # both parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):
        inputs_1, labels_1, masks_1 = i1
        inputs_2, labels_2, masks_2 = i2

        # mixup the inputs ---------
        # one mixing coefficient per sample, shared by inputs, labels and masks
        mixup_vals = np.random.beta(alpha, alpha, inputs_1.shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.reshape(-1, 1, 1, 1)
        lam_targets = lam.reshape(-1, 1, 1)

        inputs = (lam_inputs * inputs_1) + ((1 - lam_inputs) * inputs_2)
        labels = (lam_targets * labels_1) + ((1 - lam_targets) * labels_2)
        masks = (lam_targets * masks_1) + ((1 - lam_targets) * masks_2)
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation (no augmentation, no mixup, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # mean of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss seen so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6094617231472118 0.42333041770117624
Epoch:  1


0.27938049548381083 0.19202078453132085
Epoch:  2


0.17046774319700292 0.1635352862732751
Epoch:  3


0.16151038376060692 0.13838967787367956
Epoch:  4


0.15775419731397886 0.13096287207944052
Epoch:  5


0.15542793233652372 0.13295517755406244
Epoch:  6


0.15473841050186674 0.12971591523715428
Epoch:  7


0.15284602948137233 5.273292234965733
Epoch:  8


0.1529279149867393 0.13330000851835525
Epoch:  9


0.15356513495380814 0.12721904047897883
Epoch:  10


0.1520688944571727 0.13026167984519685
Epoch:  11


0.1499018520116806 0.12689046561717987
Epoch:  12


0.15045683408105695 0.12638782603400095
Epoch:  13


0.15017865195467667 0.13067547444786345
Epoch:  14


0.1498581005914791 0.12665553497416632
Epoch:  15


0.14899768781017614 0.12696515023708344
Epoch:  16


0.1486371485768138 0.13364975260836737
Epoch:  17


0.14980189582786044 0.12574476535831178
Epoch:  18


0.14829694258200155 0.1283635635461126
Epoch:  19


0.14835689357809118 0.13282443157264165
Epoch:  20


0.14759228358397614 0.12893447705677577
Epoch:  21


0.14791487560078903 0.12456866034439631
Epoch:  22


0.14807897846441012 0.12540439516305923
Epoch:  23


0.14812724171458064 0.1307165569492749
Epoch:  24


0.14728325122111552 0.12551027642829077
Epoch:  25


0.14651774111631755 0.12488422010626112
Epoch:  26


0.14544205045377886 0.1313698451433863
Epoch:  27


0.14704468846321106 0.12594120523759297
Epoch    27: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  28


0.1443574158726512 0.12220769269125802
Epoch:  29


0.14340496304872874 0.12497685104608536
Epoch:  30


0.1436522756073926 0.1226216150181634
Epoch:  31


0.14247062600947716 0.12287767976522446
Epoch:  32


0.1435085136342693 0.12263760289975575
Epoch:  33


0.14215834559621038 0.12343791446515492
Epoch:  34


0.1414495489081821 0.12071294763258525
Epoch:  35


0.14113435431106672 0.12077163585594722
Epoch:  36


0.14107455796486623 0.121628038585186
Epoch:  37


0.14162147447869583 0.12059362551995687
Epoch:  38


0.1430586483832952 0.12106468634946006
Epoch:  39


0.14131672922018412 0.12028781750372478
Epoch:  40


0.1411352201893523 0.12025097757577896
Epoch:  41


0.1417214427445386 0.11959820985794067
Epoch:  42


0.14146952894893852 0.12092104128428868
Epoch:  43


0.14190573345970464 0.12104223029954093
Epoch:  44


0.14108438950938149 0.12068186274596623
Epoch:  45


0.14022513640088005 0.1202102324792317
Epoch:  46


0.1397950544550612 0.12024027215582984
Epoch:  47


0.1397252288219091 0.12115920654364995
Epoch    47: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  48


0.1400880273934957 0.12075659206935338
Epoch:  49


0.13956016584022626 0.12059199384280614
Epoch:  50


0.13989034899183223 0.12068441616637367
Epoch:  51


0.1405943527414992 0.12071641108819417
Epoch:  52


0.13901451612646515 0.12069546111992427
Epoch:  53


0.14102690082949562 0.12056290464741844
Epoch    53: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  54


0.14062373259583036 0.12068121667419161
Epoch:  55


0.14007597638143077 0.1205586503658976
Epoch:  56


0.14001984692908623 0.12067589695964541
Epoch:  57


0.14050886598793236 0.12080253660678864
Epoch:  58


0.13878761795726982 0.12048430527959551
Epoch:  59


0.139803666118029 0.12073446278061185
Epoch    59: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  60


0.1387504989633689 0.12052228408200401
Epoch:  61


0.14005631933341156 0.1206269594175475
Epoch:  62


0.1401509731202512 0.12059963600976127
Epoch:  63


0.1401432337793144 0.1206415104014533
Epoch:  64


0.14003159790425687 0.12071514342512403
Epoch:  65


0.1393224802371618 0.12071021646261215
Epoch    65: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  66
