In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. The index must be named ``audio_filename``
        and every column is treated as a label column. Rows sharing the same
        ``audio_filename`` are numbered 1, 2, ... in order of appearance;
        only annotations 1..``n_annotations`` are used.
    unknown_to_known : dict
        Maps an "unknown" label column to the list of known label columns it
        invalidates. The key columns are removed from the targets; wherever
        a key column is > 0.5 the listed known columns get a mask of 0.
    spec_dir : str, optional
        Directory holding ``<audio_filename>.npy`` spectrograms. Defaults to
        the location that was previously hard-coded.
    n_frames : int, optional
        Number of leading frames kept from every (transposed) spectrogram.
        Shorter spectrograms are not padded, so stacking them will fail.
    n_annotations : int, optional
        Number of annotations per recording stacked on the last axis.

    Returns
    -------
    X : numpy.ndarray, shape (n_recordings, 1, n_frames, n_bins)
    y : numpy.ndarray, shape (n_recordings, n_labels, n_annotations)
    y_mask : numpy.ndarray, same shape as ``y``; 1 = contributes to the
        loss, 0 = ignored.
    """
    df = df.reset_index()
    # Running annotation number (1-based) within each recording.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from "everything counts", then switch off the known labels that
    # an annotator replaced with the corresponding unknown label.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so that `.loc[[k]]` selects the k-th
    # annotation of every recording, sorted by filename.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack(
        [y_mask.loc[[k], :].values for k in annotation_ids], axis=2)

    # Same filename order as the rows of y / y_mask.
    filenames = (df.loc[[1], :].index
                 .get_level_values('audio_filename').tolist())
    X = np.stack([
        np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)  # add the channel axis

    return X, y, y_mask


# Module-level RandomErasing augmenter built with the class defaults.
# AudioDataset.__getitem__ looks this name up globally and applies it to every
# augmented sample. RandomErasing is the project-local class imported at the
# top of the notebook (presumably ../RandomErasing.py -- confirm).
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Spectrogram dataset with optional training-time augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs, indexed along the first axis. When ``transform`` is given,
        each sample must be 3-D (the code slices ``sample[:, i:, :]``).
    y : torch.Tensor
        Targets, indexed along the first axis.
    weights : torch.Tensor
        Per-target loss masks/weights, indexed along the first axis.
    transform : callable or None
        Called as ``transform(image=array)``; must return a mapping whose
        ``'image'`` entry is a tensor. A falsy value disables all
        augmentation, and samples are then returned untouched.

    ``__getitem__`` returns ``(sample, y[idx], weights[idx])``.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Tensor -> PIL converter used to hand samples to the image transform.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the (optionally augmented) sample with its targets and weights."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation to [0, 1]; the range is remembered so it
            # can be undone after augmentation.
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would otherwise be divided by zero and
                # come back as NaNs; a unit range maps it to all zeros and
                # the inverse transform restores the constant.
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            # NOTE(review): the round trip through a PIL image presumably
            # quantises the sample to 8 bits -- confirm this is intended.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # Re-add the leading axis and swap the last two axes
            # (presumably the transform returns the image transposed --
            # confirm against the ToTensor version in use).
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level `random_erasing` instance)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code while unpickling -- only open a
# metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# For every taxonomy row with fine_id == 'X', collect the `fine` names of the
# rows with fine_id != 'X' that share its `coarse` value:
# {unknown fine name: [known fine names]}.
unknown_to_known = (
    metadata['taxonomy_df']
    .loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']]
    .merge(
        metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
        on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = (
    metadata['taxonomy_df']
    .loc[metadata['taxonomy_df'].fine_id != 'X', 'fine']
    .tolist())

# Coarse and fine annotation tables side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point: any annotation row whose labels sum
# to exactly 37 has all of its labels cleared.
# NOTE(review): 37 presumably equals the total number of label columns, i.e.
# "every label ticked" -- confirm.
train_df.loc[train_df.sum(axis=1).eq(37), :] = 0
valid_df.loc[valid_df.sum(axis=1).eq(37), :] = 0

In [5]:
# Build inputs, targets and loss masks for both splits. Per prepare_data,
# X is (n_recordings, 1, frames, bins) and y / y_mask are
# (n_recordings, n_labels, 3), one slice per annotation.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Mean and standard deviation are computed per bin of the last axis, on the
# training split only, and then applied to both splits. The bin count is read
# from the data instead of being hard-coded to 128.
# NOTE: this cell overwrites train_X / valid_X, so running it twice
# normalises twice -- re-run the prepare_data cell first.
channel_means = (train_X.reshape(-1, train_X.shape[-1])
                 .mean(axis=0).reshape(1, 1, 1, -1))
channel_stds = (train_X.reshape(-1, train_X.shape[-1])
                .std(axis=0).reshape(1, 1, 1, -1))
# A bin that is constant over the whole training set has zero spread; divide
# it by 1 so it becomes 0 instead of inf/NaN.
channel_stds = np.where(channel_stds == 0, 1.0, channel_stds)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Applied by AudioDataset.__getitem__ as `transform(image=...)['image']`, so
# the pipeline must end in a step that returns a tensor (ToTensor here).
# NOTE(review): rotate_limit=0.5 is presumably in degrees, i.e. an almost
# imperceptible rotation -- confirm this is the intended magnitude.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples pass through untouched.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the sample pairs that mixup blends.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# `cuda` states whether a GPU is wanted; the GPU is only selected when one is
# actually available, so the notebook still runs (slowly) on CPU-only
# machines instead of failing when the model is moved to the device.
cuda = True
device = torch.device('cuda:0' if cuda and torch.cuda.is_available() else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it knows how to.

    Objects exposing ``reset_parameters`` have it called; anything else is
    left alone. Intended to be passed to ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default value of the `until_x` parameter; the "# Parameters" cell that
# follows overrides it.
# NOTE(review): looks like a papermill-style parameters / injected-parameters
# cell pair -- confirm.
until_x = None

In [12]:
# Parameters
# Overrides the default set in the previous cell. Task5Model re-initialises
# every child of mv2.features whose index is <= until_x, and the training
# loop embeds the value in the checkpoint filename.
until_x = 12


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor with a small classification head.

    Pipeline: ``bw2col`` maps the single input channel to the 3 channels
    the MobileNetV2 features expect, ``mv2.features`` extracts a 1280-channel
    feature map, a global max over the two trailing axes pools it to a
    vector, and ``final`` produces ``num_classes`` logits.

    The module-level ``until_x`` controls how much of the pretrained network
    is discarded: every child of ``mv2.features`` with index ``<= until_x``
    is re-initialised through ``weight_reset``. ``until_x = None`` keeps all
    pretrained weights (previously this raised ``TypeError``).

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    """

    def __init__(self, num_classes):

        super().__init__()

        # Learned 1 -> 3 channel adapter made of 1x1 convolutions.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        # Only `self.mv2.features` is used in forward(); the rest of the
        # torchvision model stays attached and is part of the state dict.
        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2 (inclusive); None means "reset nothing".
        if until_x is not None:
            for i, x in enumerate(self.mv2.features.children()):
                if i <= until_x:
                    x.apply(weight_reset)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for a 1-channel input batch."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling over the two trailing (spatial) axes.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# NOTE(review): 31 is the number of output logits. The loss compares the
# outputs with labels[:, :, k], so it must equal the number of label columns
# that prepare_data keeps after dropping the unknown ones -- confirm.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop
# can apply the per-label masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def masked_annotation_loss(outputs, labels, masks, loss_fn):
    """Masked element-wise loss averaged over everything that is not masked out.

    Parameters
    ----------
    outputs : torch.Tensor, shape (batch, n_classes)
        Model logits; the same logits are scored against every annotation.
    labels, masks : torch.Tensor, shape (batch, n_classes, n_annotations)
        Targets and their weights; a mask of 0 removes an entry from the loss.
    loss_fn : callable
        Element-wise loss (no reduction), e.g. BCEWithLogitsLoss(reduction='none').

    Returns
    -------
    torch.Tensor
        Scalar: sum of masked losses divided by the sum of the masks. A fully
        masked batch yields 0 rather than NaN.
    """
    masked_sums = (
        (loss_fn(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return sum(masked_sums) / masks.sum().clamp(min=1e-8)


epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training phase ------------------------------------------------
    # Note: this loss is measured on augmented, mixed-up batches with the
    # model in train mode, so it is not directly comparable with the
    # validation loss below.
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One Beta(alpha, alpha) coefficient per sample blends the batch from
        # loader 1 with the batch from loader 2 (inputs, labels and masks).
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            # one masked loss over all sets of annotations
            loss = masked_annotation_loss(outputs, labels, masks, criterion)
            loss.backward()
            optimizer.step()
            this_epoch_train_loss += loss.item()

    # ---- validation phase ----------------------------------------------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks, criterion)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch means.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Keep only the checkpoint with the best validation loss so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping: leaves the loop before this epoch's losses are printed
    # and before the scheduler is stepped.
    if epochs_without_new_lowest >= early_stopping_patience:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6373038791321419 0.48155745438167025
Epoch:  1


0.33090578905633977 0.20369982506547654
Epoch:  2


0.1936631263107867 0.19722892343997955
Epoch:  3


0.18166015397857976 0.16938648053577968
Epoch:  4


0.1777418759223577 0.1692062850509371
Epoch:  5


0.1754627884239764 0.15757490268775395
Epoch:  6


0.17492134506637985 0.1594044097832271
Epoch:  7


0.17319637980010058 0.15828172436782292
Epoch:  8


0.1724019803710886 0.156734613435609
Epoch:  9


0.17024299139912064 0.1644820613520486
Epoch:  10


0.16998517231361285 0.1499431026833398
Epoch:  11


0.16885937750339508 0.16011387961251394
Epoch:  12


0.16941162984113436 0.15473877957889012
Epoch:  13


0.1656465759953937 0.16141036578587123
Epoch:  14


0.16742717897569812 0.1506877158369337
Epoch:  15


0.16453789094009916 0.14397967819656646
Epoch:  16


0.16541373608885585 0.1458765917590686
Epoch:  17


0.16481443514695038 0.14132797185863769
Epoch:  18


0.1634530610329396 0.1440062459026064
Epoch:  19


0.16228633998213587 0.14491043984889984
Epoch:  20


0.16152517215625659 0.13507182789700373
Epoch:  21


0.16264321836265358 0.1349599872316633
Epoch:  22


0.16017915630662763 0.13440723610775812
Epoch:  23


0.15881078025779208 0.13662414039884294
Epoch:  24


0.16066441262090528 0.13723568618297577
Epoch:  25


0.1582911509919811 0.13712769533906663
Epoch:  26


0.15920303480045214 0.13469730956213816
Epoch:  27


0.15791612702447014 0.13266630257878984
Epoch:  28


0.15744761156069265 0.13280786680323736
Epoch:  29


0.15664942925040787 0.13347019468035018
Epoch:  30


0.15698352738006696 0.1318722399217742
Epoch:  31


0.15717120186702624 0.13135523987667902
Epoch:  32


0.15695221158298286 0.12973274609872273
Epoch:  33


0.15653186593506788 0.13008985136236464
Epoch:  34


0.1570210416574736 0.23685111105442047
Epoch:  35


0.15648975042072502 0.12955917843750545
Epoch:  36


0.15682005560075915 0.12957494599478586
Epoch:  37


0.1560484714604713 0.1297604739665985
Epoch:  38


0.15567706001771464 0.12899078322308405
Epoch:  39


0.15492910227260073 0.13049245732171194
Epoch:  40


0.15600108335147034 0.13123060230697906
Epoch:  41


0.15447297611752073 0.12901732751301356
Epoch:  42


0.15509949141257517 0.13153075959001267
Epoch:  43


0.1538546971372656 0.1284374605332102
Epoch:  44


0.15492164806739703 0.13202371980462754
Epoch:  45


0.1526745706796646 0.13019908219575882
Epoch:  46


0.1525096977884705 0.12774354538747243
Epoch:  47


0.15493001768717896 0.12882672676018306
Epoch:  48


0.1536237843133308 0.13280421389000757
Epoch:  49


0.15340746214260925 0.1276817853961672
Epoch:  50


0.1523570599588188 0.12908018699714116
Epoch:  51


0.15300204625000824 0.130307813840253
Epoch:  52


0.15309246327425982 0.12982729715960367
Epoch:  53


0.15348624337363886 0.12805901148489543
Epoch:  54


0.15134239720331655 0.12967196319784438
Epoch:  55


0.1530619787203299 0.12636830338409968
Epoch:  56


0.15308562041939916 0.12967582472733089
Epoch:  57


0.1520626295257259 0.13097204267978668
Epoch:  58


0.1518719192292239 0.12615944764443807
Epoch:  59


0.15142342810695236 0.13094645206417357
Epoch:  60


0.15133556804141482 0.12873149982520513
Epoch:  61


0.1516491468693759 0.1291573228580611
Epoch:  62


0.1523281235147167 0.1270857634288924
Epoch:  63


0.15143095883163246 0.1276685010109629
Epoch:  64


0.1517187965077323 0.12930037932736532
Epoch    64: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  65


0.14879327609732346 0.12473562891994204
Epoch:  66


0.14975084726874893 0.12439555674791336
Epoch:  67


0.14874173298075394 0.1240063128726823
Epoch:  68


0.14807940697347796 0.12439145679984774
Epoch:  69


0.14797961027235598 0.1237260348030499
Epoch:  70


0.14826125269000595 0.12322935887745448
Epoch:  71


0.14877431940388036 0.12380760801689965
Epoch:  72


0.14766115554281184 0.123438261449337
Epoch:  73


0.14836799252677607 0.12344471258776528
Epoch:  74


0.14675729379460617 0.12318251175539834
Epoch:  75


0.14769782810597806 0.12318466284445354
Epoch:  76


0.14781650135645996 0.12358732202223369
Epoch:  77


0.1478730697889586 0.12333724754197258
Epoch:  78


0.1464564453911137 0.12337935396603175
Epoch:  79


0.14655604314159704 0.12353026334728513
Epoch:  80


0.14608079597756668 0.12312497837202889
Epoch:  81


0.1480941144195763 0.12362077193600791
Epoch:  82


0.14702568545534805 0.12354179684604917
Epoch:  83


0.14639556971756187 0.12330836057662964
Epoch:  84


0.14843563292477582 0.12396264395543508
Epoch:  85


0.14711146217745705 0.12370853551796504
Epoch:  86


0.14739082713384885 0.1239456438592502
Epoch    86: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  87


0.1465835994159853 0.12393966210739953
Epoch:  88


0.1484426815767546 0.123738665665899
Epoch:  89


0.14681215866191968 0.12368545787675041
Epoch:  90


0.14665340088509224 0.12389714483703886
Epoch:  91


0.1473271299052883 0.12323720114571708
Epoch:  92


0.14619546363482605 0.1235898786357471
Epoch    92: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  93


0.14710452911016103 0.12347941313471113
Epoch:  94


0.14676404402062698 0.12349714658090047
Epoch:  95


0.14705611684837858 0.12404446516718183
Epoch:  96


0.14744577375618187 0.12394483493907112
Epoch:  97


0.1466513300264204 0.12414673822266716
Epoch:  98


0.14691741120170904 0.12372887986046928
Epoch    98: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  99


0.14550314521467364 0.12380817426102501
