In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known):
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df.set_index(['audio_filename', 'slno'], inplace=True)

    df_unknown = df.copy().loc[:, list(unknown_to_known.keys())]
    df.drop(columns=list(unknown_to_known.keys()), inplace=True)

    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[
            df_unknown[unknown] > 0.5,
            known
        ] = 0

    df = df.swaplevel(i=1, j=0, axis=0).sort_index()

    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    y = np.concatenate([
        df.loc[[1], :].values[..., np.newaxis],
        df.loc[[2], :].values[..., np.newaxis],
        df.loc[[3], :].values[..., np.newaxis]
    ], axis=2)

    y_mask = np.concatenate([
        y_mask.loc[[1], :].values[..., np.newaxis],
        y_mask.loc[[2], :].values[..., np.newaxis],
        y_mask.loc[[3], :].values[..., np.newaxis]
    ], axis=2)

    X = np.concatenate([
        np.expand_dims(np.load('../../data/logmelspec/{}.npy'.format(x)).T[:635, :], axis=0)
        for x in df.loc[[1], :].reset_index(1).audio_filename.tolist()])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


random_erasing = RandomErasing()


class AudioDataset(Dataset):

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            sample = (sample - this_min) / (this_max - this_min)

            # randomly cycle the file
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * (this_max - this_min)) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
train_df.loc[(train_df.sum(axis=1) == 37).copy(), :] = 0
valid_df.loc[(valid_df.sum(axis=1) == 37).copy(), :] = 0

In [5]:
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
channel_means = train_X.reshape(-1, 128).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, 128).std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
train_dataset = AudioDataset(torch.Tensor(train_X),
                             torch.Tensor(train_y),
                             torch.Tensor(train_y_mask),
                             albumentations_transform)
valid_dataset = AudioDataset(torch.Tensor(valid_X),
                             torch.Tensor(valid_y),
                             torch.Tensor(valid_y_mask),
                             None)

val_loader = DataLoader(valid_dataset, 64, shuffle=False)
train_loader_1 = DataLoader(train_dataset, 64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, 64, shuffle=True)

In [9]:
# Define the device to be used
cuda = True
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    if hasattr(layer, 'reset_parameters'):
        layer.reset_parameters()

In [11]:
until_x = None

In [12]:
# Parameters
until_x = 4


In [13]:
class Task5Model(nn.Module):

    def __init__(self, num_classes):

        super().__init__()

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))
        
        # Reset until ith layer of mv2
        for i, x in enumerate(self.mv2.features.children()):
            if i <= until_x:
                x.apply(weight_reset)

    def forward(self, x):
        x = self.bw2col(x)
        x = self.mv2.features(x)
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
epochs = 100
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            model = model.train()
            outputs = model(inputs)
            # calculate loss for each set of annotations
            loss_0 = criterion(outputs, labels[:, :, 0]) * masks[:, :, 0]
            loss_1 = criterion(outputs, labels[:, :, 1]) * masks[:, :, 1]
            loss_2 = criterion(outputs, labels[:, :, 2]) * masks[:, :, 2]
            loss = (loss_0.sum() + loss_1.sum() + loss_2.sum()) / masks.sum()
            loss.backward()
            optimizer.step()
            this_epoch_train_loss += loss.detach().cpu().numpy()

    this_epoch_valid_loss = 0
    for inputs, labels, masks in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        optimizer.zero_grad()
        with torch.set_grad_enabled(False):
            model = model.eval()
            outputs = model(inputs)
            loss_0 = criterion(outputs, labels[:, :, 0]) * masks[:, :, 0]
            loss_1 = criterion(outputs, labels[:, :, 1]) * masks[:, :, 1]
            loss_2 = criterion(outputs, labels[:, :, 2]) * masks[:, :, 2]
            loss = (loss_0.sum() + loss_1.sum() + loss_2.sum()) / masks.sum()
            this_epoch_valid_loss += loss.detach().cpu().numpy()

    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= 25:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6274407216020532 0.4804107631955828
Epoch:  1


0.32147796613139074 0.19688884913921356
Epoch:  2


0.18528950899033933 0.165086458836283
Epoch:  3


0.17341609258909482 0.15200695608343398
Epoch:  4


0.16937039107889743 0.15994561995778764
Epoch:  5


0.16582376932775653 0.14824655864919936
Epoch:  6


0.16306322933854284 0.17062412521668843
Epoch:  7


0.1626414862033483 0.19437651123319352
Epoch:  8


0.16195860263463613 0.1448373453957694
Epoch:  9


0.1600782106051574 0.16599186935595103
Epoch:  10


0.15943046800188115 0.13690758390086039
Epoch:  11


0.1601067640491434 0.13843637066228048
Epoch:  12


0.15828799073760574 0.13437777012586594
Epoch:  13


0.15826809728467786 0.1339823454618454
Epoch:  14


0.1574308558090313 0.1389517475451742
Epoch:  15


0.1573858631623758 0.133154459297657
Epoch:  16


0.157177041914012 0.13324858035360063
Epoch:  17


0.156441060272423 0.13486036764723913
Epoch:  18


0.1556719384483389 0.1316371155636651
Epoch:  19


0.1554917503853102 0.12973354863268988
Epoch:  20


0.15497101641990044 0.13178526823009765
Epoch:  21


0.15475562617585464 0.1330449485353061
Epoch:  22


0.15349423442337964 0.12983973643609456
Epoch:  23


0.15273983736295957 0.13061882661921637
Epoch:  24


0.1531238535771499 0.12731516999857767
Epoch:  25


0.15258392570792018 0.13040764204093389
Epoch:  26


0.15345638909855405 0.12869362852403096
Epoch:  27


0.15265341948818517 0.12700297577040537
Epoch:  28


0.1524652311125317 0.13022144032376154
Epoch:  29


0.1516188089106534 0.12886242674929754
Epoch:  30


0.15090154916853518 0.1283709481358528
Epoch:  31


0.15079644282121915 0.12589319795370102
Epoch:  32


0.15088633870756304 0.13062035398823874
Epoch:  33


0.1518692668225314 0.12603530819926942
Epoch:  34


0.15041940478054253 0.12768169811793736
Epoch:  35


0.15066998069350784 0.1265050885932786
Epoch:  36


0.15016155992005323 0.13817941290991648
Epoch:  37


0.15130427520017367 0.126268384712083
Epoch    37: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  38


0.14893763492236267 0.1233669753585543
Epoch:  39


0.14820791136574102 0.1225055062345096
Epoch:  40


0.1462509914024456 0.12242979343448367
Epoch:  41


0.14654752008012822 0.1222945409161704
Epoch:  42


0.14645360853220965 0.12214591354131699
Epoch:  43


0.14607198697489662 0.12230352631637029
Epoch:  44


0.1464892911749917 0.1221253222652844
Epoch:  45


0.14626284064473333 0.12240672005074364
Epoch:  46


0.14591285264169848 0.12235978990793228
Epoch:  47


0.14507297486872286 0.12211191122020994
Epoch:  48


0.14696172403322683 0.12223287671804428
Epoch:  49


0.1466060453975523 0.12230658105441503
Epoch:  50


0.14463145386528325 0.12191636860370636
Epoch:  51


0.14485769014100772 0.12236473283597402
Epoch:  52


0.14523475717853856 0.12242988922766276
Epoch:  53


0.14591443780306224 0.12242335719721657
Epoch:  54


0.14551719378780675 0.12214360811880656
Epoch:  55


0.14529447901893305 0.12231462448835373
Epoch:  56


0.14544195863040718 0.1222747660108975
Epoch    56: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  57


0.14432966185582652 0.1223545681153025
Epoch:  58


0.14416597018370758 0.12206530570983887
Epoch:  59


0.14498136454337351 0.12221860672746386
Epoch:  60


0.14514626320954915 0.12206404336861201
Epoch:  61


0.14562171294882492 0.12208123824426106
Epoch:  62


0.14493906659048958 0.12194396129676274
Epoch    62: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  63


0.14397704762381477 0.12206003495625087
Epoch:  64


0.14458025508635752 0.12206848072154182
Epoch:  65


0.14555618126650113 0.12184906112296241
Epoch:  66


0.1443473378548751 0.12206167727708817
Epoch:  67


0.14558555346888466 0.12198722256081444
Epoch:  68


0.14466130652943174 0.1221042594739369
Epoch:  69


0.14490476049281456 0.12198980471917562
Epoch:  70


0.14566956943756826 0.12192883768251964
Epoch:  71


0.1454055321377677 0.12211208045482635
Epoch    71: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  72


0.14527585256744074 0.12211699783802032
Epoch:  73


0.14483629851727872 0.12201911636761256
Epoch:  74


0.14510233962052577 0.12190574620451246
Epoch:  75


0.1445979846490396 0.12170089142663139
Epoch:  76


0.14585009217262268 0.12202662016664233
Epoch:  77


0.14388886980108312 0.12199872093541282
Epoch:  78


0.14546403047200795 0.12184774024145943
Epoch:  79


0.14493556723401352 0.12194398151976722
Epoch:  80


0.1454905988396825 0.12212028567280088
Epoch:  81


0.14524110708687757 0.1219889691897801
Epoch    81: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  82


0.14545243173032193 0.12184244713612966
Epoch:  83


0.1443453326418593 0.12220218671219689
Epoch:  84


0.1452922527049039 0.12215725013187953
Epoch:  85


0.14465179314484466 0.1220047122665814
Epoch:  86


0.14537435367300705 0.12171992978879384
Epoch:  87


0.14435213562604543 0.12201554647513799
Epoch:  88


0.1454100745755273 0.1219755592090743
Epoch:  89


0.14560517467357018 0.12206019354718071
Epoch:  90


0.14454293935685544 0.12203266897371837
Epoch:  91


0.1448512137741656 0.12220344479594912
Epoch:  92


0.14387961013897047 0.12198946944304875
Epoch:  93


0.14428060240036733 0.12194366646664483
Epoch:  94


0.14449779568491755 0.1220456308552197
Epoch:  95


0.1441286097507219 0.12199923183236804
Epoch:  96


0.14503801553635984 0.12187135113137108
Epoch:  97


0.14445803012396838 0.12188044296843666
Epoch:  98


0.14609013618649663 0.12215987805809293
Epoch:  99


0.14485388792849876 0.12225891862596784
