In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (recording, annotation). After ``reset_index`` the frame
        must expose an ``audio_filename`` column, i.e. the index is expected
        to be named ``audio_filename``. All columns are label scores.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps an "unknown" label column to the list of known label columns
        that must be ignored whenever that unknown label is set (> 0.5).
        The unknown columns themselves are removed from the targets.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotation rows used per recording (the first
        ``n_annotators`` rows of every recording, in their original order).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_bins)`` where ``n_bins`` is the
        second axis of each transposed array.
    y : numpy.ndarray
        Shape ``(n_files, n_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target counts towards the loss and
        0 where it is hidden by an "unknown" annotation.

    Raises
    ------
    ValueError
        If a recording has fewer than ``n_annotators`` annotation rows.
    """
    import os

    unknown_cols = list(unknown_to_known.keys())

    # Number the annotations of every recording 1, 2, 3, ... in row order.
    df = df.reset_index()
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    rows_per_file = df.groupby(level='audio_filename').size()
    if (rows_per_file < n_annotators).any():
        raise ValueError(
            'every recording needs at least {} annotation rows; too few for: {}'
            .format(n_annotators,
                    rows_per_file[rows_per_file < n_annotators].index.tolist()))

    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and blank out the known labels that an
    # "unknown" annotation makes unreliable.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[
            df_unknown[unknown] > 0.5,
            known
        ] = 0

    # Put the annotation number first so that one annotator's rows can be
    # selected as a block, sorted by file name within the block.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotators = range(1, n_annotators + 1)
    y = np.stack([df.loc[[a], :].values for a in annotators], axis=2)
    y_mask = np.stack([y_mask.loc[[a], :].values for a in annotators], axis=2)

    # Load the arrays in the same file order as the rows of y / y_mask.
    filenames = (df.loc[[1], :]
                 .index.get_level_values('audio_filename').tolist())
    X = np.stack([
        np.load(os.path.join(spec_dir, '{}.npy'.format(name))).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared notebook-level RandomErasing instance, built with the helper's
# default arguments; AudioDataset uses it as its random-erasing augmentation.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (sample, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X : tensor
        Samples stacked along the first axis. When ``transform`` is set,
        each ``X[idx]`` must be a 3-D tensor; it is cycled along axis 1.
    y, weights : tensor
        Targets and per-target weights, indexed along the first axis like X.
    transform : callable, optional
        Called as ``transform(image=array)`` and expected to return a dict
        whose ``'image'`` entry is a 2-D tensor. When falsy, samples are
        returned unchanged and no augmentation at all is applied.
    erasing : callable, optional
        Random-erasing op applied after ``transform``. Defaults to the
        notebook-level ``random_erasing`` instance.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()
        self.erasing = random_erasing if erasing is None else erasing

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would otherwise be divided by zero and
                # turn into NaNs; with a unit range it scales to all zeros
                # and is restored to its original value below.
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on an image array)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore a leading singleton axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = self.erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: unpickling can execute arbitrary code; only open metadata files that
# come from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']

# Split the taxonomy into the "unknown" fine labels (fine_id == 'X') and the
# known ones, keeping only the columns needed for the join.
unknown_fine = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# Map every unknown fine label to the list of known fine labels that share
# its coarse category.
unknown_to_known = (
    unknown_fine
    .merge(known_fine, on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_fine.fine.tolist()

# Put the coarse and fine annotation columns side by side for each split.
train_parts = [metadata['coarse_train'], metadata['fine_train']]
valid_parts = [metadata['coarse_test'], metadata['fine_test']]
train_df = pd.concat(train_parts, axis=1, sort=True)
valid_df = pd.concat(valid_parts, axis=1, sort=True)

In [4]:
# manual correction for one data point
# Any row whose label columns add up to exactly 37 is zeroed out
# (presumably a row with every label set -- confirm the meaning of 37).
for annotations in (train_df, valid_df):
    bad_rows = annotations.sum(axis=1) == 37
    annotations.loc[bad_rows, :] = 0

In [5]:
# Turn both annotation tables into arrays: inputs (X), targets (y) and the
# loss masks (y_mask) that hide targets covered by an "unknown" annotation.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics are computed per entry of the last axis, on the training set
# only, and reused for the validation set.
# NOTE: this cell overwrites train_X / valid_X, so running it twice
# normalizes twice; re-run the prepare_data cell first.
n_channels = train_X.shape[-1]  # was hard-coded to 128
flat_train = train_X.reshape(-1, n_channels)
channel_means = flat_train.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = flat_train.std(axis=0).reshape(1, 1, 1, -1)
# A channel with zero variance would otherwise divide by zero (inf / NaN).
channel_stds = np.where(channel_stds == 0, 1.0, channel_stds)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# The steps run in list order; the final ToTensor step hands a tensor back
# to AudioDataset.
augmentation_steps = [
    # limits are passed straight to albumentations; see its documentation
    # for the exact units of each limit
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
BATCH_SIZE = 64

# Only the training set is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Two independently shuffled views of the same training set: the training
# loop zips them to obtain the pairs of batches that mixup blends together.
train_loader_1 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when one is visible and fall back to the CPU otherwise,
# instead of unconditionally requesting 'cuda:0' (which makes every later
# `.to(device)` call fail on a machine without CUDA).
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it knows how to reset itself.

    Objects without a ``reset_parameters`` attribute are left untouched, so
    the function can be handed to ``Module.apply`` and run over a whole
    module tree. Always returns ``None``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default value of the `until_x` notebook parameter: Task5Model compares it
# with the index of each MobileNetV2 feature block to decide which blocks to
# re-initialise. It is overridden by the "# Parameters" cell that follows
# (presumably injected by a parameterised-run tool -- confirm).
until_x = None

In [12]:
# Parameters
# Value of `until_x` used for this run: Task5Model re-initialises the
# MobileNetV2 feature blocks with index 0..15 (inclusive). The value is also
# embedded in the checkpoint file name written by the training loop.
until_x = 15


In [13]:
class Task5Model(nn.Module):
    """Multi-label classifier built on the MobileNetV2 feature extractor.

    A small 1x1-convolution stem turns the single-channel input into three
    channels, the MobileNetV2 ``features`` stack processes it, the result is
    max-pooled over both spatial axes and a two-layer head produces one logit
    per class. Only ``mv2.features`` is used in ``forward``; the MobileNetV2
    classifier head stays attached to the module but is never called.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_until : int, optional
        Feature blocks of MobileNetV2 with index ``0..reset_until``
        (inclusive) are re-initialised, discarding their pretrained weights.
        When omitted, the notebook-level ``until_x`` parameter is used; if
        that is ``None`` as well, every pretrained block is kept (the
        comparison ``i <= None`` used to raise ``TypeError``).
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # Learnable mapping from 1 input channel to the 3 channels that the
        # pretrained network expects.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2
        if reset_until is None:
            reset_until = until_x
        if reset_until is not None:
            for idx, block in enumerate(self.mv2.features.children()):
                if idx > reset_until:
                    break
                block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch of single-channel inputs.

        ``x`` must have one channel on axis 1 (required by the stem); the
        output has shape ``(batch, num_classes)``.
        """
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # global max pooling: reduce the last two (spatial) axes
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits; this has to match the number of label columns left in
# the targets built by prepare_data (TODO confirm against train_y.shape[1]).
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# All model parameters are optimised with AMSGrad-Adam.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch and
# lowers the learning rate after `patience` epochs without improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so that the training loop
# can multiply them by the masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
mixup_alpha = 1               # Beta(alpha, alpha) parameter of the mixup weights
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_bce_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over all annotation sets.

    ``outputs`` is ``(batch, classes)``; ``labels`` and ``masks`` are
    ``(batch, classes, annotators)``. The element-wise ``criterion`` is
    evaluated against each annotator's labels, weighted by the matching
    mask, and the grand total is divided by the sum of the masks.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0.0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # one blending weight per sample, shared by inputs, labels and masks
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(criterion, outputs, labels, masks)
        loss.backward()
        optimizer.step()
        # .item() yields a plain Python float, so the running sum does not
        # depend on NumPy's scalar promotion rules.
        this_epoch_train_loss += loss.item()

    # ---- validation pass ----
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Report the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Keep only the checkpoint with the best validation loss so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6311801720309902 0.4135802047593253
Epoch:  1


0.32090152961176793 0.20138237306049891
Epoch:  2


0.19215944892651327 0.17752799178872788
Epoch:  3


0.18402060020614314 0.18340921827725
Epoch:  4


0.18090508354676738 0.1759882697037288
Epoch:  5


0.17999564191779574 0.18001954470361983
Epoch:  6


0.17832640336977468 0.16251521025385177
Epoch:  7


0.17719392478466034 0.16514539292880467
Epoch:  8


0.17683431465883512 0.1632089763879776
Epoch:  9


0.17645234997208054 0.16018783833299363
Epoch:  10


0.1754934320578704 0.1603434830904007
Epoch:  11


0.17414697239527832 0.16100412820066726
Epoch:  12


0.17374613277010015 0.15945901615279062
Epoch:  13


0.17381354763701157 0.1623950196163995
Epoch:  14


0.1745188566478523 0.1597930874143328
Epoch:  15


0.17317967922300906 0.1600602035011564
Epoch:  16


0.1718515097289472 0.16621635002749308
Epoch:  17


0.1710455904941301 0.16218975824969156
Epoch:  18


0.16895602522669612 0.15713575269494737
Epoch:  19


0.16992040139597817 0.15759530876364028
Epoch:  20


0.16815414380382848 0.17601698424134934
Epoch:  21


0.16761013623830434 0.1607295274734497
Epoch:  22


0.16791184085446434 0.1485932171344757
Epoch:  23


0.16743791425550306 0.14865519106388092
Epoch:  24


0.1657369394560118 0.14293557831219264
Epoch:  25


0.16597147447031899 0.1471481238092695
Epoch:  26


0.16531439649092183 0.14745743146964482
Epoch:  27


0.16371920020193667 0.1447945237159729
Epoch:  28


0.1647500198435139 0.14840373822620936
Epoch:  29


0.16239888160615354 0.14014648539679392
Epoch:  30


0.16265634588293126 0.14402242856366293
Epoch:  31


0.16399597598088755 0.14427024445363454
Epoch:  32


0.16165097782740723 0.14203417833362306
Epoch:  33


0.16239734195374153 0.14720432673181807
Epoch:  34


0.16193252399161057 0.14584309181996755
Epoch:  35


0.1618985219581707 0.1477523616382054
Epoch    35: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  36


0.1592306478603466 0.1342398918100766
Epoch:  37


0.1593229432363768 0.13484977505036763
Epoch:  38


0.15993998703118917 0.1332962459751538
Epoch:  39


0.1574247447220055 0.13370110520294734
Epoch:  40


0.15848609764833707 0.1337037490946906
Epoch:  41


0.15929080666722478 0.13377671369484492
Epoch:  42


0.1584133594422727 0.1336665696331433
Epoch:  43


0.15755515122735822 0.13328328622238977
Epoch:  44


0.15790177519257004 0.13290213899953024
Epoch:  45


0.15817535608201413 0.13276143691369466
Epoch:  46


0.1572032840670766 0.13241936159985407
Epoch:  47


0.1566014491223 0.13235081093651907
Epoch:  48


0.158396345538062 0.13275599905422755
Epoch:  49


0.15845249512711088 0.13309850543737411
Epoch:  50


0.15651734938492645 0.1328920147248677
Epoch:  51


0.157114168679392 0.132527870791299
Epoch:  52


0.1578594804615588 0.13264907683644975
Epoch:  53


0.15716747416032328 0.13314239893640792
Epoch    53: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  54


0.1579988933092839 0.13310707147632325
Epoch:  55


0.15785767742105433 0.13263966462441854
Epoch:  56


0.15727910318890134 0.1325274035334587
Epoch:  57


0.15674120066939173 0.1327592773096902
Epoch:  58


0.15671106850778735 0.13243968039751053
Epoch:  59


0.1574661606872404 0.13251182117632457
Epoch    59: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  60


0.15686204546206706 0.13238848639386042
Epoch:  61


0.15711643526682983 0.1322575339249202
Epoch:  62


0.15666582858240283 0.13261617720127106
Epoch:  63


0.1578426787982116 0.13237735629081726
Epoch:  64


0.15642808740203445 0.13211675201143538
Epoch:  65


0.1565670048868334 0.1322202788931983
Epoch:  66


0.15702712415037928 0.13250886968203954
Epoch:  67


0.15723882857206706 0.13248168677091599
Epoch:  68


0.1563278158774247 0.13244730340582983
Epoch:  69


0.1561019078299806 0.13238049192087992
Epoch:  70


0.1580278595557084 0.13200351915189198
Epoch:  71


0.15710537618881948 0.13223116844892502
Epoch:  72


0.15651379284021016 0.13254861320768083
Epoch:  73


0.1565565011791281 0.13188825121947698
Epoch:  74


0.15770798518850998 0.13227699909891402
Epoch:  75


0.1558560978721928 0.13219664352280752
Epoch:  76


0.15766066634977186 0.1324720403977803
Epoch:  77


0.15662711174101443 0.13252664783171245
Epoch:  78


0.15623260953941862 0.13222092070749827
Epoch:  79


0.15759188379790331 0.13227762281894684
Epoch    79: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  80


0.15702215481448817 0.13230169351611817
Epoch:  81


0.1564918679965509 0.13218361352171218
Epoch:  82


0.15772378001664136 0.13182847840445383
Epoch:  83


0.15693759273838354 0.13205042162111827
Epoch:  84


0.15785103271136414 0.13202013288225448
Epoch:  85


0.15670164211376295 0.13230246411902563
Epoch:  86


0.15677309962543282 0.1320180967450142
Epoch:  87


0.15771547926438823 0.1325560478227479
Epoch:  88


0.1558876130226496 0.1323231660893985
Epoch    88: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  89


0.1569522808532457 0.13226588602576936
Epoch:  90


0.15692302262460864 0.1323132546884673
Epoch:  91


0.15690438610476418 0.13260406681469508
Epoch:  92


0.1565233803278691 0.1319419699055808
Epoch:  93


0.15708030236733928 0.13181978038379125
Epoch:  94


0.15695712171696327 0.13220609830958502
Epoch:  95


0.15735931734781008 0.1318295385156359
Epoch:  96


0.1574922969212403 0.13202833277838572
Epoch:  97


0.1571486882261328 0.13277130254677363
Epoch:  98


0.15654453111661448 0.1321384459733963
Epoch:  99


0.15752948216489843 0.13214053639343806
