In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, per-annotator targets and loss masks from a label frame.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. The index must be named ``audio_filename``
        (it is turned into a column by ``reset_index`` and grouped on); the
        columns are label columns, including every key of ``unknown_to_known``.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps an "unknown" label column to the list of known label columns that
        must be masked out whenever that unknown column is > 0.5.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotation rows used per recording (rows 1..n_annotators in
        file order). Every recording needs at least this many rows, otherwise
        stacking the per-annotator slices fails with ``ValueError``.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, rows, cols)`` where ``rows`` is at most
        ``n_frames``; each entry is the transposed, truncated stored array.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the label is hidden by an unknown flag,
        1 elsewhere.
    """
    df = df.reset_index()
    # Number the annotation rows of each recording 1, 2, 3, ... in file order.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns from the known label columns.
    unknown_cols = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start with every label visible, then hide the known labels that an
    # active unknown flag makes ambiguous.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so each annotator slice is contiguous
    # and the recordings inside it are sorted identically for every slice.
    df = df.swaplevel(0, 1).sort_index()
    y_mask = y_mask.swaplevel(0, 1).sort_index()

    annotators = range(1, n_annotators + 1)
    y = np.stack([df.loc[[k], :].values for k in annotators], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in annotators], axis=2)

    # Load the features in the same recording order as the first annotator slice.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = X[:, np.newaxis, :, :]  # add the singleton channel axis

    return X, y, y_mask


# Module-level augmentation object, built with RandomErasing's default
# arguments and called by AudioDataset.__getitem__ on every augmented sample.
# NOTE(review): RandomErasing comes from the local module one directory up
# (see sys.path.append("..")); its defaults are not visible here.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (features, targets, loss weights) with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Features indexed along the first axis. When ``transform`` is set each
        sample ``X[idx]`` must be 3-D (it is sliced and concatenated along
        axis 1 and handed to ``ToPILImage``).
    y, weights : torch.Tensor
        Targets and loss weights, indexed along the first axis like ``X``.
    transform : callable, optional
        Called as ``transform(image=array)`` and expected to return a mapping
        whose ``'image'`` entry is a 2-D tensor. A falsy value disables all
        augmentation and samples are returned untouched.
    erasing : callable, optional
        Final augmentation applied to the transformed tensor. Defaults to the
        module-level ``random_erasing`` object, looked up at call time.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting ``sample`` if enabled."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would fill the
            # sample with NaNs, so fall back to a range of 1 (the sample then
            # scales to all zeros and is restored exactly below).
            value_range = torch.where(value_range > 0,
                                      value_range,
                                      torch.ones_like(value_range))
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # The transform output is 2-D; add a leading axis and swap the two
            # trailing axes. NOTE(review): presumably this undoes an axis move
            # done by the tensor-conversion step of the transform - confirm.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            eraser = self.erasing if self.erasing is not None else random_erasing
            sample = eraser(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# SECURITY NOTE: pickle.load can execute arbitrary code; only open a
# metadata.pkl that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Map each fine label whose fine_id is 'X' to the list of fine labels with any
# other fine_id that share its coarse category. The self-merge on 'coarse'
# produces the suffixed columns 'fine_x' (fine_id == 'X' side) and 'fine_y'.
# NOTE(review): presumably fine_id == 'X' marks the "other/unknown" fine
# class of each coarse category - confirm against the taxonomy.
unknown_to_known = (
    pd.merge(metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']],
             metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']],
             on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda x: list(x)).to_dict())
# Fine labels whose fine_id is not 'X'.
known_labels = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X'].fine.tolist()

# Put the coarse and fine label frames side by side (column-wise concat).
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# Manual correction: blank out (set every column to 0) each annotation row
# whose columns sum to exactly 37.
# NOTE(review): presumably this targets a single bad annotation that has every
# label switched on - confirm that 37 is the total number of label columns.
train_df.loc[train_df.sum(axis=1).eq(37), :] = 0
valid_df.loc[valid_df.sum(axis=1).eq(37), :] = 0

In [5]:
# Turn both label frames into (features, targets, loss masks) arrays.
train_X, train_y, train_y_mask = prepare_data(df=train_df, unknown_to_known=unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(df=valid_df, unknown_to_known=unknown_to_known)

In [6]:
# Channel wise normalization: statistics are taken per entry of the last axis
# (size 128) over the training set only, then applied to both splits.
# NOTE: re-running this cell normalises the already-normalised arrays again.
channel_means = np.mean(train_X.reshape(-1, 128), axis=0)[np.newaxis, np.newaxis, np.newaxis, :]
channel_stds = np.std(train_X.reshape(-1, 128), axis=0)[np.newaxis, np.newaxis, np.newaxis, :]
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# The pipeline is handed to the training AudioDataset, which calls it as
# transform(image=array) and reads the 'image' entry of the result.
# NOTE(review): the limits below use albumentations' units (presumably
# fractions of the image size for shift/scale and degrees for rotation) -
# confirm against the installed albumentations version.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pair of batches that mixup blends together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first CUDA device when one is available and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it offers ``reset_parameters``.

    Written to be passed to ``Module.apply``: objects without a
    ``reset_parameters`` attribute are left untouched. Returns ``None``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Index of the last block of the pretrained feature extractor that Task5Model
# re-initialises. This is only the placeholder default; the "Parameters" cell
# that follows assigns the value actually used in this run.
until_x = None

In [12]:
# Parameters
# NOTE(review): this looks like a cell injected by a notebook
# parameterisation tool (e.g. papermill) - confirm before editing by hand.
# until_x is read by Task5Model and is part of the saved checkpoint's file name.
until_x = 7


In [13]:
class Task5Model(nn.Module):
    """Single-channel classifier built on torchvision's pretrained MobileNetV2.

    Pipeline: a small 1x1-convolution stem maps the single input channel to
    the 3 channels fed to ``mv2.features``; the resulting feature map is
    max-pooled over its two trailing axes and passed to a two-layer head that
    outputs ``num_classes`` logits.

    Parameters
    ----------
    num_classes : int
        Size of the output layer.
    reset_until : int, optional
        Index of the last child of ``mv2.features`` whose parameters are
        re-initialised (children ``0..reset_until`` lose their pretrained
        weights). When omitted, the notebook-level ``until_x`` is used; if
        that is ``None`` as well, no block is reset.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # 1 -> 10 -> 3 channels with 1x1 convolutions.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        if reset_until is None:
            # Backward compatible default: fall back to the notebook parameter.
            reset_until = until_x
        self._reset_leading_blocks(reset_until)

    def _reset_leading_blocks(self, last_index):
        """Re-initialise children ``0..last_index`` of ``mv2.features``.

        ``None`` means "keep every pretrained block"; without this guard the
        comparison ``i <= None`` would raise ``TypeError``.
        """
        if last_index is None:
            return
        for i, block in enumerate(self.mv2.features.children()):
            if i <= last_index:
                block.apply(weight_reset)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for input ``x``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last axis, then the new last axis.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model with 31 output logits and move it to the chosen device
model = Task5Model(num_classes=31)
model = model.to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# The learning rate is reduced once the monitored value (the validation loss
# passed to scheduler.step in the training loop) stops improving.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=5,
    verbose=True)
# reduction='none' keeps one loss value per element so that the training loop
# can multiply by the label mask before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def masked_bce_loss(outputs, labels, masks):
    """Masked BCE-with-logits loss averaged over the unmasked entries.

    ``outputs`` holds one logit per (sample, label); ``labels`` and ``masks``
    carry an extra trailing axis with one slice per set of annotations. The
    element-wise loss of every slice is multiplied by its mask, summed, and
    the grand total is divided by the sum of all mask values.
    """
    total = 0
    # calculate loss for each set of annotations
    for k in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
alpha = 1  # mixup coefficients are drawn from Beta(alpha, alpha)
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One coefficient per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping.
    if epochs_without_new_lowest >= early_stopping_patience:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6324158149796564 0.5159895079476493
Epoch:  1


0.3279704717365471 0.23263622181756155
Epoch:  2


0.19091470297929403 0.21951409110001155
Epoch:  3


0.17755131302653132 0.1682915346963065
Epoch:  4


0.17523283007982615 0.16291944895471847
Epoch:  5


0.17086904516091217 0.16568472342831747
Epoch:  6


0.1701830645670762 0.14804136327334813
Epoch:  7


0.1684192157274968 0.1532456193651472
Epoch:  8


0.16605765594018473 0.142414713544505
Epoch:  9


0.16485218099645665 0.14281580384288514
Epoch:  10


0.16418715787900462 0.14000738944326127
Epoch:  11


0.16247809335992142 0.14130262498344695
Epoch:  12


0.16197772042171374 0.13940586681876863
Epoch:  13


0.16202825187025843 0.13621095142194203
Epoch:  14


0.1611532613232329 0.13667358777352742
Epoch:  15


0.16028246041890737 0.13766642872776305
Epoch:  16


0.15892779424383835 0.13444241881370544
Epoch:  17


0.1591749654428379 0.1354482610310827
Epoch:  18


0.1584233363737931 0.1393434309533664
Epoch:  19


0.15943028354966962 0.13554776459932327
Epoch:  20


0.15824943861445864 0.1334932478410857
Epoch:  21


0.15959452294014595 0.1353220875774111
Epoch:  22


0.1578488845277477 0.13343626580068044
Epoch:  23


0.15782146759935328 0.13252865097352437
Epoch:  24


0.15630683504246376 0.13464297779968806
Epoch:  25


0.1560844992463653 0.13158883154392242
Epoch:  26


0.15713263524545207 0.13372262035097396
Epoch:  27


0.1550277423214268 0.12968534976243973
Epoch:  28


0.15666305978555936 0.1373737965311323
Epoch:  29


0.15535393921104637 0.131537387413638
Epoch:  30


0.15471595364647941 0.13051399375711167
Epoch:  31


0.15490025684640213 0.12940976449421474
Epoch:  32


0.1541106632432422 0.1312112637928554
Epoch:  33


0.1537292498994518 0.13147488555737905
Epoch:  34


0.15460540555618904 0.13050653572593415
Epoch:  35


0.15353033671507965 0.12905682516949518
Epoch:  36


0.15254990595418053 0.1280233668429511
Epoch:  37


0.1538156323336266 0.12738584727048874
Epoch:  38


0.15229557212945577 0.12841867016894476
Epoch:  39


0.15301698446273804 0.12978848282779967
Epoch:  40


0.1532050044955434 0.3818930398140635
Epoch:  41


0.15507368139318517 0.13409744096653803
Epoch:  42


0.15294725105569168 0.12979226452963694
Epoch:  43


0.15250662210825328 0.12935681641101837
Epoch    43: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  44


0.15072440859433767 0.12648478043930872
Epoch:  45


0.1504217426519136 0.12576083945376532
Epoch:  46


0.14965117058238467 0.12594625034502574
Epoch:  47


0.1500706322289802 0.1253927211676325
Epoch:  48


0.1489728089119937 0.12491057600293841
Epoch:  49


0.15029972993038795 0.1252322005374091
Epoch:  50


0.14986065271738414 0.1254620307258197
Epoch:  51


0.14877224129599495 0.12526382612330572
Epoch:  52


0.14858788090783195 0.12517789219106948
Epoch:  53


0.14763368223164533 0.12522258928843907
Epoch:  54


0.1482137820205173 0.12550249802214758
Epoch    54: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  55


0.14874690771102905 0.12537423414843424
Epoch:  56


0.1484888232237584 0.12584026477166585
Epoch:  57


0.14805892834792267 0.12498573958873749
Epoch:  58


0.1478037463652121 0.12504551985434123
Epoch:  59


0.14725059270858765 0.12473090312310628
Epoch:  60


0.14831545989255648 0.12526013808591024
Epoch:  61


0.1473823942848154 0.12466515600681305
Epoch:  62


0.14780465211417224 0.12462716762508665
Epoch:  63


0.1483323046484509 0.1249482131430081
Epoch:  64


0.1485001565636815 0.12475549961839404
Epoch:  65


0.14638332056032644 0.1247135294335229
Epoch:  66


0.14915484069166957 0.12566579771893366
Epoch:  67


0.14801034089681264 0.12472903196300779
Epoch:  68


0.14862739032990224 0.12472367818866457
Epoch    68: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  69


0.14742244699516813 0.12480227436338152
Epoch:  70


0.14866968868552027 0.12535103197608674
Epoch:  71


0.1485376043899639 0.12511240158762252
Epoch:  72


0.14818279002163862 0.12497233599424362
Epoch:  73


0.146915465593338 0.12465627597911018
Epoch:  74


0.14844973707521283 0.12455657443829946
Epoch:  75


0.14895117766148336 0.12477904771055494
Epoch:  76


0.1481829752793183 0.12499208535466876
Epoch:  77


0.1490248908867707 0.12461878465754646
Epoch:  78


0.14722333083281647 0.1248642270054136
Epoch:  79


0.14868569333811063 0.12492668841566358
Epoch:  80


0.14836277067661285 0.12499669513532094
Epoch    80: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  81


0.14788238825024785 0.12475319419588361
Epoch:  82


0.14835404987270767 0.12485270947217941
Epoch:  83


0.14926078754502373 0.1251357729945864
Epoch:  84


0.1471259795330666 0.12455861696175166
Epoch:  85


0.14741718406612808 0.12459272678409304
Epoch:  86


0.14864061772823334 0.12480678196464266
Epoch    86: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  87


0.14803156337222537 0.12469476993594851
Epoch:  88


0.14855243547542676 0.12468676481928144
Epoch:  89


0.14765866664615837 0.12471330059426171
Epoch:  90


0.14868823056285446 0.12488202644245965
Epoch:  91


0.1486681599069286 0.1248341394322259
Epoch:  92


0.14809586872925629 0.12475781249148506
Epoch:  93


0.14789279167716568 0.12434653724942889
Epoch:  94


0.148140918161418 0.12513171562126704
Epoch:  95


0.14983275975729968 0.1247974272285189
Epoch:  96


0.14929051294520096 0.12479495044265475
Epoch:  97


0.14873107824776624 0.12477561299289976
Epoch:  98


0.14790375852907026 0.12475085258483887
Epoch:  99


0.14689822978264577 0.12449246006352561
