In [1]:
# Parameters
# `until_x` is an index into `mv2.features`: Task5Model re-initialises every
# child block whose position is greater than this value and leaves the earlier
# ones untouched. The value is also embedded in the checkpoint file name
# written by the training loop.
until_x = 0


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, n_annotations=3, n_frames=635,
                 spec_path_template='../../data/logmelspec/{}.npy'):
    """Turn an annotation table into model inputs, targets and loss masks.

    Rows of ``df`` that share an ``audio_filename`` are numbered 1, 2, ... in
    their order of appearance; row ``k`` of a file becomes annotation slot
    ``k``. Only the first ``n_annotations`` slots are used, and every file
    must provide all of them (otherwise stacking the slots fails).

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table that exposes ``audio_filename`` after
        ``reset_index()``. The caller's frame is not modified.
    unknown_to_known : dict
        Maps the name of an "unknown" column to the list of known columns it
        stands in for. The unknown columns are removed from the targets;
        wherever an unknown column is > 0.5, the mask of its known columns is
        set to 0 for that annotation row.
    n_annotations : int, optional
        Number of annotation slots stacked along the last axis (default 3).
    n_frames : int, optional
        Number of leading rows kept from each transposed spectrogram
        (default 635). All files must yield the same number of rows.
    spec_path_template : str, optional
        ``str.format`` template that receives the audio file name and returns
        the path of its ``.npy`` spectrogram.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_features)``.
    y : numpy.ndarray
        Shape ``(n_files, n_label_columns, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label contributes to the loss,
        0 where it is masked out.
    """
    # Number the annotation rows of each file: 1, 2, 3, ...
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns away from the real targets.
    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from an all-ones mask and switch off the known labels that an
    # active "unknown" flag makes unreliable.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the slot number first so one slot can be selected across all files;
    # sorting keeps the file order identical in every slot.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.stack([df.loc[[slot], :].values for slot in slots], axis=2)
    mask = np.stack([y_mask.loc[[slot], :].values for slot in slots], axis=2)

    # Load the spectrograms in the same file order as the targets.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack(
        [np.load(spec_path_template.format(name)).T[:n_frames, :]
         for name in filenames],
        axis=0)
    X = np.expand_dims(X, axis=1)

    return X, y, mask


# Shared RandomErasing instance, built with its default arguments (the class
# comes from the RandomErasing module one directory up).
# AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (sample, target, weight) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Samples, indexed along the first dimension.
    y : torch.Tensor
        Targets, indexed along the first dimension.
    weights : torch.Tensor
        Per-target weights / masks, indexed along the first dimension.
    transform : callable, optional
        Albumentations-style pipeline called as ``transform(image=...)`` and
        returning a dict with an ``'image'`` entry. When falsy, samples are
        returned untouched.

    When a transform is set, ``__getitem__`` also relies on the notebook-level
    ``random_erasing`` callable.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converts a tensor to a PIL image so the albumentations pipeline can
        # be fed a plain numpy image.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first dimension of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample if a transform is set."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation to [0, 1]; remembered so it can be undone
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            if value_range == 0:
                # A constant sample would give 0/0 = NaN; with a unit range it
                # maps to all zeros and is restored to `this_min` below.
                value_range = value_range + 1
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate dimension 1 so that a random
            # position becomes the new start (content wraps around)
            shift = int(np.random.randint(sample.shape[1]))
            sample = torch.roll(sample, shifts=-shift, dims=1)

            # apply albumentations transforms on a numpy image
            # NOTE(review): the PIL round-trip presumably quantises the values
            # to 8 bits - confirm this loss of precision is intended.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # Re-add the leading axis and swap the last two axes
            # (presumably restoring the original (1, rows, cols) layout after
            # the pipeline's ToTensor step - TODO confirm).
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing on a detached copy
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code stored in the file; only use
# this with a trusted, locally produced metadata.pkl.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# Split the taxonomy into "unknown" fine labels (fine_id == 'X') and known ones.
taxonomy_df = metadata['taxonomy_df']
taxonomy_columns = ['fine', 'coarse']
unknown_fine = taxonomy_df.loc[taxonomy_df.fine_id == 'X', taxonomy_columns]
known_fine = taxonomy_df.loc[taxonomy_df.fine_id != 'X', taxonomy_columns]

# Pair every unknown fine label with the known fine labels that share its
# coarse category, then collect those known labels into one list per unknown.
fine_pairs = (
    pd.merge(unknown_fine, known_fine, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = fine_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_fine.fine.tolist()

# Coarse and fine annotations side by side, one frame per split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose values sum to exactly 37 is zeroed out, in both splits
# (presumably a row with every label flagged at once - TODO confirm).
for annotations_df in (train_df, valid_df):
    rows_to_clear = annotations_df.sum(axis=1) == 37
    annotations_df.loc[rows_to_clear, :] = 0

In [6]:
# Build the spectrogram inputs (X), the per-annotation targets (y) and the
# matching loss masks (y_mask) for the training and validation splits.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Mean and standard deviation are computed per position of the last axis
# (128 entries) from the training split only, then applied to both splits.
train_X_flat = train_X.reshape(-1, 128)
channel_means = train_X_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X_flat.std(axis=0).reshape(1, 1, 1, -1)


def _standardize(X):
    """Subtract the training means and divide by the training stds."""
    return (X - channel_means) / channel_stds


train_X = _standardize(train_X)
valid_X = _standardize(valid_X)

In [8]:
# Define the data augmentation transformations
# This pipeline is handed to the training AudioDataset, which calls it as
# `transform(image=...)` on each sample.
albumentations_transform = Compose([
    # random shift / scale / rotation within the limits given here
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library's default settings
    GridDistortion(),
    # final step: hand the augmented image back as a torch tensor
    ToTensor()
])

In [9]:
# Create the datasets and the dataloaders
batch_size = 64

# Only the training split is augmented; validation samples are served as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to draw the two batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook still runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it offers ``reset_parameters()``.

    Objects without that method are left untouched, so this can be passed to
    ``Module.apply`` to reset every resettable submodule.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor with a single-channel stem and a new head.

    Pipeline of ``forward``: a small 1x1-convolution stem maps the
    single-channel input to 3 channels, ``mv2.features`` extracts feature
    maps, a global max over the last two dimensions pools them to one vector
    per sample, and a two-layer head produces ``num_classes`` logits.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_after : int, optional
        Children of ``mv2.features`` with an index greater than this value
        are re-initialised with ``weight_reset``; earlier ones keep their
        pretrained weights. Defaults to the notebook-level ``until_x``.
    """

    def __init__(self, num_classes, reset_after=None):

        super().__init__()

        if reset_after is None:
            # Backward-compatible default: the notebook parameter.
            reset_after = until_x

        # 1 input channel -> 3 channels expected by the pretrained network.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        # Only `mv2.features` is used in forward(); the network's own
        # classifier stays attached but is never called.
        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset after ith layer of mv2: every child with index > reset_after
        # (and all of its submodules) is re-initialised.
        for index, block in enumerate(self.mv2.features.children()):
            if index > reset_after:
                block.apply(weight_reset)

    def forward(self, x):
        """Return the ``num_classes`` logits for a batch of single-channel inputs."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: maximum over the last dimension, then over the
        # new last dimension, leaving (batch, channels).
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [13]:
# Instantiate the model
# 31 is passed as `num_classes`, i.e. the model emits 31 logits per sample;
# the model is then moved to the selected device.
model = Task5Model(31).to(device)

Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/checkpoints/mobilenet_v2-b0353104.pth


  0%|                                                                                                                                                                       | 0/14212972 [00:00<?, ?it/s]

  0%|▍                                                                                                                                                      | 40960/14212972 [00:00<00:48, 292288.48it/s]

  2%|██▌                                                                                                                                                   | 245760/14212972 [00:00<00:36, 383272.63it/s]

  7%|███████████▏                                                                                                                                         | 1064960/14212972 [00:00<00:24, 531779.63it/s]

 28%|██████████████████████████████████████████▎                                                                                                          | 4030464/14212972 [00:00<00:13, 753887.05it/s]

 58%|█████████████████████████████████████████████████████████████████████████████████████▉                                                              | 8249344/14212972 [00:00<00:05, 1068790.80it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14212972/14212972 [00:00<00:00, 1515163.12it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14212972/14212972 [00:00<00:00, 19284951.16it/s]




In [14]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad variant over every model parameter.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Lowers the learning rate when the value passed to scheduler.step() stops
# improving (patience of 5 epochs); the training loop feeds it the epoch's
# validation loss.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# weight them with the annotation masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss before stopping
mixup_alpha = 1               # both parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked loss averaged over all unmasked entries of all annotation slots.

    `labels` and `masks` hold one slice per annotation along their last axis.
    Each slice is scored against the same `outputs` with the notebook-level
    `criterion` (element-wise losses), multiplied by its mask, and summed; the
    grand total is divided by the sum of all mask values.
    """
    total = 0
    for slot in range(labels.shape[2]):
        total = total + (criterion(outputs, labels[:, :, slot]) * masks[:, :, slot]).sum()
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ------------------------------------------------
    model.train()
    this_epoch_train_loss = 0
    for (x_a, y_a, m_a), (x_b, y_b, m_b) in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches with one random
        # coefficient per sample; inputs, labels and masks share it
        lam = torch.Tensor(np.random.beta(mixup_alpha, mixup_alpha, x_a.shape[0]))
        lam_inputs = lam.view(-1, 1, 1, 1)
        lam_targets = lam.view(-1, 1, 1)

        inputs = (lam_inputs * x_a) + ((1 - lam_inputs) * x_b)
        labels = (lam_targets * y_a) + ((1 - lam_targets) * y_b)
        masks = (lam_targets * m_a) + ((1 - lam_targets) * m_b)

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no mixup, no gradients) -----------------------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint on every new best validation loss and track the dry spell.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Log before the early-stopping check so the final epoch is reported too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6079884372852944 0.5254155397415161
Epoch:  1


0.28287836707927083 0.20528893598488399
Epoch:  2


0.1882942870662019 0.17806308822972433
Epoch:  3


0.18225603449989008 0.17770727830273764
Epoch:  4


0.18053065401476784 0.17297359236649104
Epoch:  5


0.17855952921751383 0.17288664196218764
Epoch:  6


0.17857483472373034 0.17247642363820756
Epoch:  7


0.17671392133107056 0.17062753226075852
Epoch:  8


0.17690540970982732 0.177106112241745
Epoch:  9


0.17332135664450155 0.18337697216442653
Epoch:  10


0.17283222522284533 0.1717682523386819
Epoch:  11


0.17172629285503077 0.16675599345139094
Epoch:  12


0.17107647174113505 0.1824549904891423
Epoch:  13


0.16918683374250257 0.1667068600654602
Epoch:  14


0.1685956756810884 0.18049088971955435
Epoch:  15


0.168377515834731 0.1540809209857668
Epoch:  16


0.16690048011573586 0.15027654596737453
Epoch:  17


0.1661842368744515 0.1732359060219356
Epoch:  18


0.1651581030439686 0.16109696243490493
Epoch:  19


0.16574172071508458 0.15198010951280594
Epoch:  20


0.16413312022750443 0.16587895367826735
Epoch:  21


0.16338634168779528 0.1544477151972907
Epoch:  22


0.16270143438029933 0.14849679384912765
Epoch:  23


0.16330147554745544 0.14307532353060587
Epoch:  24


0.16240622143487674 0.14454696008137294
Epoch:  25


0.16327872469618515 0.14733056936945235
Epoch:  26


0.16159116456637512 0.15798955517155783
Epoch:  27


0.1591133060487541 0.1432920502764838
Epoch:  28


0.16213949950965675 0.15773677613054002
Epoch:  29


0.1609535044109499 0.14364047135625566
Epoch    29: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  30


0.1585413853864412 0.13614117354154587
Epoch:  31


0.15777184431617325 0.13508595951965877
Epoch:  32


0.15762696999150352 0.1367493537919862
Epoch:  33


0.15698259585612528 0.13515682305608476
Epoch:  34


0.15761811024433858 0.13451408807720458
Epoch:  35


0.1583295553117185 0.1369012455855097
Epoch:  36


0.15628946632952304 0.13438929617404938
Epoch:  37


0.15667545553800222 0.1339259541460446
Epoch:  38


0.1576070020327697 0.1353674094591822
Epoch:  39


0.15526790997466525 0.1388686395117215
Epoch:  40


0.15552300818868586 0.13383858970233373
Epoch:  41


0.15531146727703712 0.13365827181509563
Epoch:  42


0.15638899440700943 0.13386981189250946
Epoch:  43


0.15567280754849716 0.1335371732711792
Epoch:  44


0.15546908652460253 0.1332972709621702
Epoch:  45


0.15649780510245143 0.13634162821940013
Epoch:  46


0.15587887449844465 0.1379722569669996
Epoch:  47


0.15575557866611997 0.13705024336065566
Epoch:  48


0.15623564132162043 0.13286514580249786
Epoch:  49


0.15508683549391256 0.14318460332495825
Epoch:  50


0.15544120645200885 0.1345335617661476
Epoch:  51


0.1563223370829144 0.13771790585347585
Epoch:  52


0.15544680767768138 0.13328360127551214
Epoch:  53


0.15516043595365575 0.1338267805320876
Epoch:  54


0.15503092512891098 0.13966302360807145
Epoch    54: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  55


0.15573757000871608 0.1316644474864006
Epoch:  56


0.1549041649779758 0.13238547103745596
Epoch:  57


0.15465050855198423 0.13263874926737376
Epoch:  58


0.15279122341323542 0.13245839944907598
Epoch:  59


0.1539698843214963 0.1326300406030246
Epoch:  60


0.15546061179122408 0.13222438309873855
Epoch:  61


0.1543438261425173 0.13215823897293635
Epoch    61: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  62


0.15636676066630595 0.13239248309816634
Epoch:  63


0.154385180489437 0.1318414126123701
Epoch:  64


0.15409832306810328 0.13249690192086355
Epoch:  65


0.1553273007676408 0.1323502670441355
Epoch:  66


0.15376209930793658 0.13196228657449996
Epoch:  67


0.15610340519531354 0.13225119348083222
Epoch    67: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  68


0.1552941658206888 0.13216791727713176
Epoch:  69


0.15504439738956657 0.1320759877562523
Epoch:  70


0.15635932619507248 0.13187413343361445
Epoch:  71


0.15442676278384956 0.13217426197869436
Epoch:  72


0.15478971600532532 0.13222511857748032
Epoch:  73


0.15493373250639117 0.13216752346072877
Epoch    73: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  74


0.15425885931865588 0.13240298735243933
Epoch:  75


0.15466860460268483 0.13200634930815017
Epoch:  76


0.15489901320354357 0.13200935614960535
Epoch:  77


0.1555425053512728 0.1324087679386139
Epoch:  78


0.15489469831054276 0.13166275088276183
Epoch:  79


0.1544790412928607 0.13295135966369084
Epoch:  80


0.1562877861229149 0.13229643127747945
Epoch:  81


0.1557264315920907 0.1326218226126262
Epoch:  82


0.15538731821485469 0.13219970890453883
Epoch:  83


0.15543780012710676 0.13217516456331527
Epoch:  84


0.15553894034914068 0.13216937759092875
Epoch:  85


0.15506956867269567 0.1321081125310489
Epoch:  86


0.15530104532435135 0.13203904458454677
Epoch:  87


0.15458253267649058 0.13192077087504522
Epoch:  88


0.15405842903498057 0.13206523337534495
Epoch:  89


0.15527835286952354 0.13188911761556352
Epoch:  90


0.15483491082449216 0.13227553559201105
Epoch:  91


0.15356428075481104 0.1320470901472228
Epoch:  92


0.15498015485905312 0.13188077296529496
Epoch:  93


0.15444304693389582 0.13247001383985793
Epoch:  94


0.15476364380604513 0.1323029792734555
Epoch:  95


0.15441111213452108 0.13177599757909775
Epoch:  96


0.1550949219916318 0.13230404470648086
Epoch:  97


0.15507878725593155 0.13224426976272038
Epoch:  98


0.15514359683603854 0.1324230751820973
Epoch:  99


0.1551543699728476 0.13193979007857187
