In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known,
                 path_template='../../data/logmelspec/{}.npy',
                 n_frames=635, n_annotators=3):
    """Turn an annotation table into model inputs, targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (audio file, annotation). After ``reset_index()`` it must
        expose an ``audio_filename`` column; every other column is a label.
        Assumes each file contributes at least ``n_annotators`` rows and that
        all files contribute the same number -- TODO confirm against the data.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known label
        columns it covers. The unknown columns are removed from the targets.
    path_template : str, optional
        ``str.format`` template that receives the audio filename and yields the
        path of the matching ``.npy`` feature file.
    n_frames : int, optional
        Maximum number of rows kept from each transposed feature array.
    n_annotators : int, optional
        Number of annotations per file stacked along the last target axis.

    Returns
    -------
    X : numpy.ndarray
        Features stacked as ``(n_files, 1, frames, bins)`` with
        ``frames <= n_frames``.
    y : numpy.ndarray
        Targets of shape ``(n_files, n_known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 everywhere except 0 for the known labels of an
        annotation whose covering unknown label is > 0.5.
    """
    unknown_cols = list(unknown_to_known.keys())

    # reset_index() returns a new frame, so the caller's frame is never mutated.
    df = df.reset_index()
    # Number the annotations of every file 1, 2, 3, ... in order of appearance.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from "every label counts" and switch off the known labels that an
    # annotator replaced by the catch-all unknown label of the same group.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so one annotator's rows (sorted by file
    # name) can be selected with a single .loc lookup.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    def stack_annotations(frame):
        # (n_files, n_labels) per annotation -> (n_files, n_labels, n_annotators)
        return np.concatenate(
            [frame.loc[[k], :].values[..., np.newaxis]
             for k in range(1, n_annotators + 1)],
            axis=2)

    y = stack_annotations(df)
    y_mask = stack_annotations(y_mask)

    # Same file order as the rows of y: the first annotation of every file.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.concatenate([
        np.expand_dims(np.load(path_template.format(name)).T[:n_frames, :], axis=0)
        for name in filenames])
    # Add the single input channel expected by the convolutional front end.
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance built with its default arguments;
# AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of (features, targets, weights) triples with optional augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Features indexed along the first axis; each item is treated as a
        ``(1, frames, bins)`` tensor by the augmentation path.
    y : torch.Tensor
        Targets, indexed along the first axis in step with ``X``.
    weights : torch.Tensor
        Per-target loss weights/masks, indexed in step with ``X``.
    transform : callable or None
        Albumentations-style callable invoked as ``transform(image=array)`` and
        returning a dict with an ``'image'`` entry. When falsy, samples are
        returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converts the scaled tensor to a PIL image before it is handed to the
        # array-based augmentation pipeline.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weights)`` for ``idx``, augmented if a transform is set."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would fill the
            # sample with NaN, so scale by 1 instead (the sample becomes all 0).
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore the leading channel axis and swap the last two axes back
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing (module-level `random_erasing` instance)
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only point this at a metadata file you produced or otherwise trust.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy = metadata['taxonomy_df']
# Fine labels with fine_id == 'X' are the catch-all "unknown" entries.
unknown_fine = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# For every unknown fine label, collect the known fine labels that share its
# coarse class. The merge suffixes name the two 'fine' columns fine_x / fine_y.
unknown_to_known = (
    unknown_fine
    .merge(known_fine, on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_fine.fine.tolist()

# Coarse and fine annotations side by side, one label per column.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [4]:
# manual correction for one data point
# Rows whose label values add up to exactly 37 are blanked out (all labels 0).
for annotations in (train_df, valid_df):
    rows_to_clear = annotations.sum(axis=1) == 37
    annotations.loc[rows_to_clear, :] = 0

In [5]:
# Build the inputs (X), per-annotation targets (y) and loss masks (y_mask)
# for the training and validation splits with the same label mapping.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# Statistics are taken over the training split only (one mean/std per entry of
# the last axis, which is assumed to have size 128) and reused for validation.
train_flat = train_X.reshape(-1, 128)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X, valid_X = [
    (features - channel_means) / channel_stds
    for features in (train_X, valid_X)
]

In [7]:
# Define the data augmentation transformations
augmentation_steps = [
    # small random shift / scale / rotation of the spectrogram image
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # grid distortion with the library's default settings
    GridDistortion(),
    # last step: hand the result back as a tensor
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
train_tensors = [torch.Tensor(arr) for arr in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(arr) for arr in (valid_X, valid_y, valid_y_mask)]

# Only the training set is augmented; validation samples pass through untouched.
train_dataset = AudioDataset(*train_tensors, albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled views of the training set: the training loop zips
# them so that every batch can be mixed with a differently ordered batch.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is usable and fall back to the CPU otherwise, so
# that the later `.to(device)` calls do not fail on a machine without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place when it offers ``reset_parameters()``.

    Objects without that method are left untouched, which makes the function
    suitable as a callback for ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default value of the `until_x` notebook parameter; the "Parameters" cell that
# follows assigns the value actually used. Task5Model reads this global and
# re-initialises the MobileNetV2 feature blocks whose index is <= until_x.
until_x = None

In [12]:
# Parameters
# NOTE(review): looks like a parameters cell injected by a notebook runner
# (e.g. papermill) -- confirm. The value also ends up in the checkpoint file
# name written by the training loop.
until_x = 15


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    The input passes through a small 1x1-convolution stem that maps the single
    channel to three, then through ``mobilenet_v2(...).features``, a global max
    over the two trailing axes, and a two-layer classification head.

    Parameters
    ----------
    num_classes : int
        Number of output logits produced by the head.

    Notes
    -----
    Reads the module-level ``until_x``: feature blocks of MobileNetV2 with
    index ``<= until_x`` are re-initialised through ``weight_reset``. When
    ``until_x`` is ``None`` no block is reset and all pretrained weights stay.
    """

    def __init__(self, num_classes):

        super().__init__()

        # 1 -> 10 -> 3 channels with 1x1 convolutions, so the single-channel
        # input matches the three-channel input of MobileNetV2.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # 1280 is the channel count fed to the head after global max pooling.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2
        # `until_x` may be None (the notebook's default); comparing an int with
        # None raises TypeError, so None is treated as "reset nothing".
        if until_x is not None:
            for i, x in enumerate(self.mv2.features.children()):
                if i <= until_x:
                    x.apply(weight_reset)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for input ``x``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # global max pooling: reduce the last axis, then the new last axis
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 is passed as num_classes, i.e. the number of logits of the final linear
# layer; presumably it has to equal the number of label columns left in the
# targets built by prepare_data -- verify when the taxonomy changes.
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# Adam with the AMSGrad variant over every parameter of the model.
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    amsgrad=True,
)
# Lowers the learning rate when the value given to scheduler.step() (the
# validation loss in the training loop) has stopped improving.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=5,
    verbose=True,
)
# reduction='none' keeps the element-wise losses so they can be multiplied by
# the per-label masks before being summed.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE over the three annotation sets, normalised by the mask total.

    ``labels`` and ``masks`` carry the annotation sets on their last axis; the
    element-wise loss of every set is weighted by its mask, the three sums are
    added in order and the result is divided by ``masks.sum()``.
    """
    per_set = [
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(3)
    ]
    return (per_set[0] + per_set[1] + per_set[2]) / masks.sum()


epochs = 100
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---------------- training pass ----------------
    this_epoch_train_loss = 0
    for (x_a, y_a, m_a), (x_b, y_b, m_b) in zip(train_loader_1, train_loader_2):

        # mixup: blend each sample of the first batch with the sample at the
        # same position of the second, independently shuffled, batch
        alpha = 1
        mixup_vals = np.random.beta(alpha, alpha, x_a.shape[0])

        # one coefficient per sample, broadcast over (channel, frames, bins)
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * x_a) + ((1 - lam) * x_b)

        # the same coefficients, broadcast over (labels, annotation sets)
        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * y_a) + ((1 - lam) * y_b)
        masks = (lam * m_a) + ((1 - lam) * m_b)

        inputs, labels, masks = inputs.to(device), labels.to(device), masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            model = model.train()
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
            this_epoch_train_loss += loss.detach().cpu().numpy()

    # ---------------- validation pass ----------------
    this_epoch_valid_loss = 0
    for inputs, labels, masks in val_loader:
        inputs, labels, masks = inputs.to(device), labels.to(device), masks.to(device)
        optimizer.zero_grad()
        with torch.set_grad_enabled(False):
            model = model.eval()
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.detach().cpu().numpy()

    # mean of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # checkpoint on every new best validation loss; otherwise count towards
    # early stopping
    improved = this_epoch_valid_loss < lowest_val_loss
    if improved:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # early stopping after 25 epochs without a new best
    # NOTE(review): the break happens before the print below, so the losses of
    # the stopping epoch are recorded in the histories but never printed.
    if epochs_without_new_lowest >= 25:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6366179488800667 0.5047373431069511
Epoch:  1


0.3346746616266869 0.2080968597105571
Epoch:  2


0.19380414566478213 0.18476330382483347
Epoch:  3


0.18082962889929075 0.17211373788969858
Epoch:  4


0.17844997990775752 0.1655834253345217
Epoch:  5


0.1760658649979411 0.16330471421991075
Epoch:  6


0.17400909879723112 0.15747069035257613
Epoch:  7


0.1720872867751766 0.40980849308626993
Epoch:  8


0.17222539556993022 0.16136924283845083
Epoch:  9


0.1721716140573089 0.159799690757479
Epoch:  10


0.16877508243998965 0.15309789989675796
Epoch:  11


0.16897506649429733 0.1536285557917186
Epoch:  12


0.1678604044624277 0.1513529462473733
Epoch:  13


0.16887349574952512 0.14639783118452346
Epoch:  14


0.16528452409280314 0.14680579943316324
Epoch:  15


0.16582942894987157 0.14400465999330794
Epoch:  16


0.165071231690613 0.14010590208428247
Epoch:  17


0.16397004433580348 0.14366847383124487
Epoch:  18


0.1634276601913813 0.14961982837745122
Epoch:  19


0.16310157203996503 0.14358266017266683
Epoch:  20


0.16227651971417503 0.13658448947327478
Epoch:  21


0.16336425615323558 0.14320088497229985
Epoch:  22


0.16184299741242383 0.14209061222417013
Epoch:  23


0.1600457675553657 0.14214876081262315
Epoch:  24


0.16104555975746465 0.13613259473017283
Epoch:  25


0.16010157602864342 0.14030819492680685
Epoch:  26


0.15938944429964633 0.1513168535062245
Epoch:  27


0.1599954717062615 0.13725300984723227
Epoch:  28


0.15955176506493543 0.1396702943103654
Epoch:  29


0.15931333520927946 0.13637488335371017
Epoch:  30


0.15993345508704315 0.1387531022940363
Epoch    30: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  31


0.1580639707075583 0.13179536483117513
Epoch:  32


0.1571564746869577 0.1310941002198628
Epoch:  33


0.15478779778287216 0.13039959647825786
Epoch:  34


0.15591286485259598 0.13023204143558229
Epoch:  35


0.1556333392053037 0.1305841984493392
Epoch:  36


0.15441141136594722 0.1302939344729696
Epoch:  37


0.15535598308653445 0.13040675542184285
Epoch:  38


0.15585729681156776 0.1295285544225148
Epoch:  39


0.154150366380408 0.12965163375650132
Epoch:  40


0.15589632335546855 0.1294720396399498
Epoch:  41


0.1556405960708051 0.12915666933570588
Epoch:  42


0.15556647890322917 0.12967689016035625
Epoch:  43


0.15452135011956497 0.12949719812188829
Epoch:  44


0.1550298212347804 0.12910368612834386
Epoch:  45


0.15622187063500687 0.1296448547925268
Epoch:  46


0.15602566865650383 0.12921099896941865
Epoch:  47


0.15451419675672376 0.13006532937288284
Epoch:  48


0.1535100630811743 0.12969012984207698
Epoch:  49


0.15441418217646108 0.12955495608704432
Epoch:  50


0.15527470772330826 0.12891765045268194
Epoch:  51


0.1547866222020742 0.1294017189315387
Epoch:  52


0.15452728762819962 0.12852728899036134
Epoch:  53


0.15372652540335785 0.12932998261281423
Epoch:  54


0.15367057114034086 0.1290445476770401
Epoch:  55


0.15413031505571828 0.12856807666165487
Epoch:  56


0.1545134508931959 0.1284130715898105
Epoch:  57


0.15397202485316508 0.12848365413291113
Epoch:  58


0.15349626299497243 0.12874966859817505
Epoch:  59


0.15406497024201057 0.12788612501961844
Epoch:  60


0.15535603946930654 0.12860981055668422
Epoch:  61


0.15495365334523692 0.12871974387339183
Epoch:  62


0.15369614837942897 0.1286511304123061
Epoch:  63


0.15431126992444735 0.1287027576139995
Epoch:  64


0.15405060511988564 0.12836442781346186
Epoch:  65


0.15311317105551023 0.12908300012350082
Epoch    65: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  66


0.15334491713626966 0.1286349275282451
Epoch:  67


0.15403506401422862 0.12872679318700517
Epoch:  68


0.15439503378159292 0.12832825950213841
Epoch:  69


0.15303514535362656 0.12810897401400975
Epoch:  70


0.1528817294417201 0.1281759887933731
Epoch:  71


0.1527627352121714 0.1282069736293384
Epoch    71: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  72


0.15377942372012782 0.12798897070544107
Epoch:  73


0.15270459813040657 0.1279014391558511
Epoch:  74


0.15332353678909508 0.12832538996423995
Epoch:  75


0.1539200620876776 0.12818082102707454
Epoch:  76


0.15357472566333977 0.1281665414571762
Epoch:  77


0.1531906695784749 0.1285482719540596
Epoch    77: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  78


0.15357234429668737 0.1281216580952917
Epoch:  79


0.15273467672837748 0.12857848512274878
Epoch:  80


0.15322036678726608 0.12772242299148015
Epoch:  81


0.15363913009295593 0.12850587282861983
Epoch:  82


0.15348126598306605 0.12872428553444998
Epoch:  83


0.15273245726082776 0.12799101216452463
Epoch:  84


0.15318288231218183 0.12841411786420004
Epoch:  85


0.15385479177977587 0.1283889529960496
Epoch:  86


0.15326953820280126 0.12819573602506093
Epoch    86: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  87


0.15245545353438403 0.12806394270488194
Epoch:  88


0.15354286818890958 0.1278445582304682
Epoch:  89


0.1534620958405572 0.1283774290766035
Epoch:  90


0.1530633359342008 0.12794162226574762
Epoch:  91


0.15241318256468386 0.1278400250843593
Epoch:  92


0.1534499965004019 0.1277880764433316
Epoch:  93


0.15377073835682226 0.12807813712528773
Epoch:  94


0.15273347899720474 0.12782820633479527
Epoch:  95


0.15244565380586161 0.1281091230256217
Epoch:  96


0.1525972554007092 0.12808042551789964
Epoch:  97


0.15348942618112307 0.1277391623173441
Epoch:  98


0.15262543671839945 0.12801448787961686
Epoch:  99


0.1541708631289972 0.12822468153067998
