In [1]:
# Parameters
# until_x is read by Task5Model.__init__: every child of the MobileNetV2 `features`
# container whose position (0-based) is strictly greater than until_x has its
# parameters re-initialised; children at positions <= until_x keep their loaded weights.
until_x = 6


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. The index must be named ``audio_filename`` and a
        recording appears once per annotation; the columns are the label columns.
        The caller's frame is not modified.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known label
        columns that must be masked out whenever that unknown column is > 0.5.
        The unknown columns themselves are removed from the targets.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotation rows used per recording. Rows are numbered 1, 2, ...
        in order of appearance within a recording; rows beyond ``n_annotators``
        are ignored.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_recordings, 1, n_frames, n_columns_of_transposed_array)``.
    y : numpy.ndarray
        Shape ``(n_recordings, n_known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target counts towards the loss, 0 where
        the matching unknown label was set for that annotation.

    Raises
    ------
    ValueError
        From ``np.stack`` when the per-annotator slices (or the loaded arrays) do
        not all share one shape, e.g. a recording with fewer than
        ``n_annotators`` rows.
    """
    df = df.reset_index()
    # Running count inside each recording -> annotation number 1, 2, 3, ...
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns]
    df = df.drop(columns=unknown_columns)

    # Start from "everything counts" and switch off the known labels that an
    # annotator replaced by the corresponding unknown label.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so one annotator can be selected with .loc;
    # sorting makes the recording order identical for every annotator slice.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotator_ids = range(1, n_annotators + 1)
    y = np.stack([df.loc[[k], :].values for k in annotator_ids], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in annotator_ids], axis=2)

    # Recording order of the first annotator slice (same order as y / y_mask).
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    # Insert the singleton channel axis.
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance built with its default arguments;
# AudioDataset.__getitem__ calls it on each augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with optional on-the-fly augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs, indexed along the first axis; each item is a 3-D tensor.
    y : torch.Tensor
        Targets, indexed along the first axis.
    weights : torch.Tensor
        Loss masks / weights, indexed along the first axis.
    transform : callable, optional
        Called as ``transform(image=array)`` and expected to return a mapping with
        an ``'image'`` entry. When falsy, items are returned without augmentation.
    erasing : callable, optional
        Applied to the augmented tensor. When omitted, the module-level
        ``random_erasing`` object is used.
    """

    def __init__(self, X, y, weights, transform=None, erasing=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.erasing = erasing
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of items, i.e. the size of the first axis of ``X``."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for ``idx``; augmented if a transform is set."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole sample into NaN, so scale by one instead.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate along axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms
            # NOTE(review): the PIL round trip presumably quantises the scaled
            # values to 8 bits - confirm against torchvision's ToPILImage.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the last two axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            eraser = self.erasing if self.erasing is not None else random_erasing
            sample = eraser(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE(review): pickle.load can execute arbitrary code; only open a trusted metadata file.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

taxonomy_df = metadata['taxonomy_df']
unknown_fine = taxonomy_df.loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy_df.loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']]

# Pair each fine label whose fine_id is 'X' with the other fine labels of the same
# coarse class, then collect those labels into one list per 'X' label.
fine_pairs = pd.merge(unknown_fine, known_fine, on='coarse', how='inner')
fine_pairs = fine_pairs.drop(columns='coarse')
unknown_to_known = fine_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()

known_labels = known_fine.fine.tolist()

# Coarse and fine label tables side by side, for the training and the test split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1,
    sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1,
    sort=True)

In [5]:
# manual correction for one data point
# Any row whose values add up to 37 is blanked out (all columns set to 0).
# NOTE(review): 37 is presumably the total number of label columns - confirm.
for label_df in (train_df, valid_df):
    all_set = label_df.sum(axis=1) == 37
    label_df.loc[all_set, :] = 0

In [6]:
# Build inputs (X), targets (y) and loss masks (y_mask) for the training and the
# validation split with the same unknown -> known label mapping.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are taken per entry of the last axis, over the training data only,
# and then applied to both splits. The size of that axis is read from the array
# rather than hard-coded, so other spectrogram widths are handled correctly.
n_channels = train_X.shape[-1]
train_flat = train_X.reshape(-1, n_channels)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in this order: shift/scale/rotate, grid distortion, conversion to a tensor.
shift_scale_rotate = ShiftScaleRotate(
    shift_limit=0.1,
    scale_limit=0.1,
    rotate_limit=0.5)
grid_distortion = GridDistortion()
albumentations_transform = Compose([shift_scale_rotate, grid_distortion, ToTensor()])

In [9]:
# Create the datasets and the dataloaders
batch_size = 64

train_tensors = [torch.Tensor(array) for array in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(array) for array in (valid_X, valid_y, valid_y_mask)]

# Only the training set is augmented; the validation set gets no transform.
train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to get a pair of batches to mix.
train_loader_1 = DataLoader(train_dataset, batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size, shuffle=True)

In [10]:
# Define the device to be used
# Use the first GPU when CUDA is actually usable and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Call ``layer.reset_parameters()`` when the layer has that attribute.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function suitable as a callback for ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor wrapped for single-channel inputs.

    A small stack of 1x1 convolutions maps the one input channel to three
    channels, the MobileNetV2 ``features`` container processes the result, the
    two trailing axes are max-pooled away and a two-layer head produces
    ``num_classes`` outputs. The children of ``features`` at positions greater
    than the module-level ``until_x`` are re-initialised via ``weight_reset``.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 channel -> 10 -> 3 channels, using 1x1 kernels
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        )

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Head on top of the 1280 pooled features
        self.final = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        )

        # Reset after ith layer of mv2
        for position, block in enumerate(self.mv2.features.children()):
            if position > until_x:
                block.apply(weight_reset)

    def forward(self, x):
        """Return the head's output for a batch ``x``."""
        three_channel = self.bw2col(x)
        feature_map = self.mv2.features(three_channel)
        # Max over the last axis, then over the new last axis.
        pooled_last, _ = feature_map.max(dim=-1)
        pooled, _ = pooled_last.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 is the number of outputs of the final linear layer; the loss below compares
# the outputs with labels[:, :, k], so it must equal the size of the label axis.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    amsgrad=True,
)
# The scheduler is stepped with the validation loss at the end of every epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=5,
    verbose=True,
)
# reduction='none' keeps one loss value per element so it can be masked later.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def masked_bce_loss(loss_fn, outputs, labels, masks):
    """Masked loss over every annotation set, normalised by the mask total.

    ``labels`` and ``masks`` are indexed as ``[batch, class, annotation set]``.
    The same ``outputs`` are scored against each annotation set with the
    element-wise ``loss_fn``, weighted by the matching mask slice, summed, and
    finally divided by ``masks.sum()``.
    """
    total = sum(
        (loss_fn(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
patience = 25  # stop after this many epochs without a new best validation loss
alpha = 1      # both parameters of the Beta distribution the mixup weights come from
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One mixing weight per example, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_bce_loss(criterion, outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the weights of the best epoch so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6143959438478624 0.5405417425291879
Epoch:  1


0.29252703528146484 0.22170924076012202
Epoch:  2


0.19162765950769992 0.17393408715724945
Epoch:  3


0.18378116391800545 0.17138620572430746
Epoch:  4


0.1824167302331409 0.17299721283572062
Epoch:  5


0.18071387990100965 0.24163648911884852
Epoch:  6


0.17922056083743637 0.16660494889531816
Epoch:  7


0.1758134135523358 0.17091019877365657
Epoch:  8


0.1711489340743503 0.1616506746837071
Epoch:  9


0.1699258729412749 0.15156027248927526
Epoch:  10


0.16626419851908814 0.15011500035013473
Epoch:  11


0.16459337118509654 0.1482613640172141
Epoch:  12


0.1636958182663531 0.14698434514658792
Epoch:  13


0.1614828786334476 0.14266929349728993
Epoch:  14


0.161990071470673 0.15182717783110483
Epoch:  15


0.1621344109644761 0.14058079570531845
Epoch:  16


0.1627172412904533 0.1491543778351375
Epoch:  17


0.16139417927007418 0.13747635270868028
Epoch:  18


0.15939349942916148 0.14175168105534144
Epoch:  19


0.1598122840797579 0.13728406067405427
Epoch:  20


0.15769659304941022 0.13779424343790328
Epoch:  21


0.15779775421361666 0.14089846078838622
Epoch:  22


0.1574798145809689 0.13326510254825866
Epoch:  23


0.15598918054554914 0.13733615513358796
Epoch:  24


0.15656980872154236 0.13415117348943437
Epoch:  25


0.15536245945337657 0.13272906414100102
Epoch:  26


0.15552916760380203 0.13156864047050476
Epoch:  27


0.1540612658938846 0.13186100125312805
Epoch:  28


0.1540585784493266 0.1317261021052088
Epoch:  29


0.15501877383605853 0.13717056597982133
Epoch:  30


0.15495259657099442 0.13180775195360184
Epoch:  31


0.15476916448490038 0.12906659500939505
Epoch:  32


0.15404442476259694 0.13172664919069835
Epoch:  33


0.15274022318221428 0.12841368785926274
Epoch:  34


0.15282539419225744 0.12976826620953424
Epoch:  35


0.15267997778750755 0.1290107529078211
Epoch:  36


0.15265775854523117 0.132132993212768
Epoch:  37


0.152769344481262 0.1319166858281408
Epoch:  38


0.15091426952465162 0.12871678918600082
Epoch:  39


0.15109409714067304 0.12924637539046152
Epoch    39: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  40


0.1503661278937314 0.12500238631452834
Epoch:  41


0.1487866950196189 0.12495628850800651
Epoch:  42


0.14955347615319328 0.12447464359658104
Epoch:  43


0.14844396347935135 0.12456234438078743
Epoch:  44


0.14907437804582957 0.12451310668672834
Epoch:  45


0.14907671833360517 0.12428454841886248
Epoch:  46


0.1474186612947567 0.12414603573935372
Epoch:  47


0.1478209596227955 0.1244799771479198
Epoch:  48


0.1480408451847128 0.12384388808693204
Epoch:  49


0.14826355391257517 0.12405270550932203
Epoch:  50


0.14855976886040456 0.12351925245353154
Epoch:  51


0.1483308797752535 0.12360283945287977
Epoch:  52


0.14768722774209203 0.12362595008952278
Epoch:  53


0.1481416885917251 0.12390084138938359
Epoch:  54


0.1476245305022678 0.12442619992153985
Epoch:  55


0.14689515812976942 0.12351344845124654
Epoch:  56


0.1492984254617949 0.12402345559426717
Epoch    56: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  57


0.14798926944668228 0.12371817231178284
Epoch:  58


0.14882505302493637 0.12356592395475932
Epoch:  59


0.14766044914722443 0.12343272353921618
Epoch:  60


0.14772430062294006 0.12367921109710421
Epoch:  61


0.14763760647258242 0.12377611441271645
Epoch:  62


0.14689264547180486 0.12378945095198494
Epoch:  63


0.14804649353027344 0.12347602418490819
Epoch:  64


0.1483887144037195 0.123440220952034
Epoch:  65


0.14673296785032428 0.12344703397580556
Epoch    65: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  66


0.14802812442586227 0.12380470548357282
Epoch:  67


0.14771411145055616 0.12344702226775033
Epoch:  68


0.1469296386113038 0.12356262654066086
Epoch:  69


0.1468121812150285 0.12361151937927518
Epoch:  70


0.1467033032629941 0.1233245890055384
Epoch:  71


0.14695717555445595 0.12359791994094849
Epoch:  72


0.14794936695614377 0.12339210510253906
Epoch:  73


0.1468605511897319 0.12355030647345952
Epoch:  74


0.14628014814209295 0.12353531590529851
Epoch:  75


0.14736629619791702 0.12340502334492547
Epoch:  76


0.1459254231807348 0.12338778589453016
Epoch    76: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  77


0.14778513400941282 0.12336435807602746
Epoch:  78


0.14702403988387133 0.12336234109742301
Epoch:  79


0.14627269997790054 0.12330232454197747
Epoch:  80


0.14618776859463872 0.12359768152236938
Epoch:  81


0.1466489184547115 0.12359587848186493
Epoch:  82


0.14712393163023768 0.1234861548457827
Epoch:  83


0.1485896879756773 0.12365025814090456
Epoch:  84


0.1469130314685203 0.12354277393647603
Epoch:  85


0.14719268719892245 0.12347685758556638
Epoch    85: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  86


0.14679791999829783 0.12346932824168887
Epoch:  87


0.14802259408138893 0.12338808711086001
Epoch:  88


0.14853436238056905 0.12342308248792376
Epoch:  89


0.14666608016233187 0.12344626230852944
Epoch:  90


0.14693200507679502 0.12324136814900807
Epoch:  91


0.14687074277852033 0.12350174252476011
Epoch:  92


0.1476056833524962 0.1236555767910821
Epoch:  93


0.14656365441309438 0.12352591007947922
Epoch:  94


0.14690446330083384 0.12334881510053362
Epoch:  95


0.1474037883249489 0.1233411974140576
Epoch:  96


0.1469948984481193 0.12348777800798416
Epoch:  97


0.14816978211338455 0.12361000265393939
Epoch:  98


0.14663643369803558 0.12370048889092036
Epoch:  99


0.14667375466308077 0.12360255633081708
