In [1]:
# Parameters
# until_x: index into `mv2.features`; Task5Model re-initialises the weights of
# every feature block whose index is greater than this value. It is also used
# as the suffix of the checkpoint filename written by the training loop.
until_x = 8


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known,
                 spec_dir='../../data/logmelspec',
                 max_frames=635,
                 annotator_ids=(1, 2, 3)):
    """Build model inputs, per-annotator targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table whose index (or a column) is named ``audio_filename``,
        with one row per annotation of a file. The columns are label columns
        and must include every key of ``unknown_to_known``. The frame passed
        in is not modified.
    unknown_to_known : dict
        Maps an "unknown" label column to the list of known label columns
        that must be masked out whenever that unknown column is > 0.5.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` spectrogram per file.
    max_frames : int, optional
        Number of leading rows kept from each transposed spectrogram.
    annotator_ids : sequence of int, optional
        Per-file annotation ranks (1-based, in row order) stacked along the
        last axis of ``y`` and ``y_mask``.
        NOTE(review): assumes every file has a row for each of these ranks;
        otherwise the stacked arrays would not line up. Rows with other ranks
        are ignored.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, frames, n_bins)``; spectrograms loaded from
        ``spec_dir``, transposed and cropped to ``max_frames`` rows.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, len(annotator_ids))``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label contributes to the loss,
        0 where the matching unknown column was set for that annotation.
    """
    import os

    unknown_cols = list(unknown_to_known.keys())

    # Number each file's annotations 1, 2, 3, ... in row order and index by it.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the unknown columns off; only the known labels become targets.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start with every label enabled, then disable the known labels tied to
    # an unknown column wherever that unknown column is set.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation rank first so each rank can be selected as a slab
    # whose rows are sorted by audio_filename.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    y = np.stack(
        [df.loc[[rank], :].values for rank in annotator_ids], axis=2)
    y_mask = np.stack(
        [mask_df.loc[[rank], :].values for rank in annotator_ids], axis=2)

    # File order follows the first annotator slab so X rows align with y rows.
    filenames = (df.loc[[annotator_ids[0]], :]
                 .reset_index(1).audio_filename.tolist())
    X = np.stack([
        np.load(os.path.join(spec_dir, '{}.npy'.format(name))).T[:max_frames, :]
        for name in filenames])
    X = X[:, np.newaxis]  # add the single channel axis

    return X, y, y_mask


# Shared RandomErasing instance (class imported from the project-local
# RandomErasing module). AudioDataset.__getitem__ calls it on each augmented
# sample, so it must exist before the training dataset is indexed.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples.

    Parameters
    ----------
    X : torch.Tensor
        Inputs indexed along the first axis. The augmentation path slices and
        concatenates along axis 1 of a single sample, so each sample is
        expected to be 3-D (the notebook passes ``(1, frames, bins)``).
    y : torch.Tensor
        Targets, indexed along the first axis in step with ``X``.
    weights : torch.Tensor
        Per-target loss weights / masks, indexed in step with ``X``.
    transform : callable, optional
        Albumentations-style callable invoked as ``transform(image=array)``
        and returning a dict with an ``'image'`` entry. When falsy, samples
        are returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        # Converter used to hand the sample to the image-based transform.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting the sample
        when a transform was supplied."""
        sample = self.X[idx, ...]

        if self.transform:
            # Min-max scale to [0, 1] before the image conversion. A constant
            # sample has max == min; dividing by that zero span would produce
            # NaN/inf, so fall back to a span of 1 (the sample then scales to
            # all zeros and is restored to its constant value below).
            this_min = sample.min()
            this_max = sample.max()
            span = this_max - this_min
            if span == 0:
                span = torch.ones_like(span)
            sample = (sample - this_min) / span

            # Randomly cycle the file: rotate along axis 1 by a random offset.
            shift = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, shift:, :],
                sample[:, :shift, :]],
                dim=1)

            # Apply the albumentations transforms on the image form.
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)['image']
            # Restore a leading channel axis and swap the two remaining axes.
            # NOTE(review): presumably undoes the axis order produced by the
            # transform's tensor conversion — confirm against albumentations.
            sample = sample[None, :, :].permute(0, 2, 1)

            # Apply random erasing (module-level `random_erasing` instance).
            sample = random_erasing(sample.clone().detach())

            # Revert the min-max transformation.
            sample = (sample * span) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only open a trusted metadata.pkl.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the rows with fine_id 'X' and all remaining rows.
taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# Pair every 'X' fine label with the other fine labels sharing its coarse
# class, then collect those labels into a list per 'X' label.
paired = (
    pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = paired.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_rows.fine.tolist()

# Coarse and fine label tables side by side, aligned on their index.
train_parts = [metadata['coarse_train'], metadata['fine_train']]
valid_parts = [metadata['coarse_test'], metadata['fine_test']]
train_df = pd.concat(train_parts, axis=1, sort=True)
valid_df = pd.concat(valid_parts, axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose label values sum to exactly 37 is zeroed out, in both frames.
for labels_df in (train_df, valid_df):
    rows_to_clear = labels_df.sum(axis=1) == 37
    labels_df.loc[rows_to_clear, :] = 0

In [6]:
# Build the (inputs, targets, loss mask) arrays for each split with
# prepare_data; both splits share the same unknown -> known label mapping.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# Statistics are computed on the training inputs only (last axis = 128
# channels) and then applied to both the training and validation inputs.
train_flat = train_X.reshape(-1, 128)
stats_shape = (1, 1, 1, -1)  # broadcastable against (N, 1, frames, channels)
channel_means = train_flat.mean(axis=0).reshape(stats_shape)
channel_stds = train_flat.std(axis=0).reshape(stats_shape)
del train_flat  # do not keep the un-normalised array alive

train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Steps run in list order; the last one converts the augmented image to a
# tensor, which AudioDataset.__getitem__ reads from the 'image' entry.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [9]:
# Create the datasets and the dataloaders
# Only the training dataset is augmented; validation samples pass through as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches it mixes together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# Use the first CUDA device when one is available and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Re-initialise ``layer`` when it exposes ``reset_parameters``.

    Objects without a ``reset_parameters`` attribute are left untouched, which
    makes the function suitable as a callback for ``nn.Module.apply``.
    """
    try:
        reset = layer.reset_parameters
    except AttributeError:
        return
    reset()

In [12]:
class Task5Model(nn.Module):
    """Classifier built on the MobileNetV2 feature extractor.

    A small stack of 1x1 convolutions turns the single-channel input into
    three channels, the pretrained ``mobilenet_v2`` feature blocks process it,
    the result is max-pooled over its last two axes, and a two-layer head
    produces ``num_classes`` logits.

    Feature blocks whose index exceeds the notebook-level ``until_x`` have
    their parameters re-initialised, discarding the pretrained weights there.
    """

    def __init__(self, num_classes):
        super().__init__()

        # 1 -> 10 -> 3 channels using 1x1 convolutions.
        channel_lift = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*channel_lift)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        head = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head)

        # Reset after ith layer of mv2
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch ``x`` of single-channel inputs."""
        feature_maps = self.mv2.features(self.bw2col(x))
        # Global max pooling: reduce the last axis, then the new last axis.
        pooled, _ = feature_maps.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 is passed as num_classes (the number of output logits); the model is then
# moved to the selected device.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch by the
# training loop and reports each learning-rate reduction it makes.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)
# Element-wise (unreduced) loss, so the training loop can weight every entry
# with its mask before reducing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def masked_bce_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over all annotation sets.

    Parameters
    ----------
    criterion : callable
        Element-wise loss (no reduction) taking ``(outputs, targets)``.
    outputs : torch.Tensor
        Model logits; must match the shape of ``labels[:, :, k]``.
    labels, masks : torch.Tensor
        Targets and their weights; the last axis enumerates the annotation
        sets, each of which is compared against the same ``outputs``.

    Returns
    -------
    torch.Tensor
        Scalar: the sum of the masked element-wise losses over every
        annotation set, divided by the sum of the masks.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
alpha = 1  # both parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass -------------------------------------------------
    model.train()
    this_epoch_train_loss = 0
    # The two loaders shuffle the same dataset independently, so every step
    # receives two different batches to mix.
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One mixing coefficient per sample, shared by inputs, labels, masks.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * i1[0]) + ((1 - lam) * i2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * i1[1]) + ((1 - lam) * i2[1])
        masks = (lam * i1[2]) + ((1 - lam) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_bce_loss(criterion, outputs, labels, masks)
        loss.backward()
        optimizer.step()
        # .item() yields a Python float, so the running sum does not depend
        # on NumPy's scalar promotion rules.
        this_epoch_train_loss += loss.item()

    # ---- validation pass -----------------------------------------------
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss so far.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.611291812883841 0.4859614074230194
Epoch:  1


0.27702840236393184 0.178997774209295
Epoch:  2


0.1797893361465351 0.15594685077667236
Epoch:  3


0.17043198806208534 0.15056186275822775
Epoch:  4


0.16677146102931048 0.1487884862082345
Epoch:  5


0.16526194923632853 0.14741180198533194
Epoch:  6


0.1645327081551423 0.1474728403346879
Epoch:  7


0.1615003069510331 0.14334770824228013
Epoch:  8


0.16020609639786385 0.13818908908537456
Epoch:  9


0.15982904788610097 0.1391329105411257
Epoch:  10


0.16004832169494113 0.13947425995554244
Epoch:  11


0.15889634272536715 0.13377077451774053
Epoch:  12


0.15877471944770297 0.13457390878881728
Epoch:  13


0.15778807490258603 0.14701056906155177
Epoch:  14


0.15714024973882212 0.13274904872689927
Epoch:  15


0.15662214884886871 0.1373072149498122
Epoch:  16


0.1555893405063732 0.13127289073807852
Epoch:  17


0.15603697541597727 0.13347362939800536
Epoch:  18


0.15479285652573044 0.13088720824037278
Epoch:  19


0.15304272279546066 0.1308587778891836
Epoch:  20


0.15414639220044418 0.1316003171460969
Epoch:  21


0.1530682879525262 0.13172444062573568
Epoch:  22


0.15393713519379898 0.13340667741639273
Epoch:  23


0.1520911688740189 0.13006099313497543
Epoch:  24


0.1531801159317429 0.13021034534488404
Epoch:  25


0.15355265502994125 0.13643506275756018
Epoch:  26


0.15189219326586337 0.1292321884206363
Epoch:  27


0.15286786934813937 0.1317997702530452
Epoch:  28


0.15227028163703712 0.12861743888684682
Epoch:  29


0.1521709633840097 0.13182168028184346
Epoch:  30


0.1505561728735228 0.1279618271759578
Epoch:  31


0.15166344111030167 0.1360761182648795
Epoch:  32


0.15132287627941854 0.133061517562185
Epoch:  33


0.15188480390084758 0.12974530032702855
Epoch:  34


0.14913604630006327 0.12979055089609964
Epoch:  35


0.15023157926830086 0.12710707208939961
Epoch:  36


0.149547269618189 0.12676641664334706
Epoch:  37


0.15010359118113648 0.1264422982931137
Epoch:  38


0.1510606545048791 0.13540694011109217
Epoch:  39


0.15190787734212102 0.12806364255292074
Epoch:  40


0.15084342296059067 0.12842323631048203
Epoch:  41


0.1483308088940543 0.12842129170894623
Epoch:  42


0.14711522774116412 0.12691929191350937
Epoch:  43


0.14996638652440664 0.12833143983568465
Epoch    43: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  44


0.14780217288313685 0.1241556044135775
Epoch:  45


0.14611356363103195 0.12382660380431584
Epoch:  46


0.14646698010934367 0.12355710991791316
Epoch:  47


0.14535815208344846 0.12328229738133294
Epoch:  48


0.1463837957865483 0.12380297588450569
Epoch:  49


0.1465269041222495 0.1241438654916627
Epoch:  50


0.14612077619578387 0.123763379241739
Epoch:  51


0.14514775936667984 0.12279444400753294
Epoch:  52


0.14450343839220098 0.12308249409709658
Epoch:  53


0.1443712610650707 0.12309073443923678
Epoch:  54


0.14568290356043223 0.12370633014610835
Epoch:  55


0.14427421503775828 0.12316959883485522
Epoch:  56


0.14585881942027323 0.12347918323108129
Epoch:  57


0.14385431521647685 0.12355797737836838
Epoch    57: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  58


0.1437781759210535 0.1230064257979393
Epoch:  59


0.14454925140819033 0.12287607469729014
Epoch:  60


0.14362828997341362 0.12316947749682836
Epoch:  61


0.14380111968195117 0.12283993193081447
Epoch:  62


0.14481920084437808 0.12277492135763168
Epoch:  63


0.14412025301843076 0.12306134615625654
Epoch:  64


0.1435043014384605 0.12295113929680415
Epoch:  65


0.14553897646633354 0.12318190080778939
Epoch:  66


0.14464057418140205 0.12297268424715314
Epoch:  67


0.14290653088608304 0.1232033861534936
Epoch:  68


0.14442963696814873 0.12307904554264885
Epoch    68: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  69


0.14487213621268402 0.12306550251586097
Epoch:  70


0.14479654362878283 0.1231143634234156
Epoch:  71


0.143675530681739 0.12298836346183505
Epoch:  72


0.14388639177825 0.12311631973300662
Epoch:  73


0.14439193541939194 0.12317193512405668
Epoch:  74


0.1448435271913941 0.12289120363337654
Epoch    74: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  75


0.14333036743305824 0.12324823971305575
Epoch:  76


0.14486046618706472 0.12284065357276372
Epoch:  77


0.14435771188220461 0.12315477111509868
Epoch:  78


0.14478247270390793 0.12275593940700803
Epoch:  79


0.1447407071654861 0.12311898597649165
Epoch:  80


0.14418863646081975 0.12300883552857808
Epoch:  81


0.14451270006798408 0.12319461469139371
Epoch:  82


0.1437676782543595 0.1229787969163486
Epoch:  83


0.14524904860032573 0.12312039839369911
Epoch:  84


0.14478600145997228 0.12290569501263755
Epoch    84: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  85


0.14429030587544311 0.12313891627958842
Epoch:  86


0.14419625296786026 0.12318956639085497
Epoch:  87


0.14569448860915932 0.12309412977525166
Epoch:  88


0.14492013889390068 0.1229801316346441
Epoch:  89


0.1445887845915717 0.12291499653032847
Epoch:  90


0.1429447918324857 0.12311483068125588
Epoch:  91


0.14464865785998268 0.1228360842381205
Epoch:  92


0.14554931022025444 0.12281004445893424
Epoch:  93


0.14463736559893633 0.12306668077196393
Epoch:  94


0.14561108318535057 0.12290012836456299
Epoch:  95


0.14404879671496315 0.12337746471166611
Epoch:  96


0.1440014021622168 0.12323492233242307
Epoch:  97


0.14483548257801984 0.12303399401051658
Epoch:  98


0.14416341926600482 0.12314232864550181
Epoch:  99


0.144848187630241 0.12296124015535627
