In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column; rows sharing a filename are numbered
        1, 2, ... in order of appearance (the ``slno`` level). Every other
        column is treated as a label column.
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known label
        columns it masks. The unknown columns are removed from the targets.
    logmelspec_dir : str
        Directory holding one ``<audio_filename>.npy`` feature file per
        recording.
    n_frames : int
        Number of leading rows kept from each transposed feature array.
    n_annotators : int
        Number of annotation rows (``slno`` 1..n_annotators) stacked per
        recording. Every recording needs at least this many rows, otherwise
        the per-annotator slices have different lengths and stacking fails.

    Returns
    -------
    X : numpy.ndarray, shape (n_files, 1, n_frames, n_features)
    y : numpy.ndarray, shape (n_files, n_labels, n_annotators)
    y_mask : numpy.ndarray, same shape as ``y``
        1 everywhere, except 0 for the known labels of an annotation whose
        corresponding unknown column is greater than 0.5.
    """
    # Number the annotations of each recording: 1, 2, 3, ...
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split off the "unknown" columns; they only drive the mask.
    unknown_columns = list(unknown_to_known.keys())
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so each annotator can be sliced out with
    # the recordings in the same (sorted) order.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotators = range(1, n_annotators + 1)
    y = np.stack([df.loc[[k], :].values for k in annotators], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in annotators], axis=2)

    # Load the features in the same recording order as the targets.
    filenames = df.loc[[1], :].index.get_level_values('audio_filename').tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance built with its default arguments (the class
# lives in ../RandomErasing and is not shown here). AudioDataset.__getitem__ calls
# it on every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Indexable dataset returning ``(sample, target, weight)`` triples.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When a
    ``transform`` is supplied, each sample is augmented on the fly; otherwise
    it is returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        # Converter used to hand samples to the image-based augmentation.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        n_items = self.X.shape[0]
        return n_items

    def __getitem__(self, idx):
        sample = self.X[idx, ...]
        if self.transform:
            sample = self._augment(sample)
        return sample, self.y[idx, ...], self.weights[idx, ...]

    def _augment(self, sample):
        """Apply the augmentation pipeline to one sample.

        Assumes ``sample`` is a 3-D torch tensor whose axis 1 is the one to
        cycle along (presumably time) -- TODO confirm against the caller.
        """
        # Rescale to [0, 1]; the range is remembered so it can be undone below.
        lowest = sample.min()
        highest = sample.max()
        scaled = (sample - lowest) / (highest - lowest)

        # Cyclic shift along axis 1 by a random offset: the element at
        # position `offset` becomes the first one.
        offset = int(np.random.randint(scaled.shape[1]))
        cycled = torch.roll(scaled, shifts=-offset, dims=1)

        # Image-style augmentation operates on a numpy array built from a PIL image.
        as_image = np.array(self.pil(cycled))
        augmented = self.transform(image=as_image)['image']
        augmented = augmented[None, :, :].permute(0, 2, 1)

        # Random erasing works on a detached copy.
        erased = random_erasing(augmented.clone().detach())

        # Map back from [0, 1] to the sample's original value range.
        return (erased * (highest - lowest)) + lowest

In [3]:
# Load and prepare data
# NOTE(review): pickle.load executes arbitrary code embedded in the file; only
# use this with a metadata.pkl from a trusted source.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# Taxonomy rows with fine_id == 'X' are the "unknown" fine labels; all other
# rows are the known fine labels.
taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# For each unknown fine label, list the known fine labels that share its
# coarse category (merge suffixes: fine_x = unknown side, fine_y = known side).
unknown_known_pairs = pd.merge(unknown_rows, known_rows, on='coarse', how='inner')
unknown_known_pairs = unknown_known_pairs.drop(columns='coarse')
unknown_to_known = (
    unknown_known_pairs
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = known_rows.fine.tolist()

# Put the coarse and fine annotation tables side by side, aligned on the index.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Any row whose values sum to exactly 37 is zeroed out, in both splits.
for split_df in (train_df, valid_df):
    rows_to_clear = split_df.sum(axis=1) == 37
    split_df.loc[rows_to_clear, :] = 0

In [5]:
# Build the feature arrays, the targets and the loss masks for both splits;
# the same unknown->known label mapping is used for each.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# One mean and one standard deviation per entry of the last axis, computed on
# the training set only and applied to both splits. The channel count is read
# from the data instead of being hard-coded.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
# A constant channel has zero spread; divide by 1 there so the result is 0
# instead of NaN/inf.
channel_stds = np.where(channel_stds == 0, 1.0, channel_stds)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# The steps run in the listed order; the last one converts the result to a tensor.
augmentation_steps = [
    # Small random shift / zoom / rotation (limits as interpreted by albumentations).
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    # Grid distortion with the library defaults.
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served unchanged.
train_tensors = [torch.Tensor(array) for array in (train_X, train_y, train_y_mask)]
valid_tensors = [torch.Tensor(array) for array in (valid_X, valid_y, valid_y_mask)]

train_dataset = AudioDataset(*train_tensors, transform=albumentations_transform)
valid_dataset = AudioDataset(*valid_tensors, transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain pairs of batches to mix.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is actually usable, otherwise fall back to the
# CPU so the notebook still runs on a machine without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` if it knows how to.

    Objects exposing a ``reset_parameters`` method have it called; anything
    else is left untouched. Intended for use with ``module.apply(...)``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Placeholder default for `until_x`; the next cell assigns the value that is
# actually used by the rest of the notebook.
until_x = None

In [12]:
# Parameters
# `until_x` is read by Task5Model: children of the MobileNetV2 feature
# extractor with index <= until_x get their weights re-initialised. It is also
# embedded in the checkpoint file name written by the training loop.
until_x = 9


In [13]:
class Task5Model(nn.Module):
    """Single-channel input -> MobileNetV2 features -> linear classifier head.

    Parameters
    ----------
    num_classes : int
        Size of the output layer (one logit per class).
    reset_until : int or None, optional
        Children of ``mv2.features`` with index ``<= reset_until`` are
        re-initialised via ``weight_reset`` and therefore lose their
        pretrained weights. When omitted (``None``) the module-level
        ``until_x`` is used, which keeps ``Task5Model(n)`` behaving as before.
        If that is ``None`` as well, no layer is reset.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # Learnable 1x1 convolutions mapping the single input channel to the
        # three channels the backbone expects.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classifier head; consumes the 1280 pooled feature channels.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Fall back to the notebook-level parameter when no explicit value is given.
        if reset_until is None:
            reset_until = until_x

        # Reset until ith layer of mv2 (inclusive); None means keep every
        # pretrained weight instead of failing on `i <= None`.
        if reset_until is not None:
            for i, block in enumerate(self.mv2.features.children()):
                if i <= reset_until:
                    block.apply(weight_reset)

    def forward(self, x):
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling over the last two axes: one value per channel.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 is the size of the output layer; presumably the number of label columns
# in the targets -- TODO confirm against train_y.shape[1].
model = Task5Model(31).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
optimizer = optim.Adam(params=model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch by the
# training loop; it lowers the learning rate after `patience` epochs without
# improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, patience=5, verbose=True)
# reduction='none' keeps the element-wise losses so the training loop can
# apply its own mask and normalisation.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss before stopping
alpha = 1  # both shape parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def _masked_bce(outputs, labels, masks):
    """Masked loss, normalised by the total mask weight.

    `labels` and `masks` carry one slice per annotation set on their last
    axis. The element-wise criterion is evaluated against every slice,
    weighted by the matching mask slice, summed, and divided by the sum of
    all mask values. The denominator is clamped so a fully masked batch gives
    a loss of 0 instead of NaN (which would corrupt the weights on backward).
    """
    n_annotation_sets = labels.shape[2]
    weighted_total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(n_annotation_sets))
    return weighted_total / masks.sum().clamp(min=1e-8)


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training ----
    model.train()
    this_epoch_train_loss = 0
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup: blend each item of the first batch with the item at the same
        # position in the second batch, using one coefficient per item for
        # inputs, labels and masks alike.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])
        lam = torch.Tensor(mixup_vals)
        lam_inputs = lam.view(-1, 1, 1, 1)
        lam_targets = lam.view(-1, 1, 1)

        inputs = (lam_inputs * i1[0]) + ((1 - lam_inputs) * i2[0])
        labels = (lam_targets * i1[1]) + ((1 - lam_targets) * i2[1])
        masks = (lam_targets * i1[2]) + ((1 - lam_targets) * i2[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = _masked_bce(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = _masked_bce(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Keep only the weights of the best epoch so far (by validation loss).
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping; the epoch that triggers it is neither printed nor passed
    # to the scheduler.
    if epochs_without_new_lowest >= early_stopping_patience:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6332928618869266 0.5071962305477687
Epoch:  1


0.32876943293455485 0.19735220074653625
Epoch:  2


0.1928890371644819 0.17482341825962067
Epoch:  3


0.18190914432744723 0.18067712230341776
Epoch:  4


0.17930110564102997 0.18241814630372183
Epoch:  5


0.17658072710037231 0.16798055597714015
Epoch:  6


0.17517699220696012 0.16301497604165757
Epoch:  7


0.17391405033098684 0.166947779910905
Epoch:  8


0.17201188851047206 0.15471425226756505
Epoch:  9


0.17113292901902585 0.1611939434494291
Epoch:  10


0.17022509711819725 0.1579551867076329
Epoch:  11


0.16986447774075172 0.15569796306746347
Epoch:  12


0.1699391232954489 0.1560515229191099
Epoch:  13


0.16736329568398967 0.15219611993857793
Epoch:  14


0.16717290717202263 0.1534020240817751
Epoch:  15


0.16822620984670278 0.15147898665496282
Epoch:  16


0.16798319204433546 0.15209767754588807
Epoch:  17


0.16784758302005562 0.14515709238392965
Epoch:  18


0.16533706518443855 0.1461984749351229
Epoch:  19


0.16567104853488304 0.14377381865467345
Epoch:  20


0.16591532608947238 0.15498422724860056
Epoch:  21


0.16495946049690247 0.14594470815999167
Epoch:  22


0.16519702205786835 0.14576386553900583
Epoch:  23


0.16357353772666003 0.14729974099567958
Epoch:  24


0.1637488543181806 0.15698306475366866
Epoch:  25


0.16303941688022097 0.15096734251294816
Epoch    25: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  26


0.16181347942030108 0.13906903777803695
Epoch:  27


0.15992431624515638 0.13832019163029535
Epoch:  28


0.1598644896938994 0.1379517274243491
Epoch:  29


0.16005955314314044 0.1413165916289602
Epoch:  30


0.1604079012129758 0.1374479298080717
Epoch:  31


0.15972230845206492 0.13807164345468795
Epoch:  32


0.16005819552653544 0.1391505609665598
Epoch:  33


0.1600638636060663 0.13743435059274947
Epoch:  34


0.1609120115235045 0.13797252199479512
Epoch:  35


0.15931202994810567 0.13731415463345392
Epoch:  36


0.15806955340746287 0.13815878544534957
Epoch:  37


0.1590123498761976 0.13748160856110708
Epoch:  38


0.15893167539222822 0.13742999413183757
Epoch:  39


0.15768793064194755 0.13661126685994013
Epoch:  40


0.16035791708005442 0.13776217294590815
Epoch:  41


0.15910433917432218 0.13757041203124182
Epoch:  42


0.1606656808305431 0.1368002582873617
Epoch:  43


0.15861861125842944 0.13656947123152868
Epoch:  44


0.15752459820863363 0.13630525661366327
Epoch:  45


0.15855329745524638 0.13676769605704717
Epoch:  46


0.15817644265857903 0.13714986720255443
Epoch:  47


0.15744818022122253 0.13672466895409993
Epoch:  48


0.1583520717717506 0.13632082832711084
Epoch:  49


0.1574490376420923 0.13592318764754704
Epoch:  50


0.1583000175856255 0.13616762310266495
Epoch:  51


0.15813729609992053 0.13578894840819494
Epoch:  52


0.1563247599311777 0.13540375445570266
Epoch:  53


0.15730427608296677 0.13521198821919306
Epoch:  54


0.15737151133047567 0.13639981406075613
Epoch:  55


0.15773025396707896 0.1359616070985794
Epoch:  56


0.15818429476506002 0.13596159113304956
Epoch:  57


0.15816837549209595 0.13667981965201242
Epoch:  58


0.15812623782737836 0.13606032303401402
Epoch:  59


0.1565931511891855 0.13539654122931616
Epoch    59: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  60


0.1579754300214149 0.1358407437801361
Epoch:  61


0.1567389755635648 0.1350682122366769
Epoch:  62


0.15759260428918376 0.13512994987624033
Epoch:  63


0.1557922862671517 0.1347406314952033
Epoch:  64


0.1584027022928805 0.13522891380957194
Epoch:  65


0.157763664384146 0.1349878226007734
Epoch:  66


0.15801927970873342 0.1347652929169791
Epoch:  67


0.15640234141736417 0.13518443490777696
Epoch:  68


0.15553276079732017 0.13510236782687052
Epoch:  69


0.1566306187494381 0.13528232915060862
Epoch    69: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  70


0.1558940845566827 0.13485787276710784
Epoch:  71


0.1576279043345838 0.1353287622332573
Epoch:  72


0.15615162776934133 0.1349222074661936
Epoch:  73


0.15634957439190633 0.13490430265665054
Epoch:  74


0.15706285391304944 0.13490364700555801
Epoch:  75


0.1581816806181057 0.1348553363765989
Epoch    75: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  76


0.15875841556368647 0.13504551670381001
Epoch:  77


0.15609068040912216 0.13486329785415105
Epoch:  78


0.1565176220359029 0.13491249403783254
Epoch:  79


0.1570147110803707 0.13464039989880153
Epoch:  80


0.15689177448685104 0.13550081316913878
Epoch:  81


0.1552549781831535 0.1346763393708638
Epoch:  82


0.1570646392332541 0.1350519359111786
Epoch:  83


0.15698945240394488 0.13571578796420777
Epoch:  84


0.15709573634572932 0.1349381527730397
Epoch:  85


0.15685835077955917 0.13499004180942262
Epoch    85: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  86


0.1548599540381818 0.13474202475377492
Epoch:  87


0.15690903003151352 0.13475168922117778
Epoch:  88


0.15653323523096135 0.13517187855073384
Epoch:  89


0.15655267198343534 0.13513618281909398
Epoch:  90


0.155962488135776 0.13529604034764425
Epoch:  91


0.15901136760776108 0.13551551210028784
Epoch:  92


0.1562311105631493 0.1350396658693041
Epoch:  93


0.1565680000427607 0.134960019162723
Epoch:  94


0.15805891115923185 0.135406388768128
Epoch:  95


0.1575870912622761 0.13521018837179458
Epoch:  96


0.15744647585056923 0.13494523827518737
Epoch:  97


0.15603683405631297 0.13496229265417373
Epoch:  98


0.1564540198525867 0.1356634029320308
Epoch:  99


0.15578592669319463 0.1351568496653012
