In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. Its index must be named ``audio_filename``
        (it is reset and regrouped on that name). Columns are label
        indicators and must include every key of ``unknown_to_known``.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of known label columns
        that are masked out (mask value 0) for an annotation in which that
        unknown label is greater than 0.5.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotations : int, optional
        Number of annotations used per file. Annotations are numbered
        1..n_annotations in their order of appearance within ``df``.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, rows, cols)``: each loaded array transposed and
        truncated to ``n_frames`` rows. All truncated arrays must share one
        shape, otherwise stacking fails.
    y : numpy.ndarray
        Shape ``(n_files, n_labels, n_annotations)``: the label columns left
        after dropping the unknown ones.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label counts towards the loss and 0
        where it was masked by an unknown label.

    Files are ordered by sorted ``audio_filename`` in all three outputs.
    """
    unknown_cols = list(unknown_to_known.keys())

    df = df.reset_index()
    # Number each file's annotations 1, 2, 3, ... in order of appearance.
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns off from the training targets.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    # Start from an all-ones mask and zero the known labels that an
    # annotation's unknown flag makes unreliable.
    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so one annotation number selects a
    # (n_files, n_labels) slab whose rows are sorted by filename.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([mask_df.loc[[k], :].values for k in annotation_ids], axis=2)

    # Load the arrays in the same file order as the rows of y / y_mask.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (default arguments); AudioDataset.__getitem__
# calls it on each sample when an augmentation transform is configured.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of array samples with optional on-the-fly augmentation.

    Each item is the tuple ``(sample, y[idx], weights[idx])``.

    When ``transform`` is truthy the sample is min-max scaled, cyclically
    shifted by a random offset along dim 1, passed through ``transform``
    (called as ``transform(image=array)`` and expected to return a dict with
    an ``'image'`` tensor), passed through the module-level
    ``random_erasing`` callable, and finally mapped back to its original
    value range. Without a transform the stored sample is returned as is.

    Parameters
    ----------
    X : torch.Tensor
        Samples, indexed along the first dimension.
    y : indexable
        Targets aligned with ``X``.
    weights : indexable
        Per-target weights / masks aligned with ``X``.
    transform : callable or None
        Augmentation pipeline; ``None`` disables augmentation.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Number of samples (size of the first dimension of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, target, weight)`` for ``idx``, augmented if configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            value_range = sample.max() - this_min
            # A constant sample has zero range: dividing by it would give
            # 0/0 = NaN. Divide by one instead so the scaled sample is all
            # zeros; the revert below still uses the true range.
            if value_range > 0:
                divisor = value_range
            else:
                divisor = torch.ones_like(value_range)
            sample = (sample - this_min) / divisor

            # randomly cycle the file: rotate along dim 1 so that position
            # `offset` becomes the new start
            offset = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, offset:, :],
                sample[:, :offset, :]],
                dim=1)

            # apply albumentations transforms; the round trip through a PIL
            # image yields the numpy array the transform is called with
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # add a leading axis and swap the last two axes
            # NOTE(review): presumably this undoes the axis order produced by
            # the transform's tensor conversion - confirm.
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Taxonomy rows whose fine_id is 'X' are the "unknown" fine labels; every
# other row is a known label.
unknown_fine = metadata['taxonomy_df'].loc[lambda x: x.fine_id == 'X', ['fine', 'coarse']]
known_fine = metadata['taxonomy_df'].loc[lambda x: x.fine_id != 'X', ['fine', 'coarse']]

# Pair each unknown label with the known labels sharing its coarse value
# (the merge suffixes the two 'fine' columns as fine_x / fine_y), then
# collect those known labels into one list per unknown label.
paired = unknown_fine.merge(known_fine, on='coarse', how='inner').drop(columns='coarse')
unknown_to_known = paired.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_fine.fine.tolist()

# Place the coarse and fine annotation tables side by side for each split.
train_df = pd.concat([metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat([metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point
# Rows whose labels sum to 37 are zeroed out in both splits.
# NOTE(review): 37 presumably equals the number of label columns, i.e. a row
# with every label set - confirm.
for annotations in (train_df, valid_df):
    every_label_set = annotations.sum(axis=1) == 37
    annotations.loc[every_label_set, :] = 0

In [5]:
# Build inputs (X), targets (y) and loss masks (y_mask) for each split using
# the same unknown -> known label mapping.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# One mean / std per position of the last axis, computed on the training
# split only and reused for the validation split. The channel count is read
# from the data instead of being hard-coded, so arrays with a different
# last-axis size are handled as well.
n_channels = train_X.shape[-1]
channel_means = train_X.reshape(-1, n_channels).mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_X.reshape(-1, n_channels).std(axis=0).reshape(1, 1, 1, -1)
# NOTE: re-running this cell normalises train_X / valid_X a second time;
# re-run the prepare_data cell first.
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# Applied in order: a random shift/scale/rotate, a grid distortion with its
# default settings, then conversion of the result to a tensor. Only the
# training dataset receives this pipeline; validation passes None.
albumentations_transform = Compose([
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor()
])

In [8]:
# Create the datasets and the dataloaders
# Training samples are augmented on the fly; validation samples are not.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two separately shuffled loaders over the same training set: the training
# loop zips them to pair each batch with a second batch for mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when CUDA is available and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Call ``layer.reset_parameters()`` when the layer defines it.

    Objects without a ``reset_parameters`` attribute are left untouched, so
    the function can be handed to ``Module.apply`` to visit a whole module
    tree. Returns ``None``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default for until_x, the highest index of the MobileNetV2 feature blocks
# that Task5Model re-initialises; a later cell overrides it.
until_x = None

In [12]:
# Parameters
# Overrides the default above for this run: feature blocks 0..7 are reset.
# NOTE(review): looks like an injected parameters cell (e.g. papermill) - confirm.
until_x = 7


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor with a single-channel front end.

    The input first passes through ``bw2col`` (batch norm followed by two
    1x1 convolutions, 1 -> 10 -> 3 channels), then through the MobileNetV2
    feature blocks. The feature map is reduced by a max over its last two
    dimensions and fed to a two-layer classification head that returns
    ``num_classes`` raw logits.

    The module-level ``until_x`` selects which pretrained feature blocks are
    re-initialised: blocks with index ``<= until_x`` are reset through
    ``weight_reset``. ``until_x = None`` keeps every pretrained block.
    """

    def __init__(self, num_classes):

        super().__init__()

        # Maps the single input channel to the 3 channels MobileNetV2 takes.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # 1280 is the input width of the head, i.e. the channel count it
        # expects from the reduced feature map.
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2. Comparing an int with None raises
        # TypeError, so a None threshold is treated as "reset nothing".
        if until_x is not None:
            for i, x in enumerate(self.mv2.features.children()):
                if i <= until_x:
                    x.apply(weight_reset)

    def forward(self, x):
        """Return raw logits with ``num_classes`` values per input sample."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Max over the last two dimensions of the feature map.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# 31 output logits. NOTE(review): presumably one per known label - confirm
# against the label columns produced by prepare_data.
model = Task5Model(num_classes=31)
model = model.to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# The loss is left unreduced (reduction='none') so the training loop can
# weight every element by its mask before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')
optimizer = optim.Adam(params=model.parameters(), lr=0.001, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)

In [16]:
def masked_bce_loss(criterion, outputs, labels, masks):
    """Masked loss averaged over every annotation set.

    ``criterion`` must return an element-wise (unreduced) loss. ``labels``
    and ``masks`` hold one slice per annotation set along their last axis;
    each slice is scored against the same ``outputs`` and multiplied by its
    mask. The summed result is divided by the total mask weight.
    """
    total = sum(
        (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs without a new best validation loss
alpha = 1  # mixup coefficients are drawn from Beta(alpha, alpha)
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    # Each batch from the first loader is mixed with a batch from the
    # second, separately shuffled loader.
    for i1, i2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One coefficient per sample, shared by its input, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])
        lam = torch.Tensor(mixup_vals)

        lam_inputs = lam.reshape(-1, 1, 1, 1)
        inputs = (lam_inputs * i1[0]) + ((1 - lam_inputs) * i2[0])

        lam_targets = lam.reshape(-1, 1, 1)
        labels = (lam_targets * i1[1]) + ((1 - lam_targets) * i2[1])
        masks = (lam_targets * i1[2]) + ((1 - lam_targets) * i2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_bce_loss(criterion, outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no augmentation, no gradients) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_bce_loss(criterion, outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Early stopping.
    if epochs_without_new_lowest >= early_stopping_patience:
        break

    print(this_epoch_train_loss, this_epoch_valid_loss)

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6348504859047968 0.5110226103237697
Epoch:  1


0.3344506915356662 0.20847685847963607
Epoch:  2


0.1913898466406642 0.17093892395496368
Epoch:  3


0.18017729271102595 0.17849116453102656
Epoch:  4


0.1772361961570946 0.1590660661458969
Epoch:  5


0.17405710308938413 0.18399146412100112
Epoch:  6


0.17298342488907478 0.17556593886443547
Epoch:  7


0.17003079765551798 0.15580560054097856
Epoch:  8


0.17102567811270017 0.15902713792664663
Epoch:  9


0.17036340123898275 0.15739028368677413
Epoch:  10


0.16811445355415344 0.16352214770657675
Epoch:  11


0.16788517059506597 0.149364567228726
Epoch:  12


0.16662558349403175 0.15345988741942815
Epoch:  13


0.16491212514606682 0.1521554695708411
Epoch:  14


0.16397421706367182 0.1422085708805493
Epoch:  15


0.16243717678495356 0.14057928217308863
Epoch:  16


0.16397860565701047 0.13849191686936788
Epoch:  17


0.1619890976596523 0.13722462845700129
Epoch:  18


0.1605614923142098 0.13658290462834494
Epoch:  19


0.1606710444431047 0.1393335195524352
Epoch:  20


0.1592214506220173 0.14023538998195104
Epoch:  21


0.15757648526011286 0.13366322538682393
Epoch:  22


0.15878549822278926 0.1330916083284787
Epoch:  23


0.15813349106827299 0.1375461838075093
Epoch:  24


0.15728005323861097 0.13322167098522186
Epoch:  25


0.15801629424095154 0.1305418461561203
Epoch:  26


0.15824349103747187 0.13718092973743165
Epoch:  27


0.1584847605711705 0.13166054444653646
Epoch:  28


0.15768692219579541 0.14146648879562104
Epoch:  29


0.15703822068265966 0.1318842044898442
Epoch:  30


0.15625634547826406 0.13308104979140417
Epoch:  31


0.15490227696057912 0.13708530153547013
Epoch    31: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  32


0.15483477308943466 0.12841601031167166
Epoch:  33


0.1534857480107127 0.1276309745652335
Epoch:  34


0.15290104658217044 0.12707484726394927
Epoch:  35


0.15266125387436635 0.12694465156112397
Epoch:  36


0.15163551713969256 0.12815568383250917
Epoch:  37


0.15323568518097336 0.12772526166268758
Epoch:  38


0.15301578350969264 0.1275932246020862
Epoch:  39


0.15212366508470998 0.12707832242761338
Epoch:  40


0.15236859627672145 0.1266278177499771
Epoch:  41


0.15168298619824486 0.12625588583094732
Epoch:  42


0.15226111943657333 0.1263203461255346
Epoch:  43


0.1516716653430784 0.1262370922735759
Epoch:  44


0.15164734342613737 0.12555961630174092
Epoch:  45


0.1500966629466495 0.12647088723523275
Epoch:  46


0.15148201061261668 0.12651811007942473
Epoch:  47


0.15137752326759132 0.12635848351887294
Epoch:  48


0.15071287791471225 0.1256606536252158
Epoch:  49


0.15124565805937792 0.12640002369880676
Epoch:  50


0.1508470725368809 0.1258073757801737
Epoch    50: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  51


0.15112907217966542 0.12625332389559066
Epoch:  52


0.15054930545188286 0.12586708260434015
Epoch:  53


0.15182979787523682 0.12599406710692815
Epoch:  54


0.15215434899201263 0.12593855495963777
Epoch:  55


0.15146922823545095 0.1257338970899582
Epoch:  56


0.15045325095589096 0.12577411000217711
Epoch    56: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  57


0.14996094840603905 0.12598658140216554
Epoch:  58


0.15105571940138535 0.12607658015830175
Epoch:  59


0.14976395586052457 0.12583396690232412
Epoch:  60


0.15000954512003306 0.12572454341820308
Epoch:  61


0.15049807002415527 0.1258037824715887
Epoch:  62


0.15026717492052027 0.12634452219520295
Epoch    62: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  63


0.15151274486168012 0.12546826047556742
Epoch:  64


0.15076629898032626 0.1257748305797577
Epoch:  65


0.14995578857692513 0.12585164606571198
Epoch:  66


0.15172542551079313 0.12572793981858663
Epoch:  67


0.14961629662964795 0.1258906881724085
Epoch:  68


0.15071546870308952 0.12588831888777868
Epoch:  69


0.150916342799728 0.12580100340502604
Epoch    69: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  70


0.150666976699958 0.12554990287337983
Epoch:  71


0.15020982155928742 0.12609363240855082
Epoch:  72


0.15210832212422346 0.1260129458137921
Epoch:  73


0.15040832274668925 0.12611050690923417
Epoch:  74


0.15074381312808474 0.12585767252104624
Epoch:  75


0.15050110261182528 0.12605895634208406
Epoch:  76


0.15291191637516022 0.12568807069744384
Epoch:  77


0.15079702235556938 0.12572601437568665
Epoch:  78


0.15202232389836698 0.1260014208299773
Epoch:  79


0.15049154170461604 0.12548095413616725
Epoch:  80


0.15061181219848427 0.12600593907492502
Epoch:  81


0.15088319657622157 0.1256702999983515
Epoch:  82


0.15133814272042867 0.12606418558529445
Epoch:  83


0.15144483785371524 0.12584575372082846
Epoch:  84


0.15128888793893763 0.12573924022061483
Epoch:  85


0.15072209609521403 0.12568917338337218
Epoch:  86


0.15068656769958702 0.12588890961238317
Epoch:  87


0.1499131366207793 0.12544035485812596
Epoch:  88


0.151344657347009 0.12578489312103816
Epoch:  89


0.15071296772441348 0.12581675393240793
Epoch:  90


0.1486251656268094 0.12610231659242085
Epoch:  91


0.15071709534606417 0.12556040180580957
Epoch:  92


0.151164015402665 0.12597153655120305
Epoch:  93


0.14989950447469144 0.1256260031035968
Epoch:  94


0.15096068140622732 0.12561257715736115
Epoch:  95


0.1514374432531563 0.12538304499217443
Epoch:  96


0.15057968891955711 0.12553710490465164
Epoch:  97


0.15204822292199005 0.12573848239013127
Epoch:  98


0.15041185955743533 0.12616967196975434
Epoch:  99


0.15090915681542577 0.12592329510620662
