In [1]:
# Parameters
# `until_x` is read by Task5Model.__init__: MobileNetV2 feature blocks whose
# position is greater than `until_x` get their parameters re-initialised.
# It is also embedded in the checkpoint filename written by the training loop.
# NOTE(review): a lone "# Parameters" cell looks like an injected-parameters
# cell of a parameterised run (e.g. papermill) - confirm.
until_x = 1


In [2]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [3]:
def prepare_data(df, unknown_to_known, spec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, targets and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column; every other column is treated as a label.
        Rows of the same recording are numbered 1, 2, ... in their row order;
        rows numbered above ``n_annotations`` are ignored, and a recording
        with fewer rows makes the stacking below fail.
    unknown_to_known : dict
        Maps each "unknown" label column to the list of label columns whose
        targets must be ignored whenever that unknown label is set (> 0.5).
        The unknown columns themselves are removed from the targets.
    spec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per recording.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotations : int, optional
        Number of annotation slots stacked on the last axis of ``y``.

    Returns
    -------
    X : numpy.ndarray
        Loaded arrays, shape ``(n_recordings, 1, <=n_frames, n_features)``.
    y : numpy.ndarray
        Targets, shape ``(n_recordings, n_labels, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the target is used, 0 where it is ignored.

    All three outputs are ordered by sorted ``audio_filename``.
    """
    unknown_cols = list(unknown_to_known.keys())

    # Number the annotations of every recording: 1, 2, 3, ...
    df = df.reset_index()
    df['slno'] = df.groupby('audio_filename').cumcount() + 1
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" indicator columns away from the real targets.
    df_unknown = df.loc[:, unknown_cols]
    df = df.drop(columns=unknown_cols)

    # Start with every target enabled, then switch off the known labels that
    # an annotator covered with the corresponding "unknown" label.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so one annotation slot can be selected
    # at a time, with recordings in the same sorted order for every slot.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    slots = range(1, n_annotations + 1)
    y = np.concatenate(
        [df.loc[[slot], :].values[..., np.newaxis] for slot in slots],
        axis=2)
    y_mask = np.concatenate(
        [y_mask.loc[[slot], :].values[..., np.newaxis] for slot in slots],
        axis=2)

    # Load the arrays in the same recording order that y / y_mask use.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.concatenate([
        np.expand_dims(
            np.load('{}/{}.npy'.format(spec_dir, name)).T[:n_frames, :],
            axis=0)
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (default arguments); AudioDataset.__getitem__
# applies it to every sample that goes through the augmentation branch.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset yielding ``(sample, target, weight)`` triples.

    ``X``, ``y`` and ``weights`` are indexed along their first axis. When a
    ``transform`` is supplied, every sample is augmented on access; otherwise
    it is returned exactly as stored.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X, self.y, self.weights = X, y, weights
        self.transform = transform
        # Converter used to hand samples to the image-based transform.
        self.pil = transforms.ToPILImage()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        features = self.X[idx, ...]
        if self.transform:
            features = self._augment(features)
        return features, self.y[idx, ...], self.weights[idx, ...]

    def _augment(self, features):
        """Apply the random augmentation chain to one 3-D sample."""
        # Min-max scale; the original range is restored at the very end.
        lowest = features.min()
        highest = features.max()
        scaled = (features - lowest) / (highest - lowest)

        # Cyclic shift along axis 1 by a random offset.
        offset = np.random.randint(scaled.shape[1])
        head = scaled[:, offset:, :]
        tail = scaled[:, :offset, :]
        rolled = torch.cat([head, tail], dim=1)

        # Round-trip through a PIL image so the image transform can be used.
        as_image = np.array(self.pil(rolled))
        augmented = self.transform(image=as_image)['image']
        # Add a leading axis and swap the last two axes.
        # NOTE(review): presumably restores the stored (1, frames, bins)
        # layout after the transform's tensor conversion - confirm.
        augmented = augmented[None, :, :].permute(0, 2, 1)

        # Random erasing uses the module-level `random_erasing` instance.
        augmented = random_erasing(augmented.clone().detach())

        # Undo the min-max scaling.
        return (augmented * (highest - lowest)) + lowest

In [4]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code - only open a metadata file
# that comes from a trusted source.
with open('../../data/metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Split the taxonomy into the "unknown" fine labels (fine_id == 'X') and the
# regular ones.
taxonomy = metadata['taxonomy_df']
unknown_rows = taxonomy.loc[taxonomy.fine_id == 'X', ['fine', 'coarse']]
known_rows = taxonomy.loc[taxonomy.fine_id != 'X', ['fine', 'coarse']]

# Pair every unknown label with the known fine labels of the same coarse
# class: {unknown fine label: [known fine labels]}.
label_pairs = (
    unknown_rows
    .merge(known_rows, on='coarse', how='inner')
    .drop(columns='coarse'))
unknown_to_known = label_pairs.groupby('fine_x')['fine_y'].apply(list).to_dict()
known_labels = known_rows.fine.tolist()

# Coarse and fine annotation tables side by side (columns concatenated).
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [5]:
# manual correction for one data point
# Any row whose labels sum to 37 is blanked out (all labels set to 0).
train_df.loc[train_df.sum(axis=1).eq(37), :] = 0
valid_df.loc[valid_df.sum(axis=1).eq(37), :] = 0

In [6]:
# Each call returns (X, y, y_mask): the arrays loaded from the log-mel .npy
# files, the per-annotation targets, and the matching 0/1 masks that switch
# off known labels covered by an "unknown" label.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [7]:
# Channel wise normalization
# One mean / std per position on the last axis of train_X, computed over all
# other axes. The channel count is taken from the data instead of being
# hard-coded. Statistics come from the training set only and are reused
# unchanged for the validation set.
channel_means = (train_X.reshape(-1, train_X.shape[-1])
                 .mean(axis=0).reshape(1, 1, 1, -1))
channel_stds = (train_X.reshape(-1, train_X.shape[-1])
                .std(axis=0).reshape(1, 1, 1, -1))
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [8]:
# Define the data augmentation transformations
# Applied in this order: shift/scale/rotate jitter (limits as passed below),
# grid distortion with default settings, then conversion to a tensor.
shift_scale_rotate = ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5)
grid_distortion = GridDistortion()
albumentations_transform = Compose([shift_scale_rotate, grid_distortion, ToTensor()])

In [9]:
# Create the datasets and the dataloaders
# Only the training set is augmented; validation samples are served as stored.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform,
)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None,
)

val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
# Two independently shuffled loaders over the same training set: the training
# loop zips them to obtain the pairs of batches that are mixed together.
train_loader_1 = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
# Define the device to be used
# `cuda` is the user's request. The GPU is selected only when one is actually
# usable, so a CPU-only machine falls back to the CPU instead of failing when
# the model and tensors are moved to an unavailable device.
cuda = True
device = torch.device('cuda:0' if cuda and torch.cuda.is_available() else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
def weight_reset(layer):
    """Call ``layer.reset_parameters()`` when the layer provides it.

    Layers without a ``reset_parameters`` attribute are left untouched, which
    makes the function safe to pass to ``Module.apply``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [12]:
class Task5Model(nn.Module):
    """MobileNetV2 feature extractor with a single-channel adapter and a dense head.

    * ``bw2col``: batch norm on the single input channel, then two 1x1
      convolutions (1 -> 10 -> 3 channels), each followed by ReLU.
    * ``mv2``: torchvision MobileNetV2 created with ``pretrained=True``; only
      its ``features`` part is used in ``forward``.
    * ``final``: Linear(1280, 512) -> ReLU -> BatchNorm1d -> Linear(512, num_classes).

    The constructor reads the module-level ``until_x``: every child of
    ``mv2.features`` at a position greater than ``until_x`` has
    ``weight_reset`` applied to it.
    """

    def __init__(self, num_classes):
        super().__init__()

        adapter_layers = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0),
            nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0),
            nn.ReLU(),
        ]
        self.bw2col = nn.Sequential(*adapter_layers)

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        head_layers = [
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, num_classes),
        ]
        self.final = nn.Sequential(*head_layers)

        # Keep the weights of the first blocks, re-initialise the rest.
        for position, block in enumerate(self.mv2.features.children()):
            if position <= until_x:
                continue
            block.apply(weight_reset)

    def forward(self, x):
        """Return one value per class for a batch of single-channel inputs."""
        feature_map = self.mv2.features(self.bw2col(x))
        # Global max pooling: reduce the last axis, then the new last axis.
        pooled, _ = feature_map.max(dim=-1)
        pooled, _ = pooled.max(dim=-1)
        return self.final(pooled)

In [13]:
# Instantiate the model
# 31 is the number of output logits. The training loop compares the outputs
# element-wise with labels[:, :, k], so this has to equal the label dimension
# of the targets returned by prepare_data.
model = Task5Model(31).to(device)

In [14]:
# Define optimizer, scheduler and loss criteria
# Adam with amsgrad=True over every model parameter.
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
# Stepped once per epoch with the validation loss in the training loop.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
# reduction='none' keeps one loss value per element so the training loop can
# multiply by the mask before summing.
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
epochs = 100
mixup_alpha = 1               # Beta(alpha, alpha) parameter of the mixup weights
early_stopping_patience = 25  # epochs allowed without a new best validation loss
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked element-wise loss averaged over the unmasked targets.

    ``labels`` and ``masks`` hold one slice per annotation set on their last
    axis. ``outputs`` is scored against every slice with the module-level
    ``criterion``, each element is weighted by its mask value, and the sum is
    divided by the sum of all mask values.
    """
    total = 0
    for k in range(labels.shape[-1]):
        total = total + (criterion(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
    return total / masks.sum()


for i in range(epochs):
    print('Epoch: ', i)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for (x1, y1, m1), (x2, y2, m2) in zip(train_loader_1, train_loader_2):

        # mixup: blend two independently shuffled batches with one random
        # weight per sample; the same weight is used for inputs, labels
        # and masks.
        mixup_vals = np.random.beta(mixup_alpha, mixup_alpha, x1.shape[0])

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1, 1))
        inputs = (lam * x1) + ((1 - lam) * x2)

        lam = torch.Tensor(mixup_vals.reshape(-1, 1, 1))
        labels = (lam * y1) + ((1 - lam) * y2)
        masks = (lam * m1) + ((1 - lam) * m2)

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            loss.backward()
            optimizer.step()
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no gradients, no parameter updates) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.item()

    # Epoch losses are the mean of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Checkpoint on every new best validation loss.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    # Report before the early-stopping check so the last epoch is logged too.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6096840033660064 0.4347627205508096
Epoch:  1


0.28122948230923833 0.19457894137927464
Epoch:  2


0.188349467274305 0.17937234044075012
Epoch:  3


0.18264487748210495 0.1786500619990485
Epoch:  4


0.17924129600460464 0.17253247116293227
Epoch:  5


0.17915515359994527 0.1794178741318839
Epoch:  6


0.17856094563329541 0.17104316822120122
Epoch:  7


0.1783449976025401 0.19380971150738852
Epoch:  8


0.17882912102583293 0.17745740711688995
Epoch:  9


0.1758921122228777 0.1659974136522838
Epoch:  10


0.1764471784636781 0.1879473328590393
Epoch:  11


0.17502904663214813 0.17105343725000108
Epoch:  12


0.17397088376251427 0.1797159697328295
Epoch:  13


0.17418373235174128 0.1684126853942871
Epoch:  14


0.17405469473954793 0.15945794965539659
Epoch:  15


0.17351112695964607 0.15624019929340907
Epoch:  16


0.17325393050103574 0.15607797460896627
Epoch:  17


0.1724407233901926 0.15937558668000357
Epoch:  18


0.17206571392110875 0.15654551770005906
Epoch:  19


0.1705049268297247 0.15410983775343215
Epoch:  20


0.17058108706732053 0.15593686699867249
Epoch:  21


0.17067739005024368 0.16266719784055436
Epoch:  22


0.16953980882425565 0.1515347020966666
Epoch:  23


0.170414336629816 0.15941948762961797
Epoch:  24


0.16788130233416687 0.15975368235792434
Epoch:  25


0.16721038802250013 0.15475226938724518
Epoch:  26


0.16652169823646545 0.15050205375467027
Epoch:  27


0.16437872398544 0.15296868554183415
Epoch:  28


0.1638721372630145 0.146574610045978
Epoch:  29


0.16290265684192246 0.15050197286265238
Epoch:  30


0.16185360865013018 0.14377717673778534
Epoch:  31


0.16058949401249756 0.1425823239343507
Epoch:  32


0.16237864462105003 0.14187711903027125
Epoch:  33


0.16002980680079074 0.13640402470316207
Epoch:  34


0.16020250400981387 0.14652831852436066
Epoch:  35


0.16043833663334717 0.13671295877013886
Epoch:  36


0.15939354574358142 0.1395552626677922
Epoch:  37


0.1590863740121996 0.13742399322135107
Epoch:  38


0.1576812613654781 0.13936509511300496
Epoch:  39


0.15747851173619967 0.13512307405471802
Epoch:  40


0.15683168011742668 0.14468692668846675
Epoch:  41


0.1572898143046611 0.13449715397187642
Epoch:  42


0.1572028338103681 0.13264333776065282
Epoch:  43


0.1571676916367299 0.13319835598979676
Epoch:  44


0.15583948629933433 0.13595567324331828
Epoch:  45


0.15671271327379588 0.130543124462877
Epoch:  46


0.15601211383536057 0.13306522050074168
Epoch:  47


0.15634497275223602 0.13271608735833848
Epoch:  48


0.15497344128183416 0.1303201573235648
Epoch:  49


0.1554342451933268 0.13164207339286804
Epoch:  50


0.1545681212399457 0.1326659992337227
Epoch:  51


0.15410155621734825 0.13177963771990367
Epoch:  52


0.15420914542030645 0.129536435008049
Epoch:  53


0.15455421926202 0.1349089198878833
Epoch:  54


0.1536245140674952 0.13531503613506043
Epoch:  55


0.15445987115035187 0.1306070483156613
Epoch:  56


0.15359561588313128 0.13366135316235678
Epoch:  57


0.15261081988747055 0.13070878492934362
Epoch:  58


0.15342267340904958 0.12921056577137538
Epoch:  59


0.15188836648657517 0.1287767801965986
Epoch:  60


0.15131195976927475 0.13078122586011887
Epoch:  61


0.15282850692401062 0.1311687763248171
Epoch:  62


0.1519627663734797 0.13035968158926284
Epoch:  63


0.15202893477839394 0.13235856273344584
Epoch:  64


0.1514287816511618 0.13005093804427556
Epoch:  65


0.15153657463756767 0.12916947369064605
Epoch    65: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  66


0.15133642667048686 0.1265719926783017
Epoch:  67


0.14958189669493083 0.12598934131009237
Epoch:  68


0.149485693187327 0.12531580669539316
Epoch:  69


0.15090756561305071 0.12561102424349105
Epoch:  70


0.1497658068263853 0.1251410905803953
Epoch:  71


0.14943502803106565 0.12535597277539118
Epoch:  72


0.14869507220951286 0.1251827978662082
Epoch:  73


0.1488006662678074 0.12546661176851817
Epoch:  74


0.14796163625008352 0.12469343841075897
Epoch:  75


0.14897782697870926 0.1250358766743115
Epoch:  76


0.14974472087782784 0.12493208582912173
Epoch:  77


0.15017494196827347 0.12459979206323624
Epoch:  78


0.14850080738196503 0.12477053276130132
Epoch:  79


0.14815725909697042 0.12516514318329947
Epoch:  80


0.14844887159966133 0.12501208590609686
Epoch:  81


0.1496426024146982 0.12506691898618424
Epoch:  82


0.14901702267092629 0.12453841418027878
Epoch:  83


0.1476768863362235 0.12509650737047195
Epoch:  84


0.1488466874973194 0.1244655389870916
Epoch:  85


0.14744682126754038 0.12526434234210423
Epoch:  86


0.14929577627697507 0.12495983285563332
Epoch:  87


0.1459231505522857 0.12424485704728536
Epoch:  88


0.14728825599760623 0.12503717839717865
Epoch:  89


0.14785543809065949 0.12492692683424268
Epoch:  90


0.14768743112280563 0.12435051798820496
Epoch:  91


0.1479467810005755 0.12483515696866172
Epoch:  92


0.1485911035859907 0.12441246850149972
Epoch:  93


0.14781072171958717 0.12459837006671089
Epoch    93: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  94


0.14848790015723254 0.12456241675785609
Epoch:  95


0.148169000406523 0.12437435984611511
Epoch:  96


0.14705133518657168 0.12427133747509547
Epoch:  97


0.14852950782389254 0.12444137781858444
Epoch:  98


0.14702633305175886 0.12433787648166929
Epoch:  99


0.1481344643476847 0.12438428508383888
Epoch    99: reducing learning rate of group 0 to 1.0000e-06.
