In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor



In [2]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotations=3):
    """Build model inputs, per-annotation labels and loss masks from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table whose index is named ``audio_filename``, with one row
        per annotation of a file. Its columns are the label columns, including
        every key of ``unknown_to_known``. Rows of the same file are numbered
        1, 2, ... in their order of appearance; only the first
        ``n_annotations`` rows of each file are used, and every file must have
        at least that many rows (otherwise stacking fails with a ValueError).
    unknown_to_known : dict
        Maps each "unknown" column name to the list of known column names it
        makes uncertain.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
        # assumes every stored array has at least n_frames columns - TODO confirm
    n_annotations : int, optional
        Number of annotation rows per file stacked along the last axis.

    Returns
    -------
    X : numpy.ndarray
        Stacked arrays, shape ``(files, 1, n_frames, features)``.
    y : numpy.ndarray
        Label values, shape ``(files, known_columns, n_annotations)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 0 where the matching "unknown" column of that
        annotation row is above 0.5, 1 elsewhere.
    """
    unknown_columns = list(unknown_to_known.keys())

    # Number the annotation rows of each file (1, 2, ...) and index by both.
    df = df.reset_index()
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split the "unknown" columns off; only the known columns become targets.
    df_unknown = df.loc[:, unknown_columns].copy()
    df = df.drop(columns=unknown_columns)

    # Start from an all-ones mask and switch off the known labels of every
    # annotation row that flagged the corresponding "unknown" column.
    y_mask = df.copy()
    y_mask.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        y_mask.loc[df_unknown[unknown] > 0.5, known] = 0

    # Put the annotation number first so each annotation can be sliced out
    # with rows sorted consistently by file name.
    df = df.swaplevel(i=1, j=0, axis=0).sort_index()
    y_mask = y_mask.swaplevel(i=1, j=0, axis=0).sort_index()

    annotation_ids = range(1, n_annotations + 1)
    y = np.stack([df.loc[[k], :].values for k in annotation_ids], axis=2)
    y_mask = np.stack([y_mask.loc[[k], :].values for k in annotation_ids], axis=2)

    # File order is taken from the first annotation slice, matching y's rows.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Shared RandomErasing instance (constructed with its default arguments);
# AudioDataset.__getitem__ applies it to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with per-annotation labels and loss weights.

    When ``transform`` is given, each item is augmented on the fly: min-max
    scaling, a random circular shift along axis 1, the albumentations-style
    ``transform``, random erasing, and finally the inverse of the scaling.
    Without a transform the stored sample is returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        """Store the tensors and the optional augmentation callable.

        Parameters
        ----------
        X : torch.Tensor
            Samples, indexed along the first axis.
            # assumes each sample is (1, frames, features) - TODO confirm
        y : torch.Tensor
            Labels, indexed along the first axis like ``X``.
        weights : torch.Tensor
            Per-label weights/masks, indexed along the first axis like ``X``.
        transform : callable or None
            Called as ``transform(image=array)`` and expected to return a
            mapping with an ``'image'`` entry.
        """
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, labels, weights)`` for ``idx``, augmented if configured."""
        sample = self.X[idx, ...]

        if self.transform:
            # min-max transformation
            this_min = sample.min()
            this_max = sample.max()
            value_range = this_max - this_min
            # A constant sample has zero range; dividing by it would turn the
            # whole item into NaNs, so fall back to a neutral scale of 1.
            if value_range == 0:
                value_range = torch.ones_like(value_range)
            sample = (sample - this_min) / value_range

            # randomly cycle the file: rotate axis 1 by a random offset
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they take a NumPy image)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)['image']
            # add a leading axis and swap the last two axes of the result
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * value_range) + this_min

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load the pickled metadata bundle and derive the label bookkeeping.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# Fine-level taxonomy rows split into "unknown" entries (fine_id == 'X')
# and all remaining, known entries.
taxonomy_df = metadata['taxonomy_df']
unknown_fine = taxonomy_df.loc[taxonomy_df.fine_id == 'X', ['fine', 'coarse']]
known_fine = taxonomy_df.loc[taxonomy_df.fine_id != 'X', ['fine', 'coarse']]

# For every unknown fine label, list the known fine labels that share its
# coarse category.
unknown_to_known = (
    unknown_fine.merge(known_fine, on='coarse', how='inner')
    .drop(columns='coarse')
    .groupby('fine_x')['fine_y']
    .apply(lambda names: list(names))
    .to_dict())
known_labels = known_fine.fine.tolist()

# Coarse and fine annotation tables placed side by side, per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']],
    axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']],
    axis=1, sort=True)

In [4]:
# manual correction for one data point: any row whose values add up to
# exactly 37 is zeroed out, in both splits
for annotations in (train_df, valid_df):
    row_totals = annotations.sum(axis=1)
    annotations.loc[row_totals == 37, :] = 0

In [5]:
# Build inputs (X), stacked labels (y) and loss masks for the training and
# validation splits, in that order.
(train_X, train_y, train_y_mask), (valid_X, valid_y, valid_y_mask) = (
    prepare_data(split_df, unknown_to_known)
    for split_df in (train_df, valid_df))

In [6]:
# Channel wise normalization: mean and standard deviation are computed on the
# training data only, one value per position of the last (size-128) axis, and
# then applied to both splits.
channel_means, channel_stds = (
    statistic(train_X.reshape(-1, 128), axis=0).reshape(1, 1, 1, -1)
    for statistic in (np.mean, np.std))
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Data augmentation pipeline handed to the training dataset: a small random
# shift/scale/rotation, a grid distortion, then conversion to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Wrap the NumPy arrays as tensors; only the training dataset is augmented.
train_dataset = AudioDataset(
    *(torch.Tensor(array) for array in (train_X, train_y, train_y_mask)),
    albumentations_transform)
valid_dataset = AudioDataset(
    *(torch.Tensor(array) for array in (valid_X, valid_y, valid_y_mask)),
    None)

# Validation is read in a fixed order. The training set gets two separately
# shuffled loaders, which the training loop zips together to pair up batches
# for mixup.
val_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
train_loader_1, train_loader_2 = (
    DataLoader(train_dataset, batch_size=64, shuffle=True) for _ in range(2))

In [9]:
# Define the device to be used: the first GPU when CUDA is available,
# otherwise the CPU, so the notebook also runs on machines without a GPU.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place when it offers ``reset_parameters``.

    Objects without that attribute are left untouched, which makes the
    function suitable for ``module.apply(weight_reset)``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Default value of `until_x`; it is reassigned in the "# Parameters" cell below.
until_x = None

In [12]:
# Parameters
# `until_x` is read by Task5Model.__init__: children of `mv2.features` whose
# index is <= until_x get their weights re-initialised. It is also part of
# the checkpoint filename written by the training loop.
until_x = 17


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2-based classifier for single-channel inputs.

    The network has three parts:

    * ``bw2col``: batch norm followed by two 1x1 convolutions that turn the
      single input channel into the 3 channels fed to MobileNetV2.
    * ``mv2``: a pretrained torchvision MobileNetV2; only its ``features``
      part is used in ``forward``.
    * ``final``: a small fully connected head producing ``num_classes`` logits.

    On construction, the leading ``mv2.features`` blocks up to and including
    index ``until_x`` (a module-level setting) are re-initialised.
    """

    def __init__(self, num_classes):
        """Build the layers and reset the leading MobileNetV2 feature blocks.

        Parameters
        ----------
        num_classes : int
            Number of output logits.
        """

        super().__init__()

        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        # Reset until ith layer of mv2
        self._reset_leading_blocks(self.mv2.features.children(), until_x)

    @staticmethod
    def _reset_leading_blocks(blocks, last_index):
        """Apply ``weight_reset`` to ``blocks[0..last_index]`` (inclusive).

        ``last_index=None`` keeps every block as it is, instead of failing on
        the ``int <= None`` comparison.
        """
        if last_index is None:
            return
        for index, block in enumerate(blocks):
            if index <= last_index:
                block.apply(weight_reset)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for input ``x``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling: reduce the last two axes one after the other.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model with 31 output logits and move it to the selected device
model = Task5Model(31).to(device)

In [15]:
# Optimizer, learning-rate scheduler and loss.
# The scheduler is stepped with the validation loss in the training loop.
# The loss keeps one value per element (reduction='none') because the
# training loop multiplies it by the masks before summing.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)
criterion = nn.BCEWithLogitsLoss(reduction='none')

In [16]:
def masked_annotation_loss(outputs, labels, masks, loss_fn):
    """Masked loss averaged over every unmasked (sample, class, annotation) entry.

    ``outputs`` is compared with each slice ``labels[:, :, k]`` using the
    element-wise ``loss_fn``; each term is multiplied by ``masks[:, :, k]``
    and the grand total is divided by ``masks.sum()``.
    """
    total = sum(
        (loss_fn(outputs, labels[:, :, k]) * masks[:, :, k]).sum()
        for k in range(labels.shape[2]))
    return total / masks.sum()


epochs = 100
early_stopping_patience = 25  # epochs allowed without a new best validation loss
mixup_alpha = 1  # both shape parameters of the Beta distribution used for mixup
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0

for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0.0
    for batch_a, batch_b in zip(train_loader_1, train_loader_2):

        # mixup: blend two separately shuffled batches with one coefficient
        # per sample, applied identically to inputs, labels and masks
        mixup_vals = torch.Tensor(
            np.random.beta(mixup_alpha, mixup_alpha, batch_a[0].shape[0]))

        lam = mixup_vals.view(-1, 1, 1, 1)
        inputs = (lam * batch_a[0]) + ((1 - lam) * batch_b[0])

        lam = mixup_vals.view(-1, 1, 1)
        labels = (lam * batch_a[1]) + ((1 - lam) * batch_b[1])
        masks = (lam * batch_a[2]) + ((1 - lam) * batch_b[2])

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks, criterion)
        loss.backward()
        optimizer.step()
        # .item() gives a plain Python float for the running total
        this_epoch_train_loss += loss.item()

    # ---- validation pass (no gradients, no augmentation, no mixup) ----
    model.eval()
    this_epoch_valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks, criterion)
            this_epoch_valid_loss += loss.item()

    # Average of the per-batch losses
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before the early-stopping check so the final epoch is logged too
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Keep only the checkpoint with the lowest validation loss so far
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6224274095651265 0.4819712383406503
Epoch:  1


0.30866723648599675 0.21016993267195566
Epoch:  2


0.18897398903563217 0.1817486435174942
Epoch:  3


0.17894712976507238 0.16669157147407532
Epoch:  4


0.17720032825663284 0.2011821504150118
Epoch:  5


0.1753787011713595 0.17093045370919363
Epoch:  6


0.17372544915289492 0.17293491746698106
Epoch:  7


0.173271133690267 0.16871748013155802
Epoch:  8


0.17282033691535126 2.4743124076298306
Epoch:  9


0.1714789130397745 0.26623349104608807
Epoch     9: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  10


0.17053597279497096 0.15360685757228307
Epoch:  11


0.16924964213693464 0.15370434522628784
Epoch:  12


0.167319095618016 0.1543564008814948
Epoch:  13


0.1685340501166679 0.1509353071451187
Epoch:  14


0.1676776022524447 0.15135729951517923
Epoch:  15


0.16783551227402044 0.14887047665459768
Epoch:  16


0.16731909924262278 0.15048328042030334
Epoch:  17


0.16826282240249016 0.15049290231295995
Epoch:  18


0.16659850405680166 0.14893529457705362
Epoch:  19


0.16610692038729385 0.14898997119494847
Epoch:  20


0.1661109110793552 0.14782509633472987
Epoch:  21


0.16634944725680995 0.1490260192326137
Epoch:  22


0.16612899383983096 0.14589060417243413
Epoch:  23


0.16582010565577326 0.14836482916559493
Epoch:  24


0.16565139672240695 0.14868482947349548
Epoch:  25


0.16596682732169693 0.14640794907297408
Epoch:  26


0.16403689658319628 0.14642884262970515
Epoch:  27


0.16537285012167854 0.14507436965193068
Epoch:  28


0.16463280006034955 0.1455770539385932
Epoch:  29


0.16534416256724177 0.14453663996287755
Epoch:  30


0.1652916822884534 0.14419304792370116
Epoch:  31


0.1647383880776328 0.14221047503607614
Epoch:  32


0.1652649415505899 0.14425544334309442
Epoch:  33


0.16309934411499952 0.1430798822215625
Epoch:  34


0.1634741256365905 0.14296170962708338
Epoch:  35


0.16564055109346235 0.14225252398422786
Epoch:  36


0.16397656097605423 0.1427587792277336
Epoch:  37


0.1632735535905168 0.14177907471145904
Epoch:  38


0.16408043294339567 0.14367627671786717
Epoch:  39


0.1633898233239715 0.14213383623531886
Epoch:  40


0.16338412302571373 0.14175808003970555
Epoch:  41


0.16312500230363897 0.14081387966871262
Epoch:  42


0.16202446576711294 0.14096085407904216
Epoch:  43


0.16342482131880684 0.14263457911355154
Epoch:  44


0.16226900026604935 0.14130218220608576
Epoch:  45


0.16185781520766182 0.13982199451753072
Epoch:  46


0.16107803583145142 0.1417421634708132
Epoch:  47


0.16224596709818453 0.1404569074511528
Epoch:  48


0.16073933002111074 0.13995353558233806
Epoch:  49


0.1621279434577839 0.14103152070726668
Epoch:  50


0.16262889190300092 0.13900219223329
Epoch:  51


0.1620038292697958 0.1402000422988619
Epoch:  52


0.16148180454163938 0.1382363098008292
Epoch:  53


0.16144121136214282 0.13820988897766387
Epoch:  54


0.16059972587469462 0.13957286306789943
Epoch:  55


0.16048931390852542 0.137530260852405
Epoch:  56


0.15990457422024495 0.13843462935515813
Epoch:  57


0.16022330965544726 0.1364624234182494
Epoch:  58


0.15935106776856087 0.13824972191027232
Epoch:  59


0.16022951578771746 0.13696955357279098
Epoch:  60


0.1600888290920773 0.13795522493975504
Epoch:  61


0.16127800981740695 0.13755653372832707
Epoch:  62


0.1618963262519321 0.13926024841410772
Epoch:  63


0.15993193194672867 0.13783178904226848
Epoch    63: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  64


0.15829748519369075 0.13617943333727972
Epoch:  65


0.15879915856026314 0.13665629497596196
Epoch:  66


0.15802176055070516 0.13598150972809112
Epoch:  67


0.1597277260309941 0.13639797270298004
Epoch:  68


0.15818991894657547 0.13559621572494507
Epoch:  69


0.15942262435281598 0.13644046123538697
Epoch:  70


0.15972898981055697 0.13629635316984995
Epoch:  71


0.15835281763527845 0.13558567741087504
Epoch:  72


0.15868452432993296 0.1352935220514025
Epoch:  73


0.158268802472063 0.1354064622095653
Epoch:  74


0.1598355826493856 0.1358175895043782
Epoch:  75


0.1590858001161266 0.13575912799154008
Epoch:  76


0.15777993645216967 0.1352758812052863
Epoch:  77


0.1578578191834527 0.13517870328256062
Epoch:  78


0.1585786507741825 0.13557622688157217
Epoch:  79


0.15762513795414487 0.13538368152720587
Epoch:  80


0.15947948235112266 0.13570308685302734
Epoch:  81


0.1592239362967981 0.13540631426232203
Epoch:  82


0.1585089987194216 0.13574934005737305
Epoch:  83


0.15794118594478918 0.13522092998027802
Epoch    83: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  84


0.15925573859665845 0.13526169529982976
Epoch:  85


0.1589869023174853 0.13560044978346145
Epoch:  86


0.1582504965969034 0.13557964882680348
Epoch:  87


0.1591663541826042 0.13543351207460677
Epoch:  88


0.1587754483963992 0.13536720403603145
Epoch:  89


0.15933605102268425 0.1354632590498243
Epoch    89: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  90


0.1580515785797222 0.13546350811209
Epoch:  91


0.1586105698669279 0.13497633912733623
Epoch:  92


0.15901996799417445 0.13497547592435563
Epoch:  93


0.1583810400318455 0.13531852619988577
Epoch:  94


0.1587455059225495 0.13526373143707002
Epoch:  95


0.15850760405128067 0.13526078845773423
Epoch:  96


0.1581840748722489 0.1356605738401413
Epoch:  97


0.15862714881832535 0.13511680173022406
Epoch    97: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  98


0.15814646554959788 0.13533526446138108
Epoch:  99


0.15759621157839493 0.13559133133717946
