In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sys
sys.path.append("..")
from RandomErasing import RandomErasing

import torchvision.models
from torchvision import transforms

from albumentations import Compose, ShiftScaleRotate, GridDistortion
from albumentations.pytorch import ToTensor

In [2]:
def prepare_data(df, unknown_to_known, logmelspec_dir='../../data/logmelspec',
                 n_frames=635, n_annotators=3):
    """Build model inputs, per-annotator targets and loss masks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation. After ``reset_index()`` it must expose an
        ``audio_filename`` column; every other column is a label column.
        Each file is expected to have at least ``n_annotators`` rows
        (files with fewer rows make the per-annotator slices differ in
        length and the final stacking raises).
    unknown_to_known : dict
        Maps the name of an "unknown" label column to the list of known
        label columns that must be masked out when that unknown label is
        active (> 0.5) in an annotation.
    logmelspec_dir : str, optional
        Directory holding one ``<audio_filename>.npy`` array per file.
    n_frames : int, optional
        Number of leading rows kept from each transposed array.
    n_annotators : int, optional
        Number of annotations per file stacked on the last axis.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_files, 1, n_frames, n_cols_of_transposed_array)``.
    y : numpy.ndarray
        Shape ``(n_files, n_known_labels, n_annotators)``.
    y_mask : numpy.ndarray
        Same shape as ``y``; 1 where the label counts towards the loss,
        0 where it is masked by an active unknown label.

    The caller's ``df`` is not modified (``reset_index`` returns a copy).
    """
    unknown_cols = list(unknown_to_known.keys())

    df = df.reset_index()
    # Number the annotations of every file 1, 2, 3, ... in row order.
    df['slno'] = df.assign(slno=1).groupby('audio_filename')['slno'].cumsum()
    df = df.set_index(['audio_filename', 'slno'])

    # Split off the unknown-label columns: they only drive the mask.
    df_unknown = df.loc[:, unknown_cols].copy()
    df = df.drop(columns=unknown_cols)

    mask_df = df.copy()
    mask_df.loc[:, :] = 1
    for unknown, known in unknown_to_known.items():
        # An active unknown label makes the related known labels unreliable.
        mask_df.loc[df_unknown[unknown] > 0.5, known] = 0

    # Index by (slno, audio_filename) so that each annotator slice lists the
    # files in the same sorted order.
    df = df.swaplevel(i=1, j=0).sort_index()
    mask_df = mask_df.swaplevel(i=1, j=0).sort_index()

    annotators = range(1, n_annotators + 1)
    y = np.stack([df.loc[[a], :].values for a in annotators], axis=2)
    y_mask = np.stack([mask_df.loc[[a], :].values for a in annotators], axis=2)

    # Load the spectrograms in the same file order as the rows of y.
    filenames = df.loc[[1], :].reset_index(1).audio_filename.tolist()
    X = np.stack([
        np.load('{}/{}.npy'.format(logmelspec_dir, name)).T[:n_frames, :]
        for name in filenames])
    X = np.expand_dims(X, axis=1)

    return X, y, y_mask


# Module-level RandomErasing instance (constructed with its defaults) that
# AudioDataset.__getitem__ applies to every augmented sample.
random_erasing = RandomErasing()


class AudioDataset(Dataset):
    """Dataset of spectrogram tensors with optional on-the-fly augmentation.

    Parameters
    ----------
    X : torch.Tensor
        Inputs, indexed along the first axis. The augmentation path assumes
        each ``X[idx]`` is laid out as ``(1, time, channels)`` (this is how
        the notebook builds it) — TODO confirm for other callers.
    y : torch.Tensor
        Targets, indexed along the first axis.
    weights : torch.Tensor
        Per-target loss weights/masks, indexed along the first axis.
    transform : callable or None
        Augmentation called as ``transform(image=ndarray)['image']``. When
        ``None`` samples are returned unchanged.
    """

    def __init__(self, X, y, weights, transform=None):
        self.X = X
        self.y = y
        self.weights = weights
        self.transform = transform
        self.pil = transforms.ToPILImage()

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    @staticmethod
    def _min_max_scale(sample):
        """Scale ``sample`` to [0, 1]; return ``(scaled, minimum, span)``.

        A constant sample has a zero span, which would turn the division
        into 0/0 = NaN. In that case the span is replaced by 1 so the scaled
        sample is all zeros and the inverse mapping restores the constant.
        """
        lowest = sample.min()
        span = sample.max() - lowest
        span = torch.where(span > 0, span, torch.ones_like(span))
        return (sample - lowest) / span, lowest, span

    def __getitem__(self, idx):
        """Return ``(sample, y[idx], weights[idx])``, augmenting if enabled."""
        sample = self.X[idx, ...]

        # Explicit None check: a configured transform is always applied,
        # regardless of its truthiness.
        if self.transform is not None:
            # min-max transformation
            sample, lowest, span = self._min_max_scale(sample)

            # randomly cycle the file along axis 1 (circular shift)
            i = np.random.randint(sample.shape[1])
            sample = torch.cat([
                sample[:, i:, :],
                sample[:, :i, :]],
                dim=1)

            # apply albumentations transforms (they operate on numpy images,
            # hence the round trip through a PIL image)
            sample = np.array(self.pil(sample))
            sample = self.transform(image=sample)
            sample = sample['image']
            # restore a leading singleton axis and swap the last two axes
            sample = sample[None, :, :].permute(0, 2, 1)

            # apply random erasing
            sample = random_erasing(sample.clone().detach())

            # revert min-max transformation
            sample = (sample * span) + lowest

        return sample, self.y[idx, ...], self.weights[idx, ...]

In [3]:
# Load and prepare data
# NOTE: pickle.load can execute arbitrary code; only point this at a trusted
# metadata file.
with open('../../data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

taxonomy_df = metadata['taxonomy_df']
is_unknown = taxonomy_df.fine_id == 'X'
label_cols = ['fine', 'coarse']

# Pair every "unknown" fine label (fine_id == 'X') with the known fine labels
# that share its coarse category, then collect them as {unknown: [known, ...]}.
unknown_known_pairs = pd.merge(
    taxonomy_df.loc[is_unknown, label_cols],
    taxonomy_df.loc[~is_unknown, label_cols],
    on='coarse', how='inner'
).drop(columns='coarse')
unknown_to_known = (
    unknown_known_pairs
    .groupby('fine_x')['fine_y']
    .apply(list)
    .to_dict())
known_labels = taxonomy_df.loc[~is_unknown, 'fine'].tolist()

# Coarse and fine annotations side by side, one frame per split.
train_df = pd.concat(
    [metadata['coarse_train'], metadata['fine_train']], axis=1, sort=True)
valid_df = pd.concat(
    [metadata['coarse_test'], metadata['fine_test']], axis=1, sort=True)

In [4]:
# manual correction for one data point:
# any annotation row whose labels sum to 37 is reset to all zeros.
for labels_df in (train_df, valid_df):
    row_sums_to_37 = labels_df.sum(axis=1) == 37
    labels_df.loc[row_sums_to_37, :] = 0

In [5]:
# Build the inputs (X), per-annotator targets (y) and loss masks (y_mask)
# for the training and validation splits.
train_X, train_y, train_y_mask = prepare_data(train_df, unknown_to_known)
valid_X, valid_y, valid_y_mask = prepare_data(valid_df, unknown_to_known)

In [6]:
# Channel wise normalization
# The statistics are computed per entry of the last axis, from the training
# split only, and reused unchanged for the validation split.
# NOTE: this cell overwrites train_X / valid_X, so re-running it without
# re-running the previous cell normalises the data a second time.
n_channels = train_X.shape[-1]  # derived from the data instead of a literal
train_flat = train_X.reshape(-1, n_channels)
channel_means = train_flat.mean(axis=0).reshape(1, 1, 1, -1)
channel_stds = train_flat.std(axis=0).reshape(1, 1, 1, -1)
train_X = (train_X - channel_means) / channel_stds
valid_X = (valid_X - channel_means) / channel_stds

In [7]:
# Define the data augmentation transformations
# The steps run in list order; the last one converts the image to a tensor.
augmentation_steps = [
    ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0.5),
    GridDistortion(),
    ToTensor(),
]
albumentations_transform = Compose(augmentation_steps)

In [8]:
# Create the datasets and the dataloaders
batch_size = 64

# Only the training set is augmented; validation samples pass through as-is.
train_dataset = AudioDataset(
    X=torch.Tensor(train_X),
    y=torch.Tensor(train_y),
    weights=torch.Tensor(train_y_mask),
    transform=albumentations_transform)
valid_dataset = AudioDataset(
    X=torch.Tensor(valid_X),
    y=torch.Tensor(valid_y),
    weights=torch.Tensor(valid_y_mask),
    transform=None)

val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# Two independently shuffled loaders over the same training set; the training
# loop zips them to obtain random sample pairs for mixup.
train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [9]:
# Define the device to be used
# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without CUDA.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [10]:
def weight_reset(layer):
    """Re-initialise ``layer`` in place if it provides ``reset_parameters``.

    Objects without a ``reset_parameters`` attribute are left untouched,
    which makes the function suitable for ``module.apply(weight_reset)``.
    """
    if not hasattr(layer, 'reset_parameters'):
        return
    layer.reset_parameters()

In [11]:
# Placeholder default; the "# Parameters" cell that follows overwrites it
# (presumably injected by a notebook parameterisation tool — TODO confirm).
until_x = None

In [12]:
# Parameters
# until_x: index (inclusive) of the last child of mv2.features whose weights
# Task5Model re-initialises instead of keeping the pretrained values.
until_x = 9


In [13]:
class Task5Model(nn.Module):
    """MobileNetV2-based multi-label classifier for 1-channel spectrograms.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    reset_until : int or None, optional
        Index (inclusive) of the last child of ``mv2.features`` whose
        weights are re-initialised. ``None`` (default) falls back to the
        notebook-level ``until_x``; if that is ``None`` as well, no layer is
        reset. Pass ``-1`` to keep every pretrained layer regardless of
        ``until_x``.
    """

    def __init__(self, num_classes, reset_until=None):

        super().__init__()

        # 1x1 convolutions that turn the single input channel into 3 channels
        # before the backbone.
        self.bw2col = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 10, 1, padding=0), nn.ReLU(),
            nn.Conv2d(10, 3, 1, padding=0), nn.ReLU())

        self.mv2 = torchvision.models.mobilenet_v2(pretrained=True)

        # Classification head applied to the pooled backbone features
        # (1280 inputs — presumably the width of mv2.features' output).
        self.final = nn.Sequential(
            nn.Linear(1280, 512), nn.ReLU(), nn.BatchNorm1d(512),
            nn.Linear(512, num_classes))

        if reset_until is None:
            # Backward compatible: use the notebook parameter when the caller
            # does not pass an explicit value.
            reset_until = until_x
        self._reset_features(reset_until)

    def _reset_features(self, last_index):
        """Re-initialise children ``0..last_index`` of ``mv2.features``.

        ``None`` means "reset nothing"; comparing an int with ``None`` would
        otherwise raise a TypeError.
        """
        if last_index is None:
            return
        for i, block in enumerate(self.mv2.features.children()):
            if i > last_index:
                break
            block.apply(weight_reset)

    def forward(self, x):
        """Return the logits for a batch ``x``."""
        x = self.bw2col(x)
        x = self.mv2.features(x)
        # Global max pooling over the two trailing dimensions.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [14]:
# Instantiate the model
# One logit per label column of the targets: train_y is laid out as
# (files, labels, annotators), and the loss requires the model output width
# to match the label axis, so derive it from the data instead of a literal.
num_classes = train_y.shape[1]
model = Task5Model(num_classes).to(device)

In [15]:
# Define optimizer, scheduler and loss criteria
# reduction='none' keeps the element-wise losses so the training loop can
# multiply them by the masks before averaging.
criterion = nn.BCEWithLogitsLoss(reduction='none')
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=True)
# The scheduler is stepped with the validation loss once per epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, patience=5, verbose=True)

In [16]:
epochs = 100
alpha = 1  # mixup draws its mixing weights from Beta(alpha, alpha)
early_stopping_patience = 25  # epochs without a new best validation loss
train_loss_hist = []
valid_loss_hist = []
lowest_val_loss = np.inf
epochs_without_new_lowest = 0


def masked_annotation_loss(outputs, labels, masks):
    """Masked BCE loss summed over every annotation set.

    ``labels`` and ``masks`` carry one slice per annotation set on their last
    axis. The element-wise loss of ``outputs`` against each slice is
    multiplied by the matching mask slice, the results are summed, and the
    total is divided by the sum of all mask values.
    """
    total = sum(
        (criterion(outputs, labels[:, :, a]) * masks[:, :, a]).sum()
        for a in range(labels.shape[2]))
    return total / masks.sum()


for epoch in range(epochs):
    print('Epoch: ', epoch)

    # ---- training pass ----
    model.train()
    this_epoch_train_loss = 0
    for batch_1, batch_2 in zip(train_loader_1, train_loader_2):

        # mixup the inputs ---------
        # One mixing weight per sample, shared by inputs, labels and masks.
        mixup_vals = np.random.beta(alpha, alpha, batch_1[0].shape[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
        inputs = (lam * batch_1[0]) + ((1 - lam) * batch_2[0])

        lam = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1))
        labels = (lam * batch_1[1]) + ((1 - lam) * batch_2[1])
        masks = (lam * batch_1[2]) + ((1 - lam) * batch_2[2])
        # mixup ends ----------

        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = masked_annotation_loss(outputs, labels, masks)
        loss.backward()
        optimizer.step()
        this_epoch_train_loss += loss.detach().cpu().numpy()

    # ---- validation pass (no gradients, no parameter updates) ----
    model.eval()
    this_epoch_valid_loss = 0
    with torch.set_grad_enabled(False):
        for inputs, labels, masks in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            outputs = model(inputs)
            loss = masked_annotation_loss(outputs, labels, masks)
            this_epoch_valid_loss += loss.detach().cpu().numpy()

    # Average of the per-batch losses.
    this_epoch_train_loss /= len(train_loader_1)
    this_epoch_valid_loss /= len(val_loader)

    train_loss_hist.append(this_epoch_train_loss)
    valid_loss_hist.append(this_epoch_valid_loss)

    # Report before any early stop so the final epoch is logged as well.
    print(this_epoch_train_loss, this_epoch_valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if this_epoch_valid_loss < lowest_val_loss:
        lowest_val_loss = this_epoch_valid_loss
        torch.save(model.state_dict(), './model_system1_until_{}'.format(until_x))
        epochs_without_new_lowest = 0
    else:
        epochs_without_new_lowest += 1

    if epochs_without_new_lowest >= early_stopping_patience:
        break

    scheduler.step(this_epoch_valid_loss)

Epoch:  0


0.6354089169888884 0.5064420785222735
Epoch:  1


0.33472813263132767 0.20747901499271393
Epoch:  2


0.19110658039917816 0.16741499517645156
Epoch:  3


0.1795896529345899 0.1733043087380273
Epoch:  4


0.1745082200378985 0.1708536446094513
Epoch:  5


0.1741253725580267 0.1722688845225743
Epoch:  6


0.17102474860242894 0.1621606945991516
Epoch:  7


0.16874408278916334 0.16051746691976274
Epoch:  8


0.16974369013631666 0.17297516337462834
Epoch:  9


0.16844397541638967 0.15039281972817012
Epoch:  10


0.16567467596079852 0.1528526246547699
Epoch:  11


0.1651479287727459 0.1558391238961901
Epoch:  12


0.16492476294169556 0.14428867612566268
Epoch:  13


0.16391940656546 0.15162532457283565
Epoch:  14


0.1622343808412552 0.1431172979729516
Epoch:  15


0.16181529172368952 0.13767407728093012
Epoch:  16


0.16144373126932093 0.1418410220316478
Epoch:  17


0.16162393906631986 0.1400893075125558
Epoch:  18


0.16047319206031593 0.13499018124171666
Epoch:  19


0.15939031501074094 0.1327372874532427
Epoch:  20


0.15907457913901354 0.1336375613297735
Epoch:  21


0.15845425507506808 0.14325359357254847
Epoch:  22


0.15936162020709063 0.13233307216848647
Epoch:  23


0.1585805492626654 0.13200814383370535
Epoch:  24


0.15761386905167554 0.13322474275316512
Epoch:  25


0.15741467677258156 0.13432172472987855
Epoch:  26


0.15637052582727895 0.1337719847049032
Epoch:  27


0.15791572187397931 0.1352385782769748
Epoch:  28


0.1574350897524808 0.13269297352858953
Epoch:  29


0.15559494777305707 0.12903090992144176
Epoch:  30


0.15599479143683975 0.13566578711782182
Epoch:  31


0.15627882085941933 0.1341452630502837
Epoch:  32


0.15444578995575775 0.12873388720410212
Epoch:  33


0.15514972564336416 0.13050710835627147
Epoch:  34


0.15476051657586484 0.12841700975384032
Epoch:  35


0.15484526189597878 0.13211078303200857
Epoch:  36


0.1548251360654831 0.12836869486740657
Epoch:  37


0.15453793551470782 0.1295265108346939
Epoch:  38


0.1544666531923655 0.1326127957020487
Epoch:  39


0.1531599498278386 0.1293226916875158
Epoch:  40


0.1531184126396437 0.1320517754980496
Epoch:  41


0.15437140577548258 0.13042709337813513
Epoch:  42


0.1526416950934642 0.13232983223029546
Epoch    42: reducing learning rate of group 0 to 1.0000e-04.
Epoch:  43


0.1525542434002902 0.12528808095625468
Epoch:  44


0.15035730841997508 0.12509129409279143
Epoch:  45


0.15060849125320847 0.1251028882605689
Epoch:  46


0.15073564366714373 0.12509576018367494
Epoch:  47


0.1499012245519741 0.12463087482111794
Epoch:  48


0.15003035116840052 0.12480038723775319
Epoch:  49


0.150760498401281 0.124496870807239
Epoch:  50


0.14962244033813477 0.12425646079438073
Epoch:  51


0.15008812740042404 0.12488193171364921
Epoch:  52


0.14934068757134514 0.12468268083674568
Epoch:  53


0.14894519222749247 0.12420000774519784
Epoch:  54


0.15021220493961024 0.12391725395406995
Epoch:  55


0.1489819203679626 0.12432742863893509
Epoch:  56


0.14955575925272865 0.12377821760518211
Epoch:  57


0.14922309767555547 0.12453172781637736
Epoch:  58


0.14794825702100187 0.12399553933313914
Epoch:  59


0.14922311620132342 0.12388589871781212
Epoch:  60


0.15015216695295797 0.12413777943168368
Epoch:  61


0.14920647039606766 0.12494820782116481
Epoch:  62


0.1489487471612724 0.12434822640248708
Epoch    62: reducing learning rate of group 0 to 1.0000e-05.
Epoch:  63


0.14776696546657667 0.1239736367549215
Epoch:  64


0.14820918561639013 0.12420334773404258
Epoch:  65


0.14846155732064634 0.12400107830762863
Epoch:  66


0.14849598907135628 0.1234730961067336
Epoch:  67


0.148773337940912 0.12377612612077168
Epoch:  68


0.14933944151208206 0.12368108438594001
Epoch:  69


0.14805740883221497 0.1237631578530584
Epoch:  70


0.14776360908070127 0.12327345567090171
Epoch:  71


0.1490605614475302 0.12358567437955312
Epoch:  72


0.14940610930726334 0.12350380527121681
Epoch:  73


0.147966187547993 0.12339084701878685
Epoch:  74


0.14824218886929588 0.1235194770353181
Epoch:  75


0.14859978451922135 0.12359563899891716
Epoch:  76


0.14755532185773593 0.12341480595724923
Epoch    76: reducing learning rate of group 0 to 1.0000e-06.
Epoch:  77


0.14864051180916862 0.12357273484979357
Epoch:  78


0.1496763293807571 0.12311139702796936
Epoch:  79


0.14825913914152095 0.12365203882966723
Epoch:  80


0.1494440710222399 0.1234825924038887
Epoch:  81


0.14812893762781815 0.12356301077774592
Epoch:  82


0.14890787770619263 0.12363147735595703
Epoch:  83


0.14780270892220573 0.12354019390685218
Epoch:  84


0.14862332231289632 0.12368501935686384
Epoch    84: reducing learning rate of group 0 to 1.0000e-07.
Epoch:  85


0.1470484061015619 0.12368942903620857
Epoch:  86


0.14819138517250885 0.12354373719011034
Epoch:  87


0.14804207674554876 0.12334162529025759
Epoch:  88


0.14909749377418208 0.12326512272868838
Epoch:  89


0.1481157227142437 0.12384295144251414
Epoch:  90


0.14907769657470085 0.12352732143231801
Epoch    90: reducing learning rate of group 0 to 1.0000e-08.
Epoch:  91


0.1471437348707302 0.12332687633378166
Epoch:  92


0.14866535486401738 0.12373614204781395
Epoch:  93


0.1475348114161878 0.12340786733797618
Epoch:  94


0.14817793103488716 0.12350942620209285
Epoch:  95


0.14882971346378326 0.12334136239119939
Epoch:  96


0.14815274968340592 0.12343086408717292
Epoch:  97


0.14863174307990717 0.12389550570930753
Epoch:  98


0.14896334909104011 0.12317248008080892
Epoch:  99


0.14891532748132139 0.12358613737991878
