In [None]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import datetime
import itertools
import statistics

In [None]:
# Load the dataset and cast every column (features and label) to float32.
df = pd.read_csv('./dataset/data.csv').astype('float32')
# Remap label value 21 to 19.
# NOTE(review): presumably keeps labels inside the 0-19 range the 20-output
# model can represent — confirm against the dataset.
df['label'] = df['label'].replace(21, 19)
# Preview the first four rows as a quick sanity check.
print(df.head(4))

In [None]:
import torch

# Separate the feature columns from the 'label' target column.
X = df.drop('label', axis=1)
y = df['label']

# Convert to tensors: float32 features, integer labels reshaped to one column.
X = torch.tensor(X.values, dtype=torch.float32)
y = torch.tensor(y.values, dtype=torch.int).reshape(-1,1)

In [None]:
# Define ordinal criterion
from net_archs import MLP

def ordinal_criterion(predictions, targets):
  """Per-sample ordinal regression loss.

  Uses the cumulative target encoding from https://arxiv.org/pdf/0704.1028.pdf:
  a label k becomes a vector whose first k + 1 entries are 1 and the rest 0,
  i.e. 0 -> [1,0,0,...], 2 -> [1,1,1,0,...].

  Args:
    predictions: tensor of shape [batch_size, num_labels] with model outputs.
    targets: one class index per sample, shaped [batch_size] or
      [batch_size, 1]. Non-integer values are truncated toward zero.
      Labels below 0 encode to an all-zero row; labels >= num_labels encode
      to an all-one row.

  Returns:
    Tensor of shape [batch_size]: squared error between predictions and the
    encoded targets, summed over the label dimension.
  """
  # Create our modified target with [batch_size, num_labels] shape in a single
  # vectorised comparison: entry (i, j) is 1 iff j <= label_i. This avoids a
  # Python loop over samples and the negative-slice wraparound that
  # `row[0:label+1] = 1` would hit for labels below -1.
  labels = targets.reshape(-1, 1).long().to(predictions.device)
  positions = torch.arange(predictions.shape[1], device=predictions.device)
  modified_target = (positions <= labels).to(predictions.dtype)

  return nn.MSELoss(reduction='none')(predictions, modified_target).sum(axis=1)


In [None]:
from sklearn.model_selection import KFold

def fl_split(split, x_train, y_train):
    """Partition the training data by share and oversample each partition.

    The data is first cut into 50 equal, unshuffled KFold chunks. Each entry
    ``s`` of ``split`` then receives the next ``int(s * 50)`` chunks, and every
    resulting partition is class-balanced with ``RandomOverSampler``.

    Note that ``int()`` truncates: shares such as 0.33 or 0.05 resolve to 16
    and 2 chunks, so the trailing chunks can remain unassigned.

    ``RandomOverSampler`` is resolved at call time; in this notebook it is
    imported in a later cell, which must have run before this is called.

    Args:
        split: iterable of fractions, one per partition.
        x_train: feature tensor, indexed along dim 0.
        y_train: label tensor aligned with ``x_train``.

    Returns:
        ``(x_split, y_split)``: two lists with one tensor per partition; the
        labels are reshaped to a single column.

    Raises:
        ValueError: if a share resolves to no chunks (too small a fraction, or
            the shares before it already consumed all 50 chunks).
    """
    n_splits = 50
    chunker = KFold(n_splits=n_splits)

    # Only the held-out indices of each fold are used: together they tile the
    # data into n_splits contiguous chunks.
    x_eq = []
    y_eq = []
    for _, chunk_index in chunker.split(x_train, y_train):
        x_eq.append(x_train[chunk_index])
        y_eq.append(y_train[chunk_index])

    x_split = []
    y_split = []

    start = 0
    for share in split:
        n_chunks = int(share * n_splits)
        x_chunks = x_eq[start:start + n_chunks]
        y_chunks = y_eq[start:start + n_chunks]
        if not x_chunks:
            # torch.cat on an empty list fails with an opaque message; say
            # which share caused it instead.
            raise ValueError(
                f"split share {share!r} resolves to no data chunks "
                f"(chunks already assigned: {start} of {n_splits})"
            )
        x_split.append(torch.cat(x_chunks, 0))
        y_split.append(torch.cat(y_chunks, 0))
        start += n_chunks

    for i in range(len(x_split)):
        ros = RandomOverSampler(random_state=42)

        # fit_resample returns numpy arrays; convert back to tensors.
        x_resampled, y_resampled = ros.fit_resample(x_split[i], y_split[i])

        x_split[i] = torch.from_numpy(x_resampled)
        y_split[i] = torch.reshape(torch.tensor(y_resampled), (-1, 1))

    return x_split, y_split

In [None]:
# Define train and test function
from sklearn.metrics import mean_squared_error, roc_auc_score, f1_score

def train(net, optimizer, x, y, x_test, y_test, num_epoch=20, batch_size=8, criterion=ordinal_criterion):
  """Train `net` with shuffled mini-batches and report progress periodically.

  Each epoch draws a fresh random permutation of the samples, walks through it
  in slices of `batch_size`, and takes one optimizer step per slice on the
  mean of the per-sample losses returned by `criterion`.

  Every 10th epoch the summed loss of the last mini-batch is printed, followed
  by the AUC and MSE that `test` computes on (x_test, y_test).
  """
  report_interval = 10
  num_samples = x.size(0)

  for epoch in range(1, num_epoch + 1):
    # New sample order for every epoch
    shuffled = torch.randperm(num_samples)
    for start in range(0, num_samples, batch_size):
      batch_idx = shuffled[start:start + batch_size]
      outputs = net(x[batch_idx])
      loss = criterion(outputs.float(), y[batch_idx].float())
      optimizer.zero_grad()
      loss.mean().backward()
      optimizer.step()

    if epoch % report_interval != 0:
      continue

    # `loss` is the loss of the final mini-batch of this epoch
    print(f'Epoch: {epoch}, Loss: {loss.sum()}')

    mse, _, auc, _ = test(net, x_test, y_test)
    print(f'AUC: {auc} mse {mse}')


def prediction2label(pred: np.ndarray):
  """Decode ordinal score vectors into class labels.

  The label is the length of the leading run of scores above 0.5, minus one:

  [0.9, 0.1, 0.1, 0.1] -> 0
  [0.9, 0.9, 0.1, 0.1] -> 1
  [0.9, 0.9, 0.9, 0.1] -> 2
  etc.

  Scores after the first one that is not above 0.5 are ignored, and a row
  whose first score is not above 0.5 decodes to -1.
  """
  above_threshold = pred > 0.5
  # The running product stays 1 only while every score so far passed.
  leading_run = above_threshold.cumprod(axis=1)
  run_length = leading_run.sum(axis=1)
  return run_length - 1

def test(net, x_test, y_test):
  """Evaluate `net` on a held-out set.

  A model with 20 output columns is treated as an ordinal model and decoded
  with `prediction2label`; any other output is rounded and clipped to [0, 20].

  Args:
    net: callable model returning a 2-D tensor for `x_test`.
    x_test: input tensor.
    y_test: tensor of true labels.

  Returns:
    (mse, mse_norm, auc, f1) where
      mse      - mean squared error between true and predicted labels,
      mse_norm - the same error after dividing both by 20,
      auc, f1  - ROC AUC and F1 of the binarised labels (label > 9 -> 1).
                 NOTE(review): the AUC is computed from hard 0/1 predictions,
                 not from scores.
  """
  # Evaluation only: run the model once and skip building an autograd graph.
  with torch.no_grad():
    raw_output = net(x_test)

  if raw_output.shape[1] == 20:
    y_pred = prediction2label(raw_output)
  else:
    y_pred = torch.clip(raw_output.round(), min=0, max=20)

  y_true_np = y_test.detach().numpy()
  y_pred_np = y_pred.detach().numpy()

  # Binary view of the task: labels above 9 form the positive class.
  y_default_test = np.where(y_true_np > 9, 1, 0)
  y_default_pred = np.where(y_pred_np > 9, 1, 0)

  mse = mean_squared_error(y_true_np, y_pred_np)
  mse_norm = mean_squared_error(y_true_np / 20, y_pred_np / 20)
  auc = roc_auc_score(y_default_test, y_default_pred)
  f1 = f1_score(y_default_test, y_default_pred)
  return mse, mse_norm, auc, f1


In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import RandomOverSampler


def run_local(split, X=X, y=y):
    """Train one independent model per data partition and print the scores.

    Runs a 5-fold stratified cross-validation over (X, y). In every fold the
    training portion is divided by `fl_split` according to `split`; a fresh
    MLP is then trained on each partition alone and evaluated on the fold's
    test portion. The MSE and AUC of every model are collected and printed
    together with `split` once all folds are done.
    """
    mses, aucs = [], []

    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    print(type(X))
    print(type(y))
    for fold_number, (train_idx, test_idx) in enumerate(outer_cv.split(X, y), start=1):
        print(f"Fold {fold_number}:")

        X_test, y_test = X[test_idx], y[test_idx]
        x_parts, y_parts = fl_split(split, X[train_idx], y[train_idx])

        # One model per partition, each trained from scratch
        for x_part, y_part in zip(x_parts, y_parts):
            model = MLP(22, 20, layer_size=64, num_of_layers=5, dropout=False)
            optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.00005)
            train(model, optimizer, x_part, y_part, X_test, y_test, num_epoch=256, batch_size=8)

            mse, _, auc, _ = test(model.cpu(), X_test, y_test)
            mses.append(mse)
            aucs.append(auc)

    print('split',split)
    print('mse',mses)
    print('auc',aucs)


In [None]:
# Baseline: a single partition that receives all of the training chunks.
split =[1]
run_local(split)

In [None]:
# Two equal partitions.
split =[0.5, 0.5]
run_local(split)

In [None]:
# Two partitions, mildly unbalanced (60% / 40%).
split =[0.6, 0.4]
run_local(split)

In [None]:
# Two partitions, strongly unbalanced (80% / 20%).
split =[0.8, 0.2]
run_local(split)

In [None]:
# Three roughly equal partitions.
# NOTE(review): fl_split uses int(share * 50), so each 0.33 share truncates
# from 16.5 to 16 chunks and not all 50 chunks get assigned.
split = [0.34, 0.33, 0.33]
run_local(split)

In [None]:
# Three partitions: one with 60% of the chunks and two with 20% each.
split = [0.6, 0.2, 0.2]
run_local(split)

In [None]:
# Three partitions: one with 80% of the chunks and two with 10% each.
split = [0.8, 0.1, 0.1]
run_local(split)

In [None]:
# Five equal partitions.
split = [0.2, 0.2, 0.2, 0.2, 0.2]
run_local(split)

In [None]:
# Five partitions: one with 60% of the chunks and four with 10% each.
split = [0.6, 0.1, 0.1, 0.1, 0.1]
run_local(split)

In [None]:
# Five partitions: one with 80% of the chunks and four small ones.
# NOTE(review): fl_split uses int(share * 50), so each 0.05 share truncates
# from 2.5 to 2 chunks and not all 50 chunks get assigned.
split = [0.8, 0.05, 0.05, 0.05, 0.05]
run_local(split)

In [None]:
# Ten equal partitions.
split = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
run_local(split)

In [None]:
# Ten partitions: one with 60% of the chunks and nine small ones.
# NOTE(review): fl_split uses int(share * 50), so each 0.05 share truncates
# from 2.5 to 2 chunks and not all 50 chunks get assigned.
split = [0.6, 0.05, 0.05, 0.05, 0.05, 0.04, 0.04, 0.04, 0.04, 0.04]
run_local(split)

In [None]:
# Ten partitions: one with 80% of the chunks and nine very small ones.
# NOTE(review): fl_split uses int(share * 50), so each 0.03 share truncates
# from 1.5 to 1 chunk and not all 50 chunks get assigned.
split = [0.8, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02]
run_local(split)