# ACP Project - Systematic Model Comparison

In [1]:
# Standard-library and parallelism helpers.
import warnings, pickle, os, itertools
from dataclasses import dataclass
from joblib import Parallel, delayed, parallel_backend

# Optional dependency: when sklearnex is importable, patch_sklearn() is applied
# here, ahead of the scikit-learn imports in later cells. When it is missing the
# ImportError is swallowed on purpose and the notebook carries on without it.
try:
    from sklearnex import patch_sklearn
    patch_sklearn()
except ImportError:
    pass

import numpy as np
import pandas as pd
# Display every column of a frame and up to 300 rows before pandas truncates.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 300)

from IPython.display import display
import matplotlib.pyplot as plt

import seaborn as sns
# Default figure size for every plot in this notebook: 10 x 10 inches.
sns.set(rc={'figure.figsize':(10,10)})

import shap
import optuna, sqlalchemy
# Keep Optuna quiet by default; the study cell further down raises this to INFO.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# autoreload mode 1 only reloads modules registered through %aimport, which the
# following cells do for the project modules they import.
%load_ext autoreload
%autoreload 1

In [2]:
from utils.evaluation import get_metrics, get_threshold_fpr
%aimport utils.evaluation

In [3]:
from systematic_comparison import  *
from models import * 
%aimport systematic_comparison, models

In [4]:
from sklearn.model_selection import train_test_split
from dataset import SCIData, SCICols
%aimport dataset

# Raw dataset handle; nothing in the visible cells reads `sci` again.
sci = SCIData.load('data/sci.h5')

# Working dataset: the processed file is loaded, sorted by AdmissionDateTime and
# then passed through the SCIData preparation steps chained below.
# NOTE(review): what each step does is defined in dataset.py, not in this
# notebook — confirm their effect (and whether they keep row order) there.
scii = (
    SCIData(SCIData.quickload("data/sci_processed.h5").sort_values("AdmissionDateTime"))
    .mandate(SCICols.news_data_raw)
    .derive_critical_event(within=1, return_subcols=True)
    .augment_shmi(onehot=True)
    .derive_ae_diagnosis_stems(onehot=False)
)

# Unshuffled 67/33 split: the first rows become the training set and the last
# 33% the test set. With the sort above this is presumably a chronological
# hold-out — verify that the preparation steps preserve the ordering.
# train_test_split returns a (train, test) pair for each input, so the train
# halves of the two outcome columns are discarded through `_`.
# random_state is kept for reference only; it has no effect while shuffle=False.
sci_train, sci_test, _, y_test_mortality, _, y_test_criticalcare = train_test_split(
    scii,
    scii.DiedWithinThreshold,
    scii.CriticalCare,
    test_size=0.33,
    random_state=42,
    shuffle=False,
)
# Re-wrap the split halves as SCIData for the cells that follow.
sci_train, sci_test = SCIData(sci_train), SCIData(sci_test)
# (X_train, y_train), (X_test, y_test) = (
#     sci_train.xy(outcome="CriticalEvent", dropna=False, fillna=False),
#     sci_test.xy(outcome="CriticalEvent", dropna=False, fillna=False),
# )

In [5]:
# Smoke-test the study grid: build one study per grid entry and run two trials
# of each on a subsample, so configuration problems surface quickly.
optuna.logging.set_verbosity(optuna.logging.INFO)

SMOKE_SAMPLE_SIZE = 10_000  # rows drawn per study; capped at len(scii) below
SMOKE_SEED = 42             # same value as the train/test split seed above

studies = study_grid(
    estimators=[Estimator_IsolationForest],
    resamplers=[None],
    scii=scii,
    outcome_thresholds=[1],
    features=scii.feature_group_combinations
)

# NOTE(review): the recorded run of this cell stopped with
# "IndexError: index -2 is out of bounds for axis 0 with size 1", raised from
# inside the study call. The cause is not visible in this notebook; check the
# objective/metric code in systematic_comparison and utils.evaluation.
for study_kwargs in studies:
    # Seeded and size-capped: every rerun (and every study) sees the same rows,
    # and a dataset smaller than SMOKE_SAMPLE_SIZE no longer makes sample() raise.
    smoke_subset = SCIData(
        scii.sample(n=min(SMOKE_SAMPLE_SIZE, len(scii)), random_state=SMOKE_SEED)
    )
    s = construct_study(**study_kwargs, scii=smoke_subset, cv_jobs=5)

    # Trained models are persisted under models/test/ by the study callable.
    r = s(n_trials=2, model_persistence_path='models/test/')
    


[32m[I 2022-11-14 15:24:38,499][0m A new study created in memory with name: IsolationForest_None_Within-1_news[0m
[32m[I 2022-11-14 15:24:53,069][0m Trial 0 finished with value: 0.1906763821299505 and parameters: {'IsolationForest__n_estimators': 153, 'IsolationForest__max_samples': 0.8801704338759069, 'IsolationForest__max_features': 0.5676133891506027, 'IsolationForest__bootstrap': False}. Best is trial 0 with value: 0.1906763821299505.[0m
[32m[I 2022-11-14 15:24:54,033][0m Trial 1 finished with value: 0.1879363013243021 and parameters: {'IsolationForest__n_estimators': 102, 'IsolationForest__max_samples': 0.635656589718247, 'IsolationForest__max_features': 0.22933077808514724, 'IsolationForest__bootstrap': False}. Best is trial 0 with value: 0.1906763821299505.[0m
[32m[I 2022-11-14 15:25:40,133][0m Trial 0 finished with value: 0.19583857476302302 and parameters: {'IsolationForest__n_estimators': 151, 'IsolationForest__max_samples': 0.47801235969139877, 'IsolationForest__m

IndexError: index -2 is out of bounds for axis 0 with size 1

In [41]:
# Build train/test matrices for the 'news' feature group, then restrict the
# training data to rows whose label is False: the one-class model in the next
# cell is fitted on those rows only.
X_train, X_test, y_train, y_test = get_xy(scii, Estimator_OneClassSVM, scii.feature_group_combinations['news'])

# One boolean mask drives both selections, so features and labels stay aligned
# even if the index is not unique. astype(bool) guards against 0/1 integer
# labels, for which a bare `~` would be a bitwise NOT rather than a logical one.
is_negative_train = ~y_train.astype(bool)
X_train = X_train[is_negative_train]
y_train = y_train[is_negative_train]

In [42]:
# Assemble the OneClassSVM pipeline (no resampler) and override the
# hyperparameters of its 'OneClassSVM' step before fitting.
model = PipelineFactory(Estimator_OneClassSVM, None, SCIData(X_train), y_train)()

# NOTE(review): learning_rate / eta0 / average are SGD-style options, so the
# step is presumably built on scikit-learn's SGDOneClassSVM — confirm in
# models.py. `average` is documented there as bool or int, hence the integer
# literal (numerically the same as the former 1e5).
model.set_params(
    OneClassSVM__verbose=1,                 # print per-epoch progress
    OneClassSVM__tol=1e-12,
    OneClassSVM__nu=1e-3,
    OneClassSVM__learning_rate='adaptive',
    OneClassSVM__eta0=100,
    OneClassSVM__average=100_000,
)

# Unsupervised fit: X_train was filtered to negative rows in the previous cell.
model.fit(X_train)

-- Epoch 1
Norm: 12.74, NNZs: 7, Bias: 193.600000, T: 79074, Avg. loss: 0.052632
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 0.12, NNZs: 7, Bias: 86.200000, T: 158148, Avg. loss: 0.093937
Total training time: 0.02 seconds.
-- Epoch 3
Norm: 0.01, NNZs: 7, Bias: 278.800000, T: 237222, Avg. loss: 0.124226
Total training time: 0.03 seconds.
-- Epoch 4
Norm: 0.00, NNZs: 7, Bias: 171.400000, T: 316296, Avg. loss: 0.083996
Total training time: 0.04 seconds.
-- Epoch 5
Norm: 0.00, NNZs: 7, Bias: 164.000000, T: 395370, Avg. loss: 0.105799
Total training time: 0.04 seconds.
-- Epoch 6
Norm: 0.00, NNZs: 6, Bias: 56.600000, T: 474444, Avg. loss: 0.127316
Total training time: 0.05 seconds.
-- Epoch 7
Norm: 0.00, NNZs: 7, Bias: 35.120000, T: 553518, Avg. loss: 0.020235
Total training time: 0.06 seconds.
-- Epoch 8
Norm: 1.30, NNZs: 7, Bias: 33.640000, T: 632592, Avg. loss: 0.019725
Total training time: 0.07 seconds.
-- Epoch 9
Norm: 0.00, NNZs: 7, Bias: 32.160000, T: 711666, Avg. loss: 0.022

In [18]:
from sklearn.metrics import roc_auc_score

In [43]:
# Hold-out ROC AUC of the fitted pipeline. The decision values are negated so
# that a larger score is meant to indicate a more anomalous row.
# NOTE(review): this assumes higher decision_function = more normal; the
# recorded AUC is below 0.5, so confirm the sign convention of the wrapper.
roc_auc_score(
    y_true=y_test,
    y_score=-model.decision_function(X_test),
)

0.27208706810772665

In [45]:
from sklearn.metrics import roc_auc_score

In [54]:
# Smallest negated decision value of a default-configured OneClassSVMWrapper
# that is fitted on X and then scored on the same X.
# NOTE(review): `X` is not defined in any visible cell — this relies on kernel
# state from a cell that is no longer in the notebook.
(-OneClassSVMWrapper().fit(X).decision_function(X)).min()

-0.0536743464871563

In [62]:
# Number of rows of X that a default SGDOneClassSVM, fitted on X itself, labels
# as -1 (scikit-learn's outlier label).
# NOTE(review): `X` is not defined in any visible cell, and no random_state is
# passed, so the count presumably varies between runs — confirm before quoting it.
(SGDOneClassSVM().fit(X).predict(X)==-1).sum()

35620

In [49]:
# In-sample ROC AUC: a default OneClassSVMWrapper is fitted on X and scored on
# the same X, with the decision values negated before ranking against y.
# NOTE(review): `X` and `y` are not defined in any visible cell.
roc_auc_score(
    y_true=y,
    y_score=-OneClassSVMWrapper().fit(X).decision_function(X),
)

0.7942450985107733

In [20]:
# Inspect which decision_function the estimator definition resolves to.
# The attribute is displayed, not called: calling it without an instance and
# data would raise TypeError, and the recorded output is the function's repr.
Estimator_OneClassSVM._estimator.decision_function

<function models.OneClassSVMWrapper.decision_function(self, X)>

In [None]:
# optuna.logging.set_verbosity(optuna.logging.INFO)
# studies = [construct_study(**_, storage='sqlite:///models/studies.db', n_trials=2, sci_train=SCIData(sci_train.head(1000)), sci_test=SCIData(sci_test.head(1000))) for _ in get_studies(sci_train, study_grid)[:5]]
# with parallel_backend("loky", inner_max_num_threads=1):
#             results = Parallel(n_jobs=1)(
#                 delayed(_)(n_trials=2) for _ in studies[:2]
#             )