In [1]:
import os, pickle, warnings, itertools
from pathlib import Path
from functools import partial 

import numpy as np
import pandas as pd

from tqdm import tqdm
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'whitegrid' style for every figure drawn in this notebook.
sns.set_style('whitegrid')

import logging
# Root logger at INFO; each record is rendered as "<timestamp> [<LEVEL>] <message>".
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
)


# Autoreload in mode 1: only the modules registered with %aimport below are
# re-imported before a cell runs, so edits to these project packages are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 1
%aimport salford_datasets.salford, salford_datasets.salford_raw, transformer_experiment.utils.embeddings, transformer_experiment.utils.finetuning, transformer_experiment.utils.shallow_classifiers
%aimport acd_experiment.base_dataset, acd_experiment.salford_adapter, acd_experiment.models, acd_experiment.sci, acd_experiment.systematic_comparison

from salford_datasets.salford import SalfordData, SalfordFeatures, SalfordPrettyPrint, SalfordCombinations
from acd_experiment.salford_adapter import SalfordAdapter

In [2]:
class Notebook:
    """Notebook-wide settings: where data lives on disk and whether to rebuild it."""

    # Directory handed to the Salford dataset loader.
    DATA_DIR = Path('data', 'Salford')
    # Directory holding pickled intermediate results produced by this notebook.
    CACHE_DIR = Path('data', 'cache')
    # Directory containing the per-estimator Optuna SQLite study databases.
    SYSTEMATIC_COMPARISON_DIR = Path('data', 'systematic_comparison')
    # Switch consulted by the loading / training cells to force re-derivation.
    RE_DERIVE = False

In [4]:
from transformer_experiment.utils.shallow_classifiers import load_salford_dataset, get_train_test_indexes

# Load the Salford dataset, forwarding the notebook's RE_DERIVE switch and data directory.
SAL = load_salford_dataset(Notebook.RE_DERIVE, Notebook.DATA_DIR)

# Train/test partition used by every later cell. The last value is applied as an
# indexer over the test-set predictions further down.
# NOTE(review): exact semantics live in shallow_classifiers.get_train_test_indexes - confirm there.
(
    SAL_TRAIN_IDX,
    SAL_TEST_IDX,
    SAL_TEST_UNSEEN_IDX,
    SAL_TEST_IS_UNSEEN,
) = get_train_test_indexes(SAL)

2023-04-08 00:07:07,937 [INFO] Loading processed dataset


In [5]:
from acd_experiment.models import Estimator_L1Regression, Estimator_LinearSVM, Estimator_LightGBM, Estimator_L2Regression, Estimator_ElasticNetRegression, Estimator_XGBoost

# Feature-group name used in this notebook -> feature-group suffix used in the
# names of the stored Optuna studies (see the study_name built in run_pretuned).
# NOTE(review): the two naming schemes are offset from each other (e.g.
# 'with_composites' maps to 'with_labs'); presumably the studies were created
# under an older naming scheme - confirm against the study databases.
FEATURE_GROUP_CORRESPONDENCE = dict(
    news='news',
    with_phenotype='news_with_phenotype',
    with_composites='with_labs',
    with_labs='with_notes_and_labs',
    with_services='with_notes_labs_and_hospital',
)

# Display labels for the results tables. Insertion order is the order in which
# feature groups are presented; 'Reference' labels the NEWS2 baseline row.
PRETTY_PRINT_FEATURE_GROUPS = dict(
    Reference='Reference',
    news='Vitals',
    with_phenotype='& Obs',
    with_composites='& Scores',
    with_labs='& Labs',
    with_services='& Services',
)

# Estimator classes keyed by their own `_name` attribute, in the order they are trained.
ESTIMATORS = {
    estimator_cls._name: estimator_cls
    for estimator_cls in (
        Estimator_LightGBM,
        Estimator_XGBoost,
        Estimator_LinearSVM,
        Estimator_L1Regression,
        Estimator_L2Regression,
        Estimator_ElasticNetRegression,
    )
}

# Every (estimator name, feature group name) pair; the feature group varies fastest.
STUDY_GRID = [
    (estimator_name, feature_group_name)
    for estimator_name in ESTIMATORS
    for feature_group_name in FEATURE_GROUP_CORRESPONDENCE
]

In [249]:
from acd_experiment.salford_adapter import SalfordAdapter
from sklearn.calibration import CalibratedClassifierCV
import optuna
from acd_experiment.systematic_comparison import get_xy, PipelineFactory

def run_pretuned(sal, estimator_name, feature_group_name, cv_jobs=4, explain_models=('LightGBM',)):
    """Fit one pre-tuned, isotonic-calibrated estimator and score it on the test split.

    Hyperparameters are the ``best_params`` of the Optuna study named
    ``{estimator_name}_None_Within-1_{study feature group}``, read from
    ``{Notebook.SYSTEMATIC_COMPARISON_DIR}/{estimator_name}.db``.

    Besides its arguments the function reads these notebook globals:
    ``ESTIMATORS``, ``FEATURE_GROUP_CORRESPONDENCE``, ``SAL_TRAIN_IDX``,
    ``SAL_TEST_IDX``, ``SAL_TEST_IS_UNSEEN`` and ``SAL_TEST_IS_PRECOVID``.

    Parameters
    ----------
    sal
        Dataset wrapped in ``SalfordAdapter`` to build the feature matrix and target.
    estimator_name
        Key into ``ESTIMATORS``; also selects the study database file.
    feature_group_name
        Key into ``SalfordCombinations`` and ``FEATURE_GROUP_CORRESPONDENCE``.
    cv_jobs
        Used both as the number of calibration folds (``cv``) and as ``n_jobs``.
    explain_models
        Estimator names for which ``explain_calibrated`` is run; any container
        supporting ``in`` works.

    Returns
    -------
    tuple
        ``(y_pred_proba, y_pred_proba_unseen, y_pred_proba_precovid, explanations)``
        where ``explanations`` is ``None`` unless ``estimator_name`` is in
        ``explain_models``.
    """
    study = optuna.load_study(
        study_name=f'{estimator_name}_None_Within-1_{FEATURE_GROUP_CORRESPONDENCE[feature_group_name]}',
        storage=f'sqlite:///{Notebook.SYSTEMATIC_COMPARISON_DIR}/{estimator_name}.db',
    )
    params = study.best_params

    estimator = ESTIMATORS[estimator_name]
    requirements = estimator._requirements

    # Feature construction is noisy; silence warnings for this step only.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        X, y = SalfordAdapter(sal).xy(
            x=SalfordCombinations[feature_group_name],
            imputation=requirements['imputation'],
            fillna=requirements['fillna'],
            ordinal_encoding=requirements['ordinal'],
            onehot_encoding=requirements['onehot'],
        )
    X_train, y_train = SalfordAdapter(X.loc[SAL_TRAIN_IDX]), y.loc[SAL_TRAIN_IDX].values
    # Sliced once and reused for both prediction and explanation.
    X_test = X.loc[SAL_TEST_IDX]

    pipeline_factory = PipelineFactory(
        estimator=estimator, resampler=None, X_train=X_train, y_train=y_train,
    )

    model = CalibratedClassifierCV(
        pipeline_factory(**params), cv=cv_jobs, method="isotonic", n_jobs=cv_jobs,
    ).fit(X_train, y_train)

    # Column 1 of predict_proba is taken as the positive-class probability.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred_proba_unseen = y_pred_proba[SAL_TEST_IS_UNSEEN]
    # NOTE(review): SAL_TEST_IS_PRECOVID is not assigned in any visible cell; on a
    # fresh kernel this line raises NameError unless it is defined elsewhere - confirm.
    y_pred_proba_precovid = y_pred_proba[SAL_TEST_IS_PRECOVID]

    explanations = None
    if estimator_name in explain_models:
        explanations = estimator.explain_calibrated(
            model, X_train, SalfordAdapter(X_test), cv_jobs=cv_jobs
        )

    return y_pred_proba, y_pred_proba_unseen, y_pred_proba_precovid, explanations

# Results for every (estimator, feature group) pair are checkpointed to disk after
# each model, because the full grid takes hours to train.
RESULTS_CACHE = Notebook.CACHE_DIR / 'shallow_results_2.bin'

RESULTS = {}
if not Notebook.RE_DERIVE and RESULTS_CACHE.exists():
    # pickle.load executes arbitrary code: only ever point this at a cache file
    # written by this notebook.
    with open(RESULTS_CACHE, 'rb') as file:
        RESULTS = pickle.load(file)

# Only train what the cache does not already hold, so an interrupted run resumes
# where it stopped and a complete cache is simply loaded.
PENDING_STUDIES = [study_key for study_key in STUDY_GRID if study_key not in RESULTS]

if PENDING_STUDIES:
    Notebook.CACHE_DIR.mkdir(parents=True, exist_ok=True)
    # Write to a sibling file and rename, so a crash mid-write cannot corrupt the checkpoint.
    checkpoint_tmp = RESULTS_CACHE.with_name(RESULTS_CACHE.name + '.tmp')

    for estimator_name, feature_group_name in (pbar := tqdm(PENDING_STUDIES)):
        pbar.set_description(f'Training {estimator_name} on {feature_group_name}')
        RESULTS[(estimator_name, feature_group_name)] = run_pretuned(SAL, estimator_name, feature_group_name)

        with open(checkpoint_tmp, 'wb') as file:
            pickle.dump(RESULTS, file)
        checkpoint_tmp.replace(RESULTS_CACHE)

Training ElasticNetRegression on with_labs:  93%|█████████▎| 28/30 [2:45:22<08:15, 247.94s/it]      The max_iter was reached which means the coef_ did not converge
Training ElasticNetRegression on with_services: 100%|██████████| 30/30 [3:23:39<00:00, 407.33s/it]


## Metrics

In [286]:
from transformer_experiment.utils.shallow_classifiers import get_metrics

def get_full_metrics_tables(results):
    """Build one metrics table per test subset ('Complete' and 'Unseen').

    ``results`` maps ``(estimator_name, feature_group_name)`` to a sequence of
    predictions. Its entries are paired positionally with the subsets, so only
    the first two entries (complete test set, then unseen subset) are scored
    and anything after them is ignored.

    Each table also receives a final 'NEWS2' / 'Reference' row computed from
    ``SAL.NEWS_Score_Admission`` with ``y_pred_threshold=7``.

    Reads the notebook globals ``SAL``, ``SAL_TEST_IDX`` and ``SAL_TEST_UNSEEN_IDX``.

    Returns a dict of subset name -> ``pd.DataFrame`` with one row per model.
    """
    outcome = SAL.CriticalEvent
    y_trues = {
        'Complete': outcome.loc[SAL_TEST_IDX],
        'Unseen': outcome.loc[SAL_TEST_UNSEEN_IDX],
    }
    rows = {subset: [] for subset in y_trues}

    # Model rows: zip stops at the shorter input, i.e. after the two subsets.
    for (estimator_name, feature_group_name), y_preds in results.items():
        for y_pred_proba, subset in zip(y_preds, y_trues):
            scores = get_metrics(y_trues[subset], y_pred_proba)
            rows[subset].append(
                {'Estimator': estimator_name, 'Features': feature_group_name, **scores}
            )

    # Baseline rows: the admission NEWS score of the same patients.
    for subset, y_true in y_trues.items():
        news_scores = SAL.NEWS_Score_Admission.loc[y_true.index]
        reference = get_metrics(y_true, news_scores, y_pred_threshold=7)
        rows[subset].append({'Estimator': 'NEWS2', 'Features': 'Reference', **reference})

    return {subset: pd.DataFrame(subset_rows) for subset, subset_rows in rows.items()}

# Score every model in RESULTS (plus the NEWS2 reference row) on the 'Complete' and 'Unseen' test subsets.
METRICS = get_full_metrics_tables(RESULTS)

Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zer

In [292]:
def summary_metrics_select_estimators(metrics, estimators=('LightGBM', 'L2Regression')):
    """Tabulate AUC and AP (3 decimals, as strings) for a subset of estimators.

    Parameters
    ----------
    metrics
        Dict of dataset name -> DataFrame with at least the columns
        'Estimator', 'Features', 'AUC' and 'AP'.
    estimators
        Names of the estimators to keep (any iterable of strings).

    Returns
    -------
    pd.DataFrame
        Rows indexed by (Metric, Estimator, Dataset); one column per feature
        group, labelled and ordered as in ``PRETTY_PRINT_FEATURE_GROUPS``.
        Feature groups with no row among the selected estimators (e.g.
        'Reference' when NEWS2 is not selected) are left out instead of
        raising ``KeyError``.
    """
    records = [
        dict(
            Metric=metric,
            Estimator=row['Estimator'],
            Features=PRETTY_PRINT_FEATURE_GROUPS[row['Features']],
            Value=f"{row[metric]:.3f}",
            Dataset=dataset,
        )
        for dataset, table in metrics.items()
        for row in table[table.Estimator.isin(list(estimators))].to_dict(orient='records')
        for metric in ('AUC', 'AP')
    ]

    summary = (
        pd.DataFrame(records)
        .sort_values(['Metric', 'Estimator', 'Dataset'])
        .set_index(['Metric', 'Estimator', 'Dataset', 'Features'])
        .unstack()['Value']
    )

    # Keep the presentation order, but only for feature groups that actually occur.
    present_in_order = [
        label for label in PRETTY_PRINT_FEATURE_GROUPS.values() if label in summary.columns
    ]
    return summary[present_in_order]

# Call the function defined in this cell: `summary_metrics_unseen_comparison` is not
# defined in the notebook and only resolved through stale kernel state.
summary_metrics_select_estimators(METRICS)

Unnamed: 0_level_0,Unnamed: 1_level_0,Features,Vitals,& Obs,& Scores,& Labs,& Services
Metric,Estimator,Dataset,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AP,L2Regression,Complete,0.103,0.163,0.175,0.209,0.603
AP,L2Regression,Unseen,0.115,0.183,0.197,0.232,0.645
AP,LightGBM,Complete,0.155,0.247,0.294,0.338,0.667
AP,LightGBM,Unseen,0.176,0.272,0.322,0.374,0.707
AUC,L2Regression,Complete,0.645,0.756,0.795,0.816,0.915
AUC,L2Regression,Unseen,0.643,0.746,0.793,0.818,0.92
AUC,LightGBM,Complete,0.749,0.824,0.861,0.886,0.943
AUC,LightGBM,Unseen,0.757,0.826,0.867,0.891,0.947


In [361]:
def detailed_metrics_all_estimators(metrics):
    """Tabulate AUC and AP with their lower/upper bounds for every estimator.

    ``metrics`` maps a dataset name to a DataFrame holding, per row, 'Estimator',
    'Features' and for each of AUC / AP the columns ``<metric>``,
    ``<metric>_Lower`` and ``<metric>_Upper``.

    The result has rows indexed by (Features, Estimator), with feature groups
    labelled and ordered as in ``PRETTY_PRINT_FEATURE_GROUPS``, and columns
    indexed by (Metric, Dataset) with AUC before AP. Every cell is the string
    ``"value (lower-upper)"`` at four decimals.
    """
    def summarise(row, metric):
        # Point estimate followed by its bounds, e.g. "0.7493 (0.7375-0.7593)".
        lower, upper = row[metric + "_Lower"], row[metric + "_Upper"]
        return f'{row[metric]:.4f} ({lower:.4f}-{upper:.4f})'

    records = []
    for dataset, table in metrics.items():
        for row in table.to_dict(orient='records'):
            for metric in ('AUC', 'AP'):
                records.append({
                    'Dataset': dataset,
                    'Metric': metric,
                    'Estimator': row['Estimator'],
                    'Features': PRETTY_PRINT_FEATURE_GROUPS[row['Features']],
                    'Summary': summarise(row, metric),
                })

    wide = pd.DataFrame(records).pivot(
        index=['Features', 'Estimator'], columns=['Metric', 'Dataset'], values='Summary'
    )

    feature_order = list(PRETTY_PRINT_FEATURE_GROUPS.values())
    return wide.loc[feature_order][['AUC', 'AP']]

# Display AUC / AP with their lower-upper bounds for every estimator and feature group.
detailed_metrics_all_estimators(METRICS)

Unnamed: 0_level_0,Metric,AUC,AUC,AP,AP
Unnamed: 0_level_1,Dataset,Complete,Unseen,Complete,Unseen
Features,Estimator,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Reference,NEWS2,0.7395 (0.7278-0.7505),0.7396 (0.7295-0.7544),0.1325 (0.1232-0.1440),0.1470 (0.1381-0.1621)
Vitals,ElasticNetRegression,0.6441 (0.6332-0.6559),0.6421 (0.6298-0.6600),0.1028 (0.0941-0.1139),0.1148 (0.1053-0.1278)
Vitals,L1Regression,0.6433 (0.6325-0.6555),0.6412 (0.6294-0.6587),0.1031 (0.0945-0.1143),0.1150 (0.1055-0.1278)
Vitals,L2Regression,0.6446 (0.6338-0.6564),0.6428 (0.6307-0.6609),0.1032 (0.0945-0.1143),0.1152 (0.1052-0.1275)
Vitals,LightGBM,0.7493 (0.7375-0.7593),0.7568 (0.7472-0.7673),0.1550 (0.1438-0.1669),0.1756 (0.1645-0.1931)
Vitals,LinearSVM,0.6619 (0.6508-0.6730),0.6626 (0.6518-0.6784),0.1135 (0.1034-0.1252),0.1284 (0.1170-0.1415)
Vitals,XGBoost,0.7493 (0.7368-0.7592),0.7576 (0.7470-0.7693),0.1549 (0.1434-0.1670),0.1781 (0.1655-0.1960)
& Obs,ElasticNetRegression,0.7550 (0.7443-0.7666),0.7452 (0.7353-0.7592),0.1607 (0.1499-0.1702),0.1812 (0.1673-0.2010)
& Obs,L1Regression,0.7543 (0.7436-0.7660),0.7436 (0.7328-0.7579),0.1600 (0.1498-0.1699),0.1798 (0.1666-0.1991)
& Obs,L2Regression,0.7556 (0.7449-0.7673),0.7460 (0.7363-0.7601),0.1631 (0.1525-0.1723),0.1834 (0.1690-0.2032)
