In [1]:
import pandas as pd
import xgboost as xgb
import random
import numpy as np
import os, sys
import matplotlib.pyplot as plt
import pickle as pkl

from typing import List
from sklearn.impute import SimpleImputer
from sklearn import preprocessing, model_selection
from sklearn import metrics

module_path = os.path.abspath(os.path.join('../CATENets/'))
if module_path not in sys.path:
    sys.path.append(module_path)


import catenets.models.torch.pseudo_outcome_nets as pseudo_outcome_nets

  from .autonotebook import tqdm as notebook_tqdm


In [5]:

def normalize_data(x_train):
    
    x_normalized_train = (x_train - np.min(x_train, axis=0)) / (np.max(x_train, axis=0) - np.min(x_train, axis=0))

    return x_normalized_train


def subgroup_ate(
    method: str,
    features: List[int],
    y_true_train: np.ndarray,
    y_true_test: np.ndarray,
    estimated_ate_test: np.ndarray,
    iss_test: np.ndarray
) -> None:
    
    xgb_model = xgb.XGBClassifier(  
        max_depth=3,
        reg_lambda=2,
        # min_split_loss=2
    )

    xgb_model.fit(x_train[:, features], y_true_train)

    y_pred = xgb_model.predict(x_test[:, features])
    y_pred_train = xgb_model.predict(x_train[:, features])
    
    ate = np.sum(estimated_ate_test[y_pred == 1])/len(estimated_ate_test)
    auroc = metrics.roc_auc_score(y_true_test, y_pred)
        
    print("===================")
    print("%s - auroc %s"%(method, auroc))
    print("%s - ATE %s"%(method, ate))

def feature_idx(
    method: str,
    cohort: str,
    learner: str
)-> List[int]:
    
    if method == "shap":
        file_path = f"../results/{cohort}/naive_shap_top_5_features_{learner}.csv"
    elif method == "ig":
        file_path = f"../results/{cohort}/integrated_gradients_top_5_features_{learner}.csv"
    elif method == "shap - 0 ":
        file_path = f"../results/{cohort}/shapley_value_sampling_top_5_features_{learner}.csv"
        
    df = pd.read_csv(file_path,keep_default_na=False)
    
    df_sorted = df.sort_values(
        by='count (%)', 
        ascending=False
    )
    print(df_sorted["feature"].head(5).tolist())
    
    indices  = [ x.columns.get_loc(col) for col in df_sorted["feature"].head(5) ]
    
    for i in indices:
        if i > treatment_index:
            i -= 1
    return indices

In [3]:
fluid_cohort = pd.read_pickle("../data/trauma_responder.pkl")


all_year = pd.read_csv("../data/all_year.csv", index_col=0)

fluid_cohort = pd.merge(fluid_cohort,all_year[['registryid','iss']],on='registryid', how='left')
fluid_cohort["iss"] = pd.to_numeric(fluid_cohort["iss"], errors='coerce')

#
fluid_cohort = fluid_cohort[fluid_cohort.columns.drop(list(fluid_cohort.filter(regex='proc')))]
fluid_cohort = fluid_cohort[fluid_cohort.columns.drop(list(fluid_cohort.filter(regex='ethnicity')))]
fluid_cohort = fluid_cohort[fluid_cohort.columns.drop(list(fluid_cohort.filter(regex='residencestate')))]
fluid_cohort = fluid_cohort[fluid_cohort.columns.drop(list(fluid_cohort.filter(regex='toxicologyresults')))]


x = fluid_cohort.loc[:, ~fluid_cohort.columns.isin(["registryid",
                                                            "COV",
                                                            "TT",
                                                            "scenegcsmotor",
                                                            "scenegcseye",
                                                            "scenegcsverbal",
                                                            "edgcsmotor",
                                                            "edgcseye",
                                                            "edgcsverbal",
                                                            "outcome",
                                                            "sex_F",
                                                            "traumatype_P",
                                                            "traumatype_other"
                                                            ])]

n, feature_size = x.shape
names = x.drop(["treated"], axis=1).columns
treatment_index = x.columns.get_loc("treated")
iss_index = x.columns.get_loc("iss")

var_index = [i for i in range(feature_size) if i != treatment_index and i != iss_index]

x_norm = normalize_data(x)

## impute missing value

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(x_norm)
x_train_scaled = imp.transform(x_norm)

x_train, x_test, y_train, y_test = model_selection.train_test_split(
                                             x_train_scaled,  
                                             fluid_cohort["outcome"], 
                                             test_size=0.2, 
                                             random_state=42,
                                             stratify=fluid_cohort["treated"]
                                    )

x_train, x_val, y_train, y_val = model_selection.train_test_split(
                                             x_train,  
                                             y_train, 
                                             test_size=0.2, 
                                             random_state=42,
                                             stratify=x_train[:, treatment_index]
                                    )

w_train = x_train[:, treatment_index]
w_val = x_val[:, treatment_index]
w_test =  x_test[:, treatment_index]


iss_train = x_train[:, iss_index]
iss_test =  x_test[:, iss_index]

x_train = x_train[:,var_index]
x_val = x_val[:, var_index]
x_test = x_test[:, var_index]

  all_year = pd.read_csv("../data/all_year.csv", index_col=0)


In [6]:
results_train = pkl.load(open("../results/responder/train_xlearner.pkl", "rb"))
results_test = pkl.load(open("../results/responder/test_xlearner.pkl", "rb"))

print(np.mean(results_train), np.std(results_train)/np.sqrt(results_train.shape[1]))
print(np.mean(results_test), np.std(results_test)/np.sqrt(results_test.shape[1]))

estimated_ate_train = np.mean(results_train, axis=0)
estimated_ate_test = np.mean(results_test, axis=0)
threshold = np.mean(estimated_ate_train)

y_true_train = (estimated_ate_train > threshold)
y_true_test = (estimated_ate_test > threshold)

scenefirstbloodpressure = x.columns.get_loc("scenefirstbloodpressure")
lac  = x.columns.get_loc("LAC")
inr  = x.columns.get_loc("INR")
hgb  = x.columns.get_loc("HGB")

explainers = {
    
    "shap": feature_idx(
        "shap",   
        "responder",
        "xlearner"
        ),
    "shap - 0 ": feature_idx(
        "shap - 0 ",
        "responder",
        "xlearner"
    ), #[temp, ph, bd, hgb, pulse ]
    "ig": feature_idx(
        "ig",
        "responder" ,
        "xlearner"
    ), #[ph, na, temp, gender, fio2 ],
    
    "clinical": [lac, inr, hgb,scenefirstbloodpressure ],
    "full features": [ i for i in range(x_train.shape[1])],
    "random features": np.random.randint(x_train.shape[1], size=(5)),
}


print("mean ISS: ", np.mean(iss_test)*74+1)
print("original", np.sum(estimated_ate_test[w_test==1])/n)
print("original - iss", np.mean(iss_test[w_test==1])*74+1, np.mean(iss_test[w_test==0])*74+1)
print("===================================")

for explainer, features in explainers.items():
    subgroup_ate(
        explainer,
        features,
        y_true_train,
        y_true_test,
        estimated_ate_test,
        iss_test
    )

0.12698728051396643 0.019538631050279247
0.14159337346752995 0.034385373991550984
['sex_M', 'edgcs', 'traumatype_B', 'scenegcs', 'causecode_CUT']
['temps2', 'HGB', 'sex_M', 'PH', 'FIO2']
['temps2', 'scenefirstpulse', 'age', 'HGB', 'sex_M']
mean ISS:  29.377541074909495
original 0.027823786372953855
original - iss 29.918212981593268 21.7
shap - auroc 0.6911122751877073
shap - ATE 0.11168783061171111
shap - 0  - auroc 0.6092194866422211
shap - 0  - ATE 0.0845466122616352
ig - auroc 0.5537803387462895
ig - ATE 0.06903634075141254
clinical - auroc 0.535533438100227
clinical - ATE 0.08969280714181398
full features - auroc 0.7322332809498865
full features - ATE 0.11218239073450124
random features - auroc 0.607735288982015
random features - ATE 0.11644143819844192


In [7]:
results_train = pkl.load(open("../results/responder/train_xlearner.pkl", "rb"))
results_test = pkl.load(open("../results/responder/test_xlearner.pkl", "rb"))

print(np.mean(results_train), np.std(results_train)/np.sqrt(results_train.shape[1]))
print(np.mean(results_test), np.std(results_test)/np.sqrt(results_test.shape[1]))

estimated_ate_train = np.mean(results_train, axis=0)
estimated_ate_test = np.mean(results_test, axis=0)
threshold = np.mean(estimated_ate_train)

y_true_train = (estimated_ate_train > threshold)
y_true_test = (estimated_ate_test > threshold)

scenefirstbloodpressure = x.columns.get_loc("scenefirstbloodpressure")
lac  = x.columns.get_loc("LAC")
inr  = x.columns.get_loc("INR")
hgb  = x.columns.get_loc("HGB")

explainers = {
    
    "shap": feature_idx(
        "shap",   
        "responder",
        "ensemble"
        ),
    "shap - 0 ": feature_idx(
        "shap - 0 ",
        "responder",
        "ensemble"
    ), #[temp, ph, bd, hgb, pulse ]
    "ig": feature_idx(
        "ig",
        "responder" ,
        "ensemble"
    ), #[ph, na, temp, gender, fio2 ],
    
    "clinical": [lac, inr, hgb,scenefirstbloodpressure ],
    "full features": [ i for i in range(x_train.shape[1])],
    "random features": np.random.randint(x_train.shape[1], size=(5)),
}


print("mean ISS: ", np.mean(iss_test)*74+1)
print("original", np.sum(estimated_ate_test[w_test==1])/n)
print("original - iss", np.mean(iss_test[w_test==1])*74+1, np.mean(iss_test[w_test==0])*74+1)
print("===================================")

for explainer, features in explainers.items():
    subgroup_ate(
        explainer,
        features,
        y_true_train,
        y_true_test,
        estimated_ate_test,
        iss_test
    )

0.12698728051396643 0.019538631050279247
0.14159337346752995 0.034385373991550984
['age', 'sex_M', 'causecode_MC', 'causecode_CUT', 'traumatype_B']
['age', 'traumatype_B', 'FIB', 'PH', 'sex_M']
['traumatype_B', 'HGB', 'sex_M', 'temps2', 'HCT']
mean ISS:  29.377541074909495
original 0.027823786372953855
original - iss 29.918212981593268 21.7
shap - auroc 0.6937314475292474
shap - ATE 0.08998480277728767
shap - 0  - auroc 0.6284267504801816
shap - 0  - ATE 0.09135533581146246
ig - auroc 0.6079099004714511
ig - ATE 0.08970955345675495
clinical - auroc 0.535533438100227
clinical - ATE 0.08969280714181398
full features - auroc 0.7322332809498865
full features - ATE 0.11218239073450124
random features - auroc 0.570543041732146
random features - ATE 0.08892237886895217
