In [12]:
import pickle 
import pandas as pd
import xgboost as xgb
import random
import numpy as np
import os, sys

from sklearn.impute import SimpleImputer
from sklearn import preprocessing, model_selection
from sklearn import metrics

module_path = os.path.abspath(os.path.join('CATENets/'))
if module_path not in sys.path:
    sys.path.append(module_path)

import catenets.models.torch.pseudo_outcome_nets as cate_models_masks


def normalize_data(X_train):
    
    X_normalized_train = (X_train - np.min(X_train, axis=0)) / (np.max(X_train, axis=0) - np.min(X_train, axis=0))

    return X_normalized_train

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
fluid_cohort = pd.read_pickle("data/trauma_responder.pkl")

#
fluid_cohort = fluid_cohort[fluid_cohort.columns.drop(list(fluid_cohort.filter(regex='proc')))]
fluid_cohort = fluid_cohort[fluid_cohort.columns.drop(list(fluid_cohort.filter(regex='ethnicity')))]
fluid_cohort = fluid_cohort[fluid_cohort.columns.drop(list(fluid_cohort.filter(regex='residencestate')))]
fluid_cohort = fluid_cohort[fluid_cohort.columns.drop(list(fluid_cohort.filter(regex='toxicologyresults')))]


x = fluid_cohort.loc[:, ~fluid_cohort.columns.isin(["registryid",
                                                            "COV",
                                                            "TT",
                                                            "scenegcsmotor",
                                                            "scenegcseye",
                                                            "scenegcsverbal",
                                                            "edgcsmotor",
                                                            "edgcseye",
                                                            "edgcsverbal",
                                                            "outcome",
                                                            "sex_F",
                                                            "traumatype_P",
                                                            "traumatype_other"
                                                            ])]

# x = fluid_cohort.loc[:, ~fluid_cohort.columns.isin(["registryid",
#                                                             "COV",
#                                                             "TT", 
#                                                             "scenegcsmotor",
#                                                             "scenegcseye",
#                                                             "scenegcsverbal",
#                                                             "edgcsmotor",
#                                                             "edgcseye",
#                                                             "edgcsverbal",
#                                                             "outcome"])]

### normalize x_train 
#x = x_train.values 

n, feature_size = x.shape
names = x.drop(["treated"], axis=1).columns
treatment_index = x.columns.get_loc("treated")
sex_index = x.columns.get_loc("sex_M")

var_index = [i for i in range(feature_size) if i != treatment_index]

x_norm = normalize_data(x)

## impute missing value

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(x_norm)
x_train_scaled = imp.transform(x_norm)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
                                             x_train_scaled,  
                                             fluid_cohort["outcome"], 
                                             test_size=0.2, 
                                             random_state=10,
                                             stratify=fluid_cohort["treated"]
                                    )

w_train = X_train[:, treatment_index]
w_test =  X_test[:, treatment_index]
X_train = X_train[:,var_index]
X_test = X_test[:, var_index]

In [34]:
results_train = pickle.load(open("results/responder/result_train_responder_xlearner.pkl", "rb"))
results_test = pickle.load(open("results/responder/result_test_responder_xlearner.pkl", "rb"))

results_train = results_train.reshape(5, -1)
results_test = results_test.reshape(5, -1)

print(np.mean(results_train), np.std(results_train)/np.sqrt(results_train.shape[1]))
print(np.mean(results_test), np.std(results_test)/np.sqrt(results_test.shape[1]))

estimated_ate_train = np.mean(results_train, axis=0)
estimated_ate_test = np.mean(results_test, axis=0)

threshold = np.mean(estimated_ate_train)

gender = x.columns.get_loc("sex_M")
cause_MV = x.columns.get_loc("causecode_MV")
cause_GUN = x.columns.get_loc("causecode_GUN")
scenegcs = x.columns.get_loc("scenegcs")
trauma = x.columns.get_loc("traumatype_B")
scenefirstbloodpressure = x.columns.get_loc("scenefirstbloodpressure")


lac  = x.columns.get_loc("LAC")
inr  = x.columns.get_loc("INR")
hgb  = x.columns.get_loc("HGB")
ph = x.columns.get_loc("PH")
fio2 = x.columns.get_loc("FIO2")
na = x.columns.get_loc("NA")
temp = x.columns.get_loc("temps2")
bd = x.columns.get_loc("BD")
pulse = x.columns.get_loc("edfirstpulse")



y_true_train = (estimated_ate_train > threshold)
y_true_test = (estimated_ate_test > threshold)

top_2_train = X_train[:, [trauma ,cause_MV, scenegcs, cause_GUN ]]
top_2_test = X_test[:, [trauma ,cause_MV, scenegcs, cause_GUN]]

xgb_model = xgb.XGBClassifier(objective="binary:logistic")
xgb_model.fit(top_2_train, y_true_train)

y_pred = xgb_model.predict(top_2_test)
y_pred_prob = xgb_model.predict_proba(top_2_test)


ips_score = np.reciprocal(y_pred_prob[y_pred==1][:,1])

IPS_ate = np.sum(estimated_ate_test[y_pred == 1]*ips_score)/n
ate = np.sum(estimated_ate_test[y_pred == 1])/n

auroc = metrics.roc_auc_score(y_true_test, y_pred)

print("original", np.sum(estimated_ate_test[w_test==1])/n)

print("shap",auroc)
print("shap - ate:", ate, IPS_ate)

top_2_train = X_train[:, [temp, ph, bd, hgb, pulse ]]
top_2_test = X_test[:, [temp, ph, bd, hgb, pulse ]]

xgb_model = xgb.XGBClassifier(objective="binary:logistic")
xgb_model.fit(top_2_train, y_true_train)

y_pred = xgb_model.predict(top_2_test)
y_pred_prob = xgb_model.predict_proba(top_2_test)


ips_score = np.reciprocal(y_pred_prob[y_pred==1][:,1])

IPS_ate = np.sum(estimated_ate_test[y_pred == 1]*ips_score)/n
ate = np.sum(estimated_ate_test[y_pred == 1])/n

auroc = metrics.roc_auc_score(y_true_test, y_pred)

print("shap- 0",auroc)
print("shap-0 - ate", ate, IPS_ate)

top_2_train = X_train[:, [ph, na, temp, gender, fio2 ]]
top_2_test = X_test[:, [ph, na, temp, gender, fio2 ]]

xgb_model = xgb.XGBClassifier(objective="binary:logistic")
xgb_model.fit(top_2_train, y_true_train)

y_pred = xgb_model.predict(top_2_test)
y_pred_prob = xgb_model.predict_proba(top_2_test)


ips_score = np.reciprocal(y_pred_prob[y_pred==1][:,1])

IPS_ate = np.sum(estimated_ate_test[y_pred == 1]*ips_score)/n

auroc = metrics.roc_auc_score(y_true_test, y_pred)
ate = np.sum(estimated_ate_test[y_pred == 1])/n

print("ig",auroc)
print("ig - ate", ate, IPS_ate)

top_2_train = X_train[:, [lac, inr, hgb,scenefirstbloodpressure ]]
top_2_test = X_test[:, [lac, inr,hgb, scenefirstbloodpressure ]]

xgb_model = xgb.XGBClassifier(objective="binary:logistic")
xgb_model.fit(top_2_train, y_true_train)

y_pred = xgb_model.predict(top_2_test)
y_pred_prob = xgb_model.predict_proba(top_2_test)


ips_score = np.reciprocal(y_pred_prob[y_pred==1][:,1])

IPS_ate = np.sum(estimated_ate_test[y_pred == 1]*ips_score)/n
auroc = metrics.roc_auc_score(y_true_test, y_pred)
ate = np.sum(estimated_ate_test[y_pred == 1])/n

print("clnical", auroc)
print("clinical - ate", ate, IPS_ate)

xgb_model = xgb.XGBClassifier(objective="binary:logistic")
xgb_model.fit(X_train, y_true_train)

y_pred = xgb_model.predict(X_test)
y_pred_prob = xgb_model.predict_proba(X_test)


ips_score = np.reciprocal(y_pred_prob[y_pred==1][:,1])

IPS_ate = np.sum(estimated_ate_test[y_pred == 1]*ips_score)/n
ate = np.sum(estimated_ate_test[y_pred == 1])/n
auroc = metrics.roc_auc_score(y_true_test, y_pred)

print("full feature", auroc)
print("full feature - ate", ate, IPS_ate)


random_feature = np.random.randint(X_train.shape[1], size=(4))

top_2_train = X_train[:, random_feature]
top_2_test = X_test[:, random_feature]

xgb_model = xgb.XGBClassifier(objective="binary:logistic")
xgb_model.fit(top_2_train, y_true_train)

y_pred = xgb_model.predict(top_2_test)
y_pred_prob = xgb_model.predict_proba(top_2_test)

ips_score = np.reciprocal(y_pred_prob[y_pred==1][:,1])

IPS_ate = np.sum(estimated_ate_test[y_pred == 1]*ips_score)/n
ate = np.sum(estimated_ate_test[y_pred == 1])/n

auroc = metrics.roc_auc_score(y_true_test, y_pred)

print("random feature", auroc)
print("random feature - ate", ate, IPS_ate)

0.014403417111102866 0.01822509105776346
0.019069704216733378 0.03517305039331138
original 0.003058094507370424
shap 0.7145713788906277
shap - ate: 0.012433164428836112 0.01857265414739243
shap- 0 0.547209181011998
shap-0 - ate 0.0025370662854015042 0.0031831639773518714
ig 0.5304294905233872
ig - ate 0.0045667647204959425 0.006398161473951002
clnical 0.5532950791166754
clinical - ate 0.0027846100569576303 0.0027504182285514166
full feature 0.6969222743870631
full feature - ate 0.011469007882735005 0.013067139437500694
random feature 0.5305164319248825
random feature - ate 0.00393486515912123 0.004632371067859249


In [23]:
## IPS 

top_2_train = X_train[:, [trauma ,cause_MV, scenegcs, cause_GUN ]]
top_2_test = X_test[:, [trauma ,cause_MV, scenegcs, cause_GUN]]

xgb_model = xgb.XGBClassifier(objective="binary:logistic")
xgb_model.fit(top_2_train, y_true_train)

y_pred = xgb_model.predict(top_2_test)
y_pred_prob = xgb_model.predict_proba(top_2_test)

IPS_ate = np.sum(estimated_ate_test[y_pred == 1]*y_pred_prob[y_pred==1][:,1])/n
ate = np.sum(estimated_ate_test[y_pred == 1])/n

In [24]:
ate

0.012433164428836112

In [25]:
IPS_ate

0.008428497227079997