In [7]:
import pandas as pd
import numpy as np
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Load the processed admissions table; the path is relative to the notebook's
# working directory.
df = pd.read_csv(
    "./data/admissions_processed_morphine_sulfate.csv",
)

In [3]:
def df_to_X(df):
    """Build the covariate matrix from the admissions frame.

    Output columns, in order:
      * ``age`` and ``HOSPITAL_EXPIRE_FLAG`` (copied from ``df``),
      * one ``DIAGNOSIS:<value>`` indicator per distinct diagnosis,
      * ``hosp_duration`` -- whole days between ``ADMITTIME`` and ``DISCHTIME``,
      * one ``INSURANCE:<value>`` indicator per distinct insurance value.

    ``age`` and ``hosp_duration`` are standardised (mean 0, sample standard
    deviation 1) because they are the only non-categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``age``, ``HOSPITAL_EXPIRE_FLAG``,
        ``DIAGNOSIS``, ``INSURANCE``, ``ADMITTIME`` and ``DISCHTIME``.

    Returns
    -------
    pandas.DataFrame
        A new frame sharing ``df``'s index; ``df`` itself is not modified.
    """

    def _standardise(col):
        # Vectorised z-score. When the column has no spread (std is 0, or NaN
        # for a single row) dividing would turn every value into NaN/inf, so
        # only centre the column in that case.
        centred = col - col.mean()
        spread = col.std()
        if not spread > 0:
            return centred
        return centred / spread

    # include age and hospital expire flag
    covariates = ['age', 'HOSPITAL_EXPIRE_FLAG']

    # include onehots for diagnosis
    diagnosis = pd.get_dummies(df['DIAGNOSIS'], prefix='DIAGNOSIS', prefix_sep=':')

    # include duration of hospital stay (whole days)
    admitted = df['ADMITTIME'].astype('datetime64[ns]')
    discharged = df['DISCHTIME'].astype('datetime64[ns]')
    hosp_duration = (discharged - admitted).dt.days.rename('hosp_duration')

    # include onehots for insurance
    insur = pd.get_dummies(df['INSURANCE'], prefix='INSURANCE', prefix_sep=':')

    # pd.concat returns a fresh frame, so the assignments below never touch df.
    X = pd.concat([df[covariates], diagnosis, hosp_duration, insur], axis=1)

    # normalize the two non-categorical columns
    X['hosp_duration'] = _standardise(X['hosp_duration'])
    X['age'] = _standardise(X['age'])

    return X

def df_to_T(df, eth):
    """Return the binary treatment indicator for one ethnicity value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``ETHNICITY`` column.
    eth : object
        Value compared against ``ETHNICITY`` with ``==``.

    Returns
    -------
    pandas.Series
        int64 Series on ``df``'s index: 1 where ``ETHNICITY == eth``, else 0
        (missing values compare unequal and therefore map to 0).
    """
    # Vectorised comparison instead of a per-row Python lambda; the explicit
    # int64 cast keeps the dtype stable even for an empty frame.
    return (df['ETHNICITY'] == eth).astype('int64')

def df_to_Y(df):
    """Return the outcome column ``TOTAL_FORM_VAL_DISP_MAX`` of ``df`` unchanged."""
    outcome_column = 'TOTAL_FORM_VAL_DISP_MAX'
    return df[outcome_column]

In [10]:
# Assemble the analysis inputs from the loaded frame: covariates X, binary
# treatment indicator T (1 where ETHNICITY == 'WHITE') and outcome Y.
X, T, Y = df_to_X(df), df_to_T(df, 'WHITE'), df_to_Y(df)

# Quick dimension check: the three objects should have the same row count.
print('X: ', X.shape)
print("T: ", T.shape)
print("Y: ", Y.shape)

X:  (6618, 122)
T:  (6618,)
Y:  (6618,)


In [9]:
# Inverse-propensity-weighted (IPW) comparison: ETHNICITY == 'WHITE' vs. everyone else.
T = df_to_T(df, 'WHITE')

# Propensity model: estimates P(T = 1 | X).
clf = LogisticRegression().fit(X, T)

# Work on positional NumPy arrays so the row positions returned by np.where
# always line up with the outcomes, whatever index labels the frame carries.
t_values = T.to_numpy()
y_values = Y.to_numpy()
treated = np.where(t_values == 1)[0]
no_treated = np.where(t_values == 0)[0]

# Score all rows in one batched call. Columns follow clf.classes_, which is
# sorted, so column 0 is P(T=0 | x) and column 1 is P(T=1 | x).
proba = clf.predict_proba(X)
# `predict` keeps its previous per-row layout (predict[i][0][k]).
predict = list(proba[:, np.newaxis, :])

# (1/N) * sum over treated rows of Y / P(T=1 | x): the IPW mean outcome under
# treatment. NOTE: no clipping is applied to the propensities, so rows with a
# propensity near 0 can dominate the sum.
ATE = np.sum(y_values[treated] / proba[treated, 1])
ans = ATE/len(df)
print("treated ATE with inverse propensity: ", ans)

# (1/N) * sum over untreated rows of Y / P(T=0 | x): the IPW mean outcome
# without treatment.
ATE2 = np.sum(y_values[no_treated] / proba[no_treated, 0])
ans2 = ATE2/len(df)
print("no treated ATE with inverse propensity: ", ans2)

# The difference of the two weighted means is the IPW estimate of the ATE.
print("difference between treated and no treated: ", ans - ans2)



treated ATE with inverse propensity:  1.9243670273957083
no treated ATE with inverse propensity:  1.7621489194068773
difference between treated and no treated:  0.16221810798883096


In [11]:
# Inverse-propensity-weighted (IPW) comparison: ETHNICITY == 'BLACK' vs. everyone else.
T = df_to_T(df, 'BLACK')

# Propensity model: estimates P(T = 1 | X).
clf = LogisticRegression().fit(X, T)

# Work on positional NumPy arrays so the row positions returned by np.where
# always line up with the outcomes, whatever index labels the frame carries.
t_values = T.to_numpy()
y_values = Y.to_numpy()
treated = np.where(t_values == 1)[0]
no_treated = np.where(t_values == 0)[0]

# Score all rows in one batched call. Columns follow clf.classes_, which is
# sorted, so column 0 is P(T=0 | x) and column 1 is P(T=1 | x).
proba = clf.predict_proba(X)
# `predict` keeps its previous per-row layout (predict[i][0][k]).
predict = list(proba[:, np.newaxis, :])

# (1/N) * sum over treated rows of Y / P(T=1 | x): the IPW mean outcome under
# treatment. NOTE: no clipping is applied to the propensities, so rows with a
# propensity near 0 can dominate the sum.
ATE = np.sum(y_values[treated] / proba[treated, 1])
ans = ATE/len(df)
print("treated ATE with inverse propensity: ", ans)

# (1/N) * sum over untreated rows of Y / P(T=0 | x): the IPW mean outcome
# without treatment.
ATE2 = np.sum(y_values[no_treated] / proba[no_treated, 0])
ans2 = ATE2/len(df)
print("no treated ATE with inverse propensity: ", ans2)

# The difference of the two weighted means is the IPW estimate of the ATE.
print("difference between treated and no treated: ", ans - ans2)



treated ATE with inverse propensity:  1.7163226794520907
no treated ATE with inverse propensity:  1.9228039884806178
difference between treated and no treated:  -0.20648130902852713


In [12]:
# Inverse-propensity-weighted (IPW) comparison: ETHNICITY == 'ASIAN' vs. everyone else.
T = df_to_T(df, 'ASIAN')

# Propensity model: estimates P(T = 1 | X).
clf = LogisticRegression().fit(X, T)

# Work on positional NumPy arrays so the row positions returned by np.where
# always line up with the outcomes, whatever index labels the frame carries.
t_values = T.to_numpy()
y_values = Y.to_numpy()
treated = np.where(t_values == 1)[0]
no_treated = np.where(t_values == 0)[0]

# Score all rows in one batched call. Columns follow clf.classes_, which is
# sorted, so column 0 is P(T=0 | x) and column 1 is P(T=1 | x).
proba = clf.predict_proba(X)
# `predict` keeps its previous per-row layout (predict[i][0][k]).
predict = list(proba[:, np.newaxis, :])

# (1/N) * sum over treated rows of Y / P(T=1 | x): the IPW mean outcome under
# treatment. NOTE: no clipping is applied to the propensities, so rows with a
# propensity near 0 can dominate the sum.
ATE = np.sum(y_values[treated] / proba[treated, 1])
ans = ATE/len(df)
print("treated ATE with inverse propensity: ", ans)

# (1/N) * sum over untreated rows of Y / P(T=0 | x): the IPW mean outcome
# without treatment.
ATE2 = np.sum(y_values[no_treated] / proba[no_treated, 0])
ans2 = ATE2/len(df)
print("no treated ATE with inverse propensity: ", ans2)

# The difference of the two weighted means is the IPW estimate of the ATE.
print("difference between treated and no treated: ", ans - ans2)



treated ATE with inverse propensity:  1.7837247968392604
no treated ATE with inverse propensity:  1.9154913152470474
difference between treated and no treated:  -0.13176651840778697


In [13]:
# Inverse-propensity-weighted (IPW) comparison: ETHNICITY == 'HISPANIC' vs. everyone else.
T = df_to_T(df, 'HISPANIC')

# Propensity model: estimates P(T = 1 | X).
clf = LogisticRegression().fit(X, T)

# Work on positional NumPy arrays so the row positions returned by np.where
# always line up with the outcomes, whatever index labels the frame carries.
t_values = T.to_numpy()
y_values = Y.to_numpy()
treated = np.where(t_values == 1)[0]
no_treated = np.where(t_values == 0)[0]

# Score all rows in one batched call. Columns follow clf.classes_, which is
# sorted, so column 0 is P(T=0 | x) and column 1 is P(T=1 | x).
proba = clf.predict_proba(X)
# `predict` keeps its previous per-row layout (predict[i][0][k]).
predict = list(proba[:, np.newaxis, :])

# (1/N) * sum over treated rows of Y / P(T=1 | x): the IPW mean outcome under
# treatment. NOTE: no clipping is applied to the propensities, so rows with a
# propensity near 0 can dominate the sum.
ATE = np.sum(y_values[treated] / proba[treated, 1])
ans = ATE/len(df)
print("treated ATE with inverse propensity: ", ans)

# (1/N) * sum over untreated rows of Y / P(T=0 | x): the IPW mean outcome
# without treatment.
ATE2 = np.sum(y_values[no_treated] / proba[no_treated, 0])
ans2 = ATE2/len(df)
print("no treated ATE with inverse propensity: ", ans2)

# The difference of the two weighted means is the IPW estimate of the ATE.
print("difference between treated and no treated: ", ans - ans2)



treated ATE with inverse propensity:  1.5354823597319571
no treated ATE with inverse propensity:  1.9241439205935043
difference between treated and no treated:  -0.38866156086154713
