In [1]:
# Paper: https://onlinelibrary.wiley.com/doi/full/10.1002/sim.9234
# Python Notebook: https://github.com/migariane/TutorialComputationalCausalInferenceEstimators/blob/main/PythonCodeBoxes.ipynb

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last
# one, so that a cell such as `df.head()` followed by `df.shape` displays both.
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import marginaleffects

In [3]:
# Load only the columns used below: the exposure (rhc), the response
# (death_d30) and the covariates that appear in the regression formulas.
df = pd.read_csv(
    "rhc.csv",
    usecols=[
        "rhc",
        "death_d30",
        "sex",
        "age",
        "edu",
        "race",
        "carcinoma",
    ],
)
# Sanity check: peek at the first rows, then the (rows, columns) dimensions.
df.head()
df.shape

Unnamed: 0,carcinoma,age,sex,edu,death_d30,rhc,race
0,Yes,70.250999,Male,12.0,0,No,White
1,No,78.179001,Female,12.0,0,Yes,White
2,Yes,46.091999,Female,14.07,0,Yes,White
3,No,75.332001,Female,9.0,0,No,White
4,No,67.910004,Male,9.945,1,Yes,White


(5735, 7)

In [4]:
# Box 2: Adjusted Regression
# Gaussian-family GLM (identity link) of death_d30 on rhc, adjusted for sex.
# The coefficient on the rhc "Yes" level is reported as the estimate.
model = smf.glm(
    formula="death_d30 ~ rhc + sex",
    data=df,
    family=sm.families.Gaussian(),  # explicit; identical to the GLM default
).fit()
print(model.summary())
print("Adjusted Regression Estimate:", model.params.loc["rhc[T.Yes]"])

                 Generalized Linear Model Regression Results                  
Dep. Variable:              death_d30   No. Observations:                 5735
Model:                            GLM   Df Residuals:                     5732
Model Family:                Gaussian   Df Model:                            2
Link Function:               Identity   Scale:                         0.22142
Method:                          IRLS   Log-Likelihood:                -3812.9
Date:                Sat, 01 Jun 2024   Deviance:                       1269.2
Time:                        23:10:10   Pearson chi2:                 1.27e+03
No. Iterations:                     3   Pseudo R-squ. (CS):           0.005767
Covariance Type:            nonrobust                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       0.3049      0.010     29.354      

In [5]:
# Box 7-9: Parametric regression adjustment implementation of the g-formula
# Outcome model with main effects of rhc and sex plus their interaction
# (`rhc * sex`), fitted as a Gaussian-family GLM with identity link.
model2 = smf.glm(
    formula="death_d30 ~ rhc * sex",
    data=df,
    family=sm.families.Gaussian(),  # explicit; identical to the GLM default
).fit()
print(model2.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:              death_d30   No. Observations:                 5735
Model:                            GLM   Df Residuals:                     5731
Model Family:                Gaussian   Df Model:                            3
Link Function:               Identity   Scale:                         0.22145
Method:                          IRLS   Log-Likelihood:                -3812.7
Date:                Sat, 01 Jun 2024   Deviance:                       1269.1
Time:                        23:10:10   Pearson chi2:                 1.27e+03
No. Iterations:                     3   Pseudo R-squ. (CS):           0.005829
Covariance Type:            nonrobust                                         
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                  0

In [6]:
# Average contrast in model2's predictions between rhc = Yes and rhc = No,
# returned as a pandas DataFrame for display.
(
    marginaleffects
    .avg_comparisons(model2, variables="rhc")
    .to_pandas()
)

Unnamed: 0,term,contrast,estimate,std_error,statistic,p_value,s_value,conf_low,conf_high
0,rhc,mean(Yes) - mean(No),0.073692,0.012813,5.75128,8.857017e-09,26.750532,0.048579,0.098805


In [7]:
# Box 10-12: Parametric multivariable regression adjustment implementation of the g-formula
# Binomial-family GLM for death_d30 with main effects of rhc and every
# covariate, plus an rhc-by-covariate interaction for each covariate.
formula = (
    "death_d30 ~ rhc + sex + age + edu + C(race) + C(carcinoma) + "
    "rhc:sex + rhc:age + rhc:edu + rhc:C(race) + rhc:C(carcinoma)"
)
f = sm.families.Binomial()
model3 = smf.glm(formula=formula, data=df, family=f).fit()
print(model3.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:              death_d30   No. Observations:                 5735
Model:                            GLM   Df Residuals:                     5719
Model Family:                Binomial   Df Model:                           15
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -3561.6
Date:                Sat, 01 Jun 2024   Deviance:                       7123.2
Time:                        23:10:10   Pearson chi2:                 5.72e+03
No. Iterations:                     4   Pseudo R-squ. (CS):            0.03198
Covariance Type:            nonrobust                                         
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
Intercept   

In [8]:
# Average contrast in model3's predictions between rhc = Yes and rhc = No,
# returned as a pandas DataFrame for display.
(
    marginaleffects
    .avg_comparisons(model3, variables="rhc")
    .to_pandas()
)

Unnamed: 0,term,contrast,estimate,std_error,statistic,p_value,s_value,conf_low,conf_high
0,rhc,mean(Yes) - mean(No),0.083929,0.012862,6.525312,6.786016e-11,33.778644,0.05872,0.109138
