In [None]:
A data frame with 614 observations (185 treated, 429 control). There are 10 variables measured for each individual:

treat is the treatment assignment (1=treated, 0=control).
age is age in years.
educ is education in number of years of schooling.
black is an indicator for African-American (1=African-American, 0=not).
hispan is an indicator for being of Hispanic origin (1=Hispanic, 0=not).
married is an indicator for married (1=married, 0=not married).
nodegree is an indicator for whether the individual has a high school degree (1=no degree, 0=degree).
re74 is income in 1974, in U.S. dollars.
re75 is income in 1975, in U.S. dollars.
re78 is income in 1978, in U.S. dollars.

In [None]:
import pandas as pd
df = pd.read_csv('/Users/pranjal/Desktop/ci/data/lalonde.csv')
df.head()

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
import statsmodels.api as sm
import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer

X_bivariate = sm.add_constant(df['treat'])
y = df['re78']
model_bivariate = sm.OLS(y, X_bivariate).fit()

covariates = ['age', 'educ', 'black', 'hispan', 'married', 'nodegree', 're74', 're75']
X_adjustment = sm.add_constant(df[['treat'] + covariates])
model_adjustment = sm.OLS(y, X_adjustment).fit()

stargazer_model_based = Stargazer([model_bivariate, model_adjustment])
stargazer_model_based.title("Model-Based Regression Results")
stargazer_model_based.show_degrees_of_freedom(False)
stargazer_model_based.show_confidence_intervals(True)
display(stargazer_model_based)

0,1,2
,,
,Dependent variable: re78,Dependent variable: re78
,,
,(1),(2)
,,
age,,12.978
,,"(-50.827 , 76.783)"
black,,-1240.644
,,"(-2750.420 , 269.132)"
const,6984.170***,66.515


In [1]:
import numpy as np
from doubleml.datasets import fetch_bonus
from doubleml import DoubleMLData, DoubleMLPLR
from sklearn.base import clone
import lightgbm as lgb

df_bonus = fetch_bonus('DataFrame')
dml_data_bonus = DoubleMLData(df_bonus, y_col='inuidur1', d_cols='tg',
                              x_cols=['female', 'black', 'othrace', 'dep1', 'dep2'])
learner = lgb.LGBMRegressor(n_estimators=500, max_depth=5)
ml_l_bonus = clone(learner)
ml_m_bonus = clone(learner)
obj_dml_plr_bonus = DoubleMLPLR(dml_data_bonus, ml_l_bonus, ml_m_bonus)
obj_dml_plr_bonus.fit()
print(obj_dml_plr_bonus)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10
[LightGBM] [Info] Number of data points in the train set: 4079, number of used features: 5
[LightGBM] [Info] Start training from score 2.020761
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000429 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10
[LightGBM] [Info] Number of data points in the train set: 4079, number of used features: 5
[LightGBM] [Info] Start training from score 2.027678
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000429 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you c

In [52]:
propensity_model = LogisticRegression(max_iter=5000)
propensity_model.fit(df[covariates], df['treat'])
df['propensity_score'] = propensity_model.predict_proba(df[covariates])[:, 1]

trim_threshold = (0.25, 0.75)
lower_bound, upper_bound = df['propensity_score'].quantile(trim_threshold)
trimmed_df = df[(df['propensity_score'] >= lower_bound) & (df['propensity_score'] <= upper_bound)]

treated = trimmed_df[trimmed_df['treat'] == 1]
control = trimmed_df[trimmed_df['treat'] == 0]
nn = NearestNeighbors(n_neighbors=1)
nn.fit(control[['propensity_score']])
distances, indices = nn.kneighbors(treated[['propensity_score']])
matched_controls = control.iloc[indices.flatten()]
psm_treatment_effect = treated['re78'].mean() - matched_controls['re78'].mean()

print(f"PSM Treatment Effect (Trimmed): {psm_treatment_effect}")

def covariate_balance(df, covariates, treated_col='treat'):
    results = []
    for covariate in covariates:
        mean_treated = df[df[treated_col] == 1][covariate].mean()
        mean_control = df[df[treated_col] == 0][covariate].mean()
        var_treated = df[df[treated_col] == 1][covariate].var()
        var_control = df[df[treated_col] == 0][covariate].var()
        raw_diff = mean_treated - mean_control
        normalized_diff = raw_diff / np.sqrt((var_treated + var_control) / 2)
        results.append({
            "Covariate": covariate,
            "Mean (Treated)": mean_treated,
            "Mean (Control)": mean_control,
            "Raw Difference": raw_diff,
            "Normalized Difference": normalized_diff
        })
    return pd.DataFrame(results)

balance_before = covariate_balance(df, covariates)
balance_after = covariate_balance(trimmed_df, covariates)

print("Covariate Balance Before Matching:")
display(balance_before)

print("\nCovariate Balance After Matching:")
display(balance_after)

PSM Treatment Effect (Trimmed): 1996.7688311688325
Covariate Balance Before Matching:


Unnamed: 0,Covariate,Mean (Treated),Mean (Control),Raw Difference,Normalized Difference
0,age,25.816216,28.030303,-2.214087,-0.241904
1,educ,10.345946,10.235431,0.110515,0.044755
2,black,0.843243,0.202797,0.640446,1.667719
3,hispan,0.059459,0.142191,-0.082732,-0.27694
4,married,0.189189,0.512821,-0.323631,-0.719492
5,nodegree,0.708108,0.596737,0.111372,0.235048
6,re74,2095.573689,5619.236506,-3523.662818,-0.595752
7,re75,1532.055314,2466.484443,-934.429129,-0.287002



Covariate Balance After Matching:


Unnamed: 0,Covariate,Mean (Treated),Mean (Control),Raw Difference,Normalized Difference
0,age,26.948052,26.30131,0.646742,0.070556
1,educ,10.077922,10.454148,-0.376226,-0.142707
2,black,0.662338,0.165939,0.496399,1.161029
3,hispan,0.142857,0.222707,-0.07985,-0.206888
4,married,0.402597,0.305677,0.096921,0.20279
5,nodegree,0.662338,0.58952,0.072818,0.15027
6,re74,4126.894381,3221.359714,905.534666,0.15829
7,re75,2241.683066,1999.243082,242.439984,0.075818


In [None]:
pip uninstall lightgbm


Found existing installation: lightgbm 4.5.0
Uninstalling lightgbm-4.5.0:
  Would remove:
    /Users/pranjal/myenv/lib/python3.11/site-packages/lightgbm-4.5.0.dist-info/*
    /Users/pranjal/myenv/lib/python3.11/site-packages/lightgbm/*
Proceed (Y/n)? 