<a href="https://colab.research.google.com/github/sebarom06/econ3916-statsml/blob/main/Lab09/Lab09.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from scipy import stats

# Read the dataset from 'lalonde.csv' (path is relative to the working directory).
df = pd.read_csv('lalonde.csv')

# Unadjusted estimate: mean of re78 over rows with treat == 1 minus the mean
# of re78 over rows with treat == 0. No covariates are taken into account here.
treated_mean_re78 = df.loc[df['treat'] == 1, 're78'].mean()
control_mean_re78 = df.loc[df['treat'] == 0, 're78'].mean()
naive_diff = treated_mean_re78 - control_mean_re78
print(f"Naive Difference in Means: ${naive_diff:,.2f}")
# Expected Result: -$635.03

# Columns used as predictors of treatment assignment; `treat` is the target.
covariate_cols = ['age', 'educ', 'black', 'married', 'nodegree', 're74', 're75']
X = df[covariate_cols]
y = df['treat']

# Propensity model: logistic regression of `treat` on the covariates.
# `fit` returns the estimator itself, so `logit` is the fitted model.
logit = LogisticRegression(solver='liblinear').fit(X, y)

# Propensity score = predicted probability in column 1 of predict_proba,
# i.e. the second entry of logit.classes_ (treat == 1, assuming treat is
# coded 0/1 — TODO confirm). Stored on `df` so later steps can read it.
class_probabilities = logit.predict_proba(X)
df['pscore'] = class_probabilities[:, 1]

# Split the rows by the treatment indicator.
treated = df[df['treat'] == 1]
control = df[df['treat'] == 0]

# Build a 1-nearest-neighbour index over the control group's propensity scores.
nbrs = NearestNeighbors(n_neighbors=1, metric='euclidean')
nbrs.fit(control[['pscore']])

# Look up the closest control score for every treated row. Each treated row
# is queried independently, so the same control row may be picked more than
# once (matching with replacement). `indices` holds positions within `control`,
# hence the positional `.iloc` lookup.
distances, indices = nbrs.kneighbors(treated[['pscore']])
matched_control = control.iloc[indices.ravel()]

# Stack the treated rows on top of their matched control rows.
matched_df = pd.concat([treated, matched_control], axis=0)

# --- Unmatched comparison -------------------------------------------------
# Difference in mean re78 between all treated and all control rows, with a
# two-sample t-test (scipy.stats.ttest_ind, default settings).
raw_treated_re78 = treated['re78']
raw_control_re78 = control['re78']
diff = raw_treated_re78.mean() - raw_control_re78.mean()
t_stat, p_val = stats.ttest_ind(raw_treated_re78, raw_control_re78)

print(f"\nRaw Effect (Difference):  ${diff:,.2f}")
print(f"P-value: {p_val:.4f}")

# --- Matched comparison ---------------------------------------------------
# Pull the re78 outcome for each arm of the matched sample.
matched_treated = matched_df.loc[matched_df['treat'] == 1, 're78']
matched_control_out = matched_df.loc[matched_df['treat'] == 0, 're78']

# Same estimator and test as above, now restricted to the matched sample.
# NOTE(review): matched_df may contain the same control row several times;
# the independent-samples test counts each copy as a separate observation —
# confirm this is the intended inference procedure.
matched_diff = matched_treated.mean() - matched_control_out.mean()
t_stat, p_val = stats.ttest_ind(matched_treated, matched_control_out)

print(f"\nRecovered Effect (Matched Difference): ${matched_diff:,.2f}")
print(f"P-value: {p_val:.4f}")

Naive Difference in Means: $-635.03

Raw Effect (Difference):  $-635.03
P-value: 0.3342

Recovered Effect (Matched Difference): $160.18
P-value: 0.8363
