# OSRCT Benchmark - Quick Start Guide

This notebook demonstrates how to:
1. Load confounded datasets
2. Access ground truth ATEs
3. Evaluate a causal inference method
4. Compare multiple methods

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Benchmark asset locations, expressed relative to the current working
# directory (adjust BENCHMARK_DIR if needed).
BENCHMARK_DIR = Path('..')  # Parent directory
DATASETS_DIR = BENCHMARK_DIR.joinpath('confounded_datasets', 'by_study')
GROUND_TRUTH = BENCHMARK_DIR.joinpath('ground_truth', 'rct_ates.csv')

print(f"Datasets directory: {DATASETS_DIR}")

## 1. Load a Confounded Dataset

In [None]:
# Choose one dataset: the study name, pattern and beta together determine
# which CSV file under DATASETS_DIR is read.
study = 'anchoring1'
pattern = 'age'
beta = 0.5

dataset_path = DATASETS_DIR.joinpath(study, f'{pattern}_beta{beta}_seed42.csv')
data = pd.read_csv(dataset_path)

# Report what was loaded, then preview the first rows as the cell output.
for summary_line in (
    f"Dataset: {study}, Pattern: {pattern}, Beta: {beta}",
    f"Shape: {data.shape}",
    f"\nColumns: {list(data.columns)}",
):
    print(summary_line)
data.head()

In [None]:
# Summarise the treatment indicator: counts per value and its mean
treatment = data['iv']
print("Treatment Distribution:")
print(treatment.value_counts())
print(f"\nTreatment rate: {treatment.mean():.3f}")

## 2. Load Ground Truth ATEs

In [None]:
# Read the ground-truth table and preview its key columns
ground_truth = pd.read_csv(GROUND_TRUTH)
print(f"Ground truth for {len(ground_truth)} studies:\n")
preview_columns = ['study', 'n_total', 'ate', 'ate_se']
ground_truth[preview_columns].head(10)

In [None]:
# Look up the ground-truth ATE and its standard error for the selected study.
# The rows are filtered once; the first matching row is used.
study_rows = ground_truth[ground_truth['study'] == study]
true_ate = study_rows['ate'].values[0]
true_se = study_rows['ate_se'].values[0]

print(f"True ATE for {study}: {true_ate:.2f} (SE: {true_se:.2f})")

## 3. Implement Simple Causal Methods

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression

def naive_estimator(data, treatment_col='iv', outcome_col='dv'):
    """Estimate the ATE as the raw difference in mean outcomes.

    No covariate adjustment is made, so the estimate is biased whenever
    treatment assignment is confounded.

    Args:
        data: DataFrame holding the treatment and outcome columns.
        treatment_col: Name of the treatment column; rows equal to 1 form the
            treated group and rows equal to 0 the control group.
        outcome_col: Name of the outcome column.

    Returns:
        dict with keys 'method', 'ate', 'se', 'ci_lower' and 'ci_upper'.
        The interval is ate +/- 1.96 * se.
    """
    outcomes = data[outcome_col]
    assignment = data[treatment_col]
    y_treated = outcomes[assignment == 1]
    y_control = outcomes[assignment == 0]

    estimate = y_treated.mean() - y_control.mean()

    # Unpooled standard error: the two group variances of the mean add up.
    variance_of_difference = (
        y_treated.var() / len(y_treated) + y_control.var() / len(y_control)
    )
    std_err = np.sqrt(variance_of_difference)
    half_width = 1.96 * std_err

    summary = {'method': 'naive', 'ate': estimate, 'se': std_err}
    summary['ci_lower'] = estimate - half_width
    summary['ci_upper'] = estimate + half_width
    return summary


def ipw_estimator(data, treatment_col='iv', outcome_col='dv', covariates=['resp_age']):
    """Inverse Probability Weighting estimator of the ATE (normalised weights).

    A logistic-regression propensity model is fit on ``covariates``. Each
    arm's mean outcome is the weighted average of the outcomes, with weights
    ``T / e`` for treated units and ``(1 - T) / (1 - e)`` for control units,
    each normalised by the sum of its weights.

    Args:
        data: DataFrame holding the treatment, outcome and covariate columns.
        treatment_col: Name of the 0/1 treatment column.
        outcome_col: Name of the outcome column.
        covariates: List of column names used in the propensity model. The
            default list is only read, never mutated.

    Returns:
        dict with keys 'method', 'ate', 'se', 'ci_lower' and 'ci_upper'.
        The interval is ate +/- 1.96 * se.
    """
    X = data[covariates].values
    T = data[treatment_col].values
    Y = data[outcome_col].values

    # Fit propensity score model; clip the scores so no weight exceeds 100.
    ps_model = LogisticRegression(max_iter=1000)
    ps_model.fit(X, T)
    e = np.clip(ps_model.predict_proba(X)[:, 1], 0.01, 0.99)

    # Inverse-probability weights; each is zero outside its own arm.
    weights_1 = T / e
    weights_0 = (1 - T) / (1 - e)

    # Normalised weighted mean outcome in each arm.
    mean_1 = np.sum(Y * weights_1) / np.sum(weights_1)
    mean_0 = np.sum(Y * weights_0) / np.sum(weights_0)
    ate = mean_1 - mean_0

    # Linearisation variance of each normalised weighted mean. The two arms
    # use disjoint units, so their variances add and the SE is the square
    # root of the sum (not the sum of the two standard errors). The
    # propensity scores are treated as fixed, so uncertainty from estimating
    # them is not included.
    var_1 = np.sum((weights_1 * (Y - mean_1)) ** 2) / np.sum(weights_1) ** 2
    var_0 = np.sum((weights_0 * (Y - mean_0)) ** 2) / np.sum(weights_0) ** 2
    se = np.sqrt(var_1 + var_0)

    return {
        'method': 'ipw',
        'ate': ate,
        'se': se,
        'ci_lower': ate - 1.96 * se,
        'ci_upper': ate + 1.96 * se
    }


def outcome_regression(data, treatment_col='iv', outcome_col='dv', covariates=['resp_age']):
    """Outcome regression estimator of the ATE.

    Fits ordinary least squares of the outcome on an intercept, the treatment
    indicator and the covariates, and reports the treatment coefficient as
    the ATE together with its classical (homoskedastic) OLS standard error.

    Args:
        data: DataFrame holding the treatment, outcome and covariate columns.
        treatment_col: Name of the treatment column.
        outcome_col: Name of the outcome column.
        covariates: List of column names to adjust for. The default list is
            only read, never mutated.

    Returns:
        dict with keys 'method', 'ate', 'se', 'ci_lower' and 'ci_upper'.
        The interval is ate +/- 1.96 * se. 'se' is NaN when there are no
        residual degrees of freedom.
    """
    X = np.asarray(data[covariates].values, dtype=float)
    T = np.asarray(data[treatment_col].values, dtype=float).reshape(-1, 1)
    Y = np.asarray(data[outcome_col].values, dtype=float)

    # Design matrix columns: intercept, treatment, then the covariates.
    n_obs = len(Y)
    design = np.hstack([np.ones((n_obs, 1)), T, X])

    # Least-squares fit of the outcome model.
    coef, _, rank, _ = np.linalg.lstsq(design, Y, rcond=None)
    ate = coef[1]  # Coefficient on treatment

    # Standard error of the treatment coefficient: sigma^2 * (X'X)^-1 at the
    # treatment entry, with sigma^2 = RSS / (n - rank). Dividing the residual
    # standard deviation by sqrt(n) would instead give the SE of the mean
    # residual, which is not the uncertainty of the coefficient.
    residuals = Y - design @ coef
    dof = n_obs - rank
    if dof > 0:
        sigma2 = (residuals @ residuals) / dof
        coef_cov = sigma2 * np.linalg.pinv(design.T @ design)
        se = np.sqrt(coef_cov[1, 1])
    else:
        se = np.nan

    return {
        'method': 'outcome_regression',
        'ate': ate,
        'se': se,
        'ci_lower': ate - 1.96 * se,
        'ci_upper': ate + 1.96 * se
    }

## 4. Evaluate Methods

In [None]:
# Apply each estimator to the loaded dataset
naive_result = naive_estimator(data)
ipw_result = ipw_estimator(data, covariates=['resp_age'])
or_result = outcome_regression(data, covariates=['resp_age'])

# Collect the results in a fixed order (naive, IPW, outcome regression) and
# record each estimate's signed error against the ground truth.
results = [naive_result, ipw_result, or_result]
for method_result in results:
    method_result['bias'] = method_result['ate'] - true_ate

results_df = pd.DataFrame(results)
print(f"True ATE: {true_ate:.2f}\n")
results_df[['method', 'ate', 'se', 'bias']]

In [None]:
# Horizontal bar chart of each method's estimate with +/- 1.96 * SE error
# bars, compared against the true ATE (dashed vertical line).
method_names = results_df['method'].values
estimates = results_df['ate'].values
half_widths = 1.96 * results_df['se'].values
bar_positions = np.arange(len(method_names))
bar_colors = ['#E74C3C', '#3498DB', '#2ECC71']

fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(bar_positions, estimates, xerr=half_widths, capsize=5, color=bar_colors)
ax.axvline(x=true_ate, color='black', linestyle='--', linewidth=2, label=f'True ATE = {true_ate:.0f}')

# Label each bar with a human-readable method name
tick_labels = [name.replace('_', ' ').title() for name in method_names]
ax.set_yticks(bar_positions)
ax.set_yticklabels(tick_labels)
ax.set_xlabel('Estimated ATE')
ax.set_title(f'Method Comparison: {study} (β={beta})')
ax.legend()

plt.tight_layout()
plt.show()

## 5. Evaluate Across Multiple Datasets

In [None]:
# Evaluate every method at each confounding strength for the current study
beta_values = [0.1, 0.5, 1.0, 2.0]
all_results = []

# Each estimator is paired with a short label that is used only in the
# diagnostic messages below; the 'method' value stored in the results comes
# from the estimator's own return dict.
estimators = [
    ('naive', naive_estimator),
    ('ipw', lambda d: ipw_estimator(d, covariates=['resp_age'])),
    ('or', lambda d: outcome_regression(d, covariates=['resp_age'])),
]

# The loop uses its own names so that the `beta`, `data` and `dataset_path`
# set by earlier cells keep their values after this cell runs.
for sweep_beta in beta_values:
    sweep_path = DATASETS_DIR / study / f'{pattern}_beta{sweep_beta}_seed42.csv'
    if not sweep_path.exists():
        # Missing files are skipped, but reported rather than ignored.
        print(f"Skipping beta={sweep_beta}: {sweep_path} not found")
        continue

    sweep_data = pd.read_csv(sweep_path)

    for label, estimate in estimators:
        try:
            result = estimate(sweep_data)
        except Exception as exc:
            # Best-effort sweep: one failing method must not abort the rest,
            # but the failure is shown so it does not go unnoticed.
            print(f"Skipping {label} at beta={sweep_beta}: {type(exc).__name__}: {exc}")
            continue

        result['beta'] = sweep_beta
        result['bias'] = result['ate'] - true_ate
        result['abs_bias'] = abs(result['bias'])
        all_results.append(result)

all_results_df = pd.DataFrame(all_results)
print(f"Evaluated {len(all_results_df)} method-dataset combinations")
all_results_df.head(10)

In [None]:
# One line per method: absolute bias as a function of beta
fig, ax = plt.subplots(figsize=(8, 5))

# sort=False keeps the methods in their order of first appearance
for method, method_data in all_results_df.groupby('method', sort=False):
    legend_label = method.replace('_', ' ').title()
    ax.plot(method_data['beta'], method_data['abs_bias'],
            marker='o', label=legend_label, linewidth=2)

ax.set_xlabel('Confounding Strength (β)')
ax.set_ylabel('Absolute Bias')
ax.set_title(f'Bias vs Confounding Strength: {study}')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Load Pre-computed Results

In [None]:
# Load pre-computed method evaluation results
results_dir = BENCHMARK_DIR / 'analysis_results' / 'method_evaluation'
perf_path = results_dir / 'performance_by_method.csv'

# Check the CSV itself rather than only its directory, so a missing file
# produces the "not found" message instead of a FileNotFoundError.
if perf_path.is_file():
    perf_by_method = pd.read_csv(perf_path)
    print("Pre-computed performance by method:")
    display(perf_by_method)
else:
    print("Pre-computed results not found")

## Next Steps

1. **Explore more datasets**: Try different studies, patterns, and beta values
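2. **Implement your method**: Follow the signature and return format of the estimators in Section 3 (a function that takes the dataset and returns a dict with `method`, `ate`, `se`, `ci_lower`, and `ci_upper`)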
3. **Submit to leaderboard**: Use the method submission issue template
4. **Read documentation**: See README.md for full details