# Two-Stage Least Squares (2SLS) Benchmark: causers vs linearmodels

This notebook demonstrates that causers.two_stage_least_squares produces results
equivalent to reference packages while achieving significant speedup.

In [1]:
import sys
import platform
import warnings
import time
from typing import Callable, Dict, Any, List, Tuple, Optional

import numpy as np
import polars as pl
import pandas as pd

import causers
print(f"causers version: {causers.__version__}")

# linearmodels is optional: record whether it could be imported so later
# code can branch on HAS_LINEARMODELS instead of re-attempting the import.
try:
    from linearmodels.iv import IV2SLS as LM_IV2SLS
    import linearmodels
except ImportError:
    LM_IV2SLS = None
    HAS_LINEARMODELS = False
    print("linearmodels not installed")
else:
    HAS_LINEARMODELS = True
    print(f"linearmodels version: {linearmodels.__version__}")

# Interpreter build string, printed for provenance of the recorded output.
print(f"Python {sys.version}")

causers version: 0.9.0


linearmodels version: 7.0
Python 3.11.2 (main, Apr 28 2025, 14:11:48) [GCC 12.2.0]


In [2]:
# Seed passed to NumPy's global RNG wherever this notebook simulates data.
SEED = 42

def time_function(func, *args, n_iter=10, warmup=2, **kwargs):
    """Time repeated calls of ``func(*args, **kwargs)`` in milliseconds.

    Parameters
    ----------
    func : callable
        The function to benchmark.
    *args, **kwargs
        Forwarded unchanged to ``func`` on every call.
    n_iter : int, default 10
        Number of timed calls. Must be at least 1.
    warmup : int, default 2
        Number of untimed calls made before timing starts.

    Returns
    -------
    dict
        ``"result"`` is the return value of the last timed call;
        ``"median_ms"``, ``"min_ms"`` and ``"max_ms"`` summarise the
        per-call wall-clock durations measured with ``time.perf_counter``.

    Raises
    ------
    ValueError
        If ``n_iter`` is smaller than 1.
    """
    # Fail before any call is made: with no timed runs there is nothing to
    # summarise, and min()/max() of an empty list would raise an unrelated error.
    if n_iter < 1:
        raise ValueError(f"n_iter must be at least 1, got {n_iter}")

    for _ in range(warmup):
        func(*args, **kwargs)

    times = []
    result = None
    for _ in range(n_iter):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        # perf_counter reports seconds; convert to milliseconds.
        elapsed = (time.perf_counter() - start) * 1000
        times.append(elapsed)

    return {"result": result, "median_ms": np.median(times), "min_ms": min(times), "max_ms": max(times)}

def generate_iv_data(n_obs, n_exog=5, n_instruments=3, first_stage_strength=0.5, true_effect=2.0, seed=SEED):
    """Simulate a data set with one endogenous regressor for 2SLS benchmarks.

    The draws are standard-normal instruments ``z0..``, standard-normal
    exogenous controls ``x0..``, a structural error ``u`` and first-stage
    noise. The treatment is::

        D = first_stage_strength * sum(Z) / sqrt(n_instruments) + 0.3 * u + noise

    and the outcome is::

        Y = true_effect * D + 0.5 * sum(X) + u

    ``u`` enters both equations, which is what makes ``D`` endogenous.

    Parameters
    ----------
    n_obs : int
        Number of observations.
    n_exog : int, default 5
        Number of exogenous control columns; 0 yields a model with no controls.
    n_instruments : int, default 3
        Number of instrument columns. Must be at least 1.
    first_stage_strength : float, default 0.5
        Multiplier on the scaled instrument sum in the treatment equation.
    true_effect : float, default 2.0
        Coefficient on ``D`` in the outcome equation.
    seed : int, default SEED
        Seed handed to ``np.random.seed``; note this reseeds NumPy's global RNG.

    Returns
    -------
    tuple
        ``(df, Y, D, X, Z)`` where ``df`` is a polars DataFrame with columns
        ``y``, ``d``, the instruments and the controls, and the remaining
        items are the underlying NumPy arrays.

    Raises
    ------
    ValueError
        If ``n_instruments`` is smaller than 1.
    """
    if n_instruments < 1:
        raise ValueError(f"n_instruments must be at least 1, got {n_instruments}")

    np.random.seed(seed)
    # Draw order (instruments, controls, u, noise) determines the generated values.
    z_data = {f"z{i}": np.random.randn(n_obs) for i in range(n_instruments)}
    Z = np.column_stack(list(z_data.values()))
    x_data = {f"x{i}": np.random.randn(n_obs) for i in range(n_exog)}
    # np.column_stack rejects an empty list, so with no controls build the
    # zero-column matrix directly; its row sums are all zero.
    X = np.column_stack(list(x_data.values())) if x_data else np.empty((n_obs, 0))
    u = np.random.randn(n_obs)
    first_stage_noise = np.random.randn(n_obs) * 0.5
    D = first_stage_strength * Z.sum(axis=1) / np.sqrt(n_instruments) + 0.3 * u + first_stage_noise
    Y = true_effect * D + 0.5 * X.sum(axis=1) + u
    df = pl.DataFrame({"y": Y, "d": D, **z_data, **x_data})
    return df, Y, D, X, Z

print("Helper functions defined.")

Helper functions defined.


## Parity Test

In [3]:
rule = "=" * 60
print(rule)
print("PARITY TEST: Basic 2SLS")
print(rule)

# Simulate one instrument and no exogenous controls. The error u enters both
# the treatment and the outcome, so d is endogenous. Draw order: z, u, noise.
np.random.seed(SEED)
n = 5000
z = np.random.randn(n)
u = np.random.randn(n)
first_stage_noise = np.random.randn(n) * 0.5
d = 0.5 * z + 0.3 * u + first_stage_noise
y = 2.0 * d + u

df = pl.DataFrame({"y": y, "d": d, "z": z})
result = causers.two_stage_least_squares(df, "y", "d", z_cols="z", x_cols=None)

# NOTE(review): this cell prints only the causers estimates; it does not
# compare them against LM_IV2SLS even though the heading says "parity" - TODO confirm intent.
coefficient = result.coefficients[0]
print(f"Coefficient: {coefficient:.6f}")
print(f"SE: {result.standard_errors[0]:.6f}")
print(f"First-stage F: {result.first_stage_f[0]:.2f}")
print(f"True effect: 2.0, Estimated: {coefficient:.4f}")

PARITY TEST: Basic 2SLS
Coefficient: 1.996326
SE: 0.028830
First-stage F: 3625.97
True effect: 2.0, Estimated: 1.9963


## Timing Benchmarks

In [4]:
# Each entry: (n_obs, n_exog, n_instruments, first_stage_strength, label)
IV2SLS_CONFIGS = [
    (10_000, 5, 3, 0.5, "10K obs"),
    (100_000, 5, 3, 0.5, "100K obs"),
    (1_000_000, 5, 3, 0.5, "1M obs"),
]

# Median-runtime ceilings in milliseconds, keyed by sample size.
# Sample sizes without an entry always count as passing.
target_ms_by_n_obs = {100_000: 50, 1_000_000: 500}

rule = "=" * 80
print(rule)
print("TIMING BENCHMARKS")
print(rule)

results = []
for n_obs, n_exog, n_instruments, strength, label in IV2SLS_CONFIGS:
    print(f"  {label}...", end=" ", flush=True)
    df, Y, D, X, Z = generate_iv_data(n_obs, n_exog, n_instruments, strength, seed=SEED)
    z_cols = [f"z{i}" for i in range(n_instruments)]
    x_cols = [f"x{i}" for i in range(n_exog)]

    # Default arguments bind this iteration's frame and column lists to the callable.
    def run_causers(_df=df, _z_cols=z_cols, _x_cols=x_cols):
        return causers.two_stage_least_squares(_df, "y", "d", z_cols=_z_cols, x_cols=_x_cols)

    timing = time_function(run_causers)
    median_ms = timing["median_ms"]

    ceiling = target_ms_by_n_obs.get(n_obs)
    meets_target = True if ceiling is None else median_ms < ceiling

    status = "FAIL" if not meets_target else "PASS"
    results.append(dict(label=label, n_obs=n_obs, ms=median_ms, meets_target=meets_target))
    print(f"{median_ms:.2f}ms [{status}]")

print("\nBenchmark complete!")

TIMING BENCHMARKS
  10K obs... 

0.87ms [PASS]
  100K obs... 

8.26ms [PASS]


  1M obs... 

95.22ms [PASS]

Benchmark complete!


## Weak Instrument Diagnostics

In [5]:
rule = "=" * 60
print(rule)
print("WEAK INSTRUMENT DIAGNOSTICS")
print(rule)

# (first_stage_strength, label) pairs to run through the first-stage F check.
# NOTE(review): in the recorded output all three settings report F >= 10,
# so none is classified as weak at n_obs=5000 - confirm whether that is intended.
strengths = [(0.7, "Strong"), (0.3, "Moderate"), (0.15, "Weak")]

for strength, label in strengths:
    df, _, _, _, _ = generate_iv_data(
        n_obs=5000,
        n_exog=3,
        n_instruments=3,
        first_stage_strength=strength,
        seed=SEED,
    )
    z_cols = [f"z{i}" for i in range(3)]
    x_cols = [f"x{i}" for i in range(3)]
    try:
        result = causers.two_stage_least_squares(df, "y", "d", z_cols=z_cols, x_cols=x_cols)
        f_stat = result.first_stage_f[0]
        # Classify by first-stage F: >= 10 strong, >= 4 weak, otherwise very weak.
        if f_stat >= 10:
            status = "Strong"
        elif f_stat >= 4:
            status = "Weak"
        else:
            status = "Very Weak"
        print(f"  {label} (pi={strength}): F = {f_stat:.2f} [{status}]")
    except ValueError as e:
        # Report the failure for this strength and continue with the next one.
        print(f"  {label} (pi={strength}): ERROR - {e}")

WEAK INSTRUMENT DIAGNOSTICS
  Strong (pi=0.7): F = 2321.09 [Strong]
  Moderate (pi=0.3): F = 392.01 [Strong]
  Weak (pi=0.15): F = 83.93 [Strong]


In [6]:
rule = "=" * 60
print(rule)
print("SUMMARY")
print(rule)

# One line per timing entry collected by the benchmark loop;
# an empty (falsy) results value prints nothing.
for r in results or []:
    status = "FAIL" if not r["meets_target"] else "PASS"
    print(f"  {r['label']}: {r['ms']:.2f}ms [{status}]")

print("\nBenchmark complete!")

SUMMARY
  10K obs: 0.87ms [PASS]
  100K obs: 8.26ms [PASS]
  1M obs: 95.22ms [PASS]

Benchmark complete!
