# Instrumental Variables and Regression Discontinuity

In [2]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

# Instrumental Variables

In [None]:
# Simulate an instrumental-variable setting in which the causal effect is known.
np.random.seed(0)  # fixed seed so every run draws the same sample
num = 100000  # number of samples

# Instrument: Bernoulli draw that equals 1 with probability 0.3.
Z = np.random.binomial(n=1, p=0.3, size=num)

# Covariates: independent standard-normal draws (mean 0, standard deviation 1; unbounded).
W1 = np.random.normal(loc=0, scale=1, size=num)
W2 = np.random.normal(loc=0, scale=1, size=num)

# Treatment: driven by the instrument (coefficient 2), by W1 and W2, and by its own noise.
epsilon_X = np.random.normal(loc=0, scale=1, size=num)
X = 2 * Z + W1 - W2 + epsilon_X

# Outcome: depends on X, on W1 and W2 directly, and on an X * W1 interaction.
# The slope in X is 3.2 + W1, which averages to 3.2 because W1 is centred at 0.
epsilon_Y = np.random.normal(loc=0, scale=1, size=num)
Y = 3.2 * X - 1.7 * W1 + W2 + X * W1 + epsilon_Y  # true (average) effect is 3.2

# Collect every simulated variable as a column of one DataFrame.
df = pd.DataFrame(dict(X=X, Y=Y, Z=Z, W1=W1, W2=W2))

In [6]:
# Display the simulated sample (columns X, Y, Z, W1, W2).
df

Unnamed: 0,X,Y,Z,W1,W2
0,2.369296,8.865061,0,0.911503,-0.668431
1,4.616810,14.060842,1,0.079538,-1.373565
2,1.192770,3.215649,0,-0.949305,-1.502755
3,1.458397,4.602773,0,0.005700,-0.138553
4,2.214544,8.678365,0,1.831838,-0.687567
...,...,...,...,...,...
99995,2.750540,10.106597,1,0.489504,-0.413032
99996,0.313975,3.426943,0,0.660637,1.784563
99997,2.657616,5.855775,1,-0.003210,-1.693409
99998,1.100564,4.144663,0,0.630569,0.187785


In [7]:
# Work on the whole sample here: no restriction on W1 or W2.
# A copy is taken so that df itself is left untouched.
# (Alternative: restrict to a narrow covariate bin, e.g. 0.4 < W1 < 0.6 and -0.3 < W2 < -0.1.)
df_bin = df.copy()

In [8]:
# Wald (ratio-of-differences) estimate: the change in mean Y across the two Z groups,
# divided by the change in mean X across the same groups.
z_on = df_bin[df_bin.Z == 1]   # rows where the instrument is 1
z_off = df_bin[df_bin.Z == 0]  # rows where the instrument is 0
y_diff = z_on.Y.mean() - z_off.Y.mean()  # difference in mean Y between the Z groups
x_diff = z_on.X.mean() - z_off.X.mean()  # difference in mean X between the Z groups
y_diff / x_diff  # estimated effect of X on Y, using Z as the instrument

3.1904479752541985

In [None]:
# Restrict the sample to a narrow covariate bin: 0.4 < W1 < 0.6 and -0.3 < W2 < -0.1.
w1_in_bin = (df.W1 > 0.4) & (df.W1 < 0.6)
w2_in_bin = (df.W2 > -0.3) & (df.W2 < -0.1)
df_bin = df[w1_in_bin & w2_in_bin]

# Wald (ratio-of-differences) estimate inside the bin: difference in mean Y across the
# Z groups divided by the difference in mean X across the Z groups.
# NOTE(review): if Y was simulated with an X * W1 interaction, the slope in X is
# 3.2 + W1, so with W1 near 0.5 expect roughly 3.7 here rather than 3.2 — confirm
# against the cell that built df.
z_on = df_bin[df_bin.Z == 1]   # rows in the bin where the instrument is 1
z_off = df_bin[df_bin.Z == 0]  # rows in the bin where the instrument is 0
(z_on.Y.mean() - z_off.Y.mean()) / (z_on.X.mean() - z_off.X.mean())

3.6661641514440033

In [11]:
# Display the rows currently held in df_bin (the filtered subset of df).
df_bin

Unnamed: 0,X,Y,Z,W1,W2
69,-1.054007,-6.727640,0,0.446364,-0.215842
273,3.847537,12.321161,1,0.577957,-0.181362
502,0.176181,0.646995,0,0.476118,-0.153419
808,2.515425,6.321929,1,0.419022,-0.207526
943,1.570188,3.231263,1,0.500602,-0.112316
...,...,...,...,...,...
98809,3.522269,9.141816,1,0.557161,-0.162855
99124,3.398906,11.276848,1,0.452462,-0.262125
99243,2.278314,6.098921,1,0.493834,-0.256372
99280,2.494287,6.964502,0,0.581424,-0.132433


In [9]:
# Re-simulate the instrumental-variable data from the same seed. In this version Y
# has no direct W2 term; W2 only reaches Y through X.
np.random.seed(0)
num = 100000

Z = np.random.binomial(n=1, p=0.3, size=num)     # instrument: 1 with probability 0.3
W1 = np.random.normal(loc=0, scale=1, size=num)  # standard-normal covariate
W2 = np.random.normal(loc=0, scale=1, size=num)  # standard-normal covariate

epsilon_X = np.random.normal(loc=0, scale=1, size=num)  # noise in the treatment
X = 2 * Z + W1 - W2 + epsilon_X

epsilon_Y = np.random.normal(loc=0, scale=1, size=num)  # noise in the outcome
Y = 3.2 * X - 1.7 * W1 + W1 * X + epsilon_Y  # true effect is 3.2

df = pd.DataFrame(dict(X=X, Y=Y, Z=Z, W1=W1, W2=W2))

# Wald estimate: (mean Y at Z = 1 minus mean Y at Z = 0) over the same difference for X.
z_on = df[df.Z == 1]
z_off = df[df.Z == 0]
Y_diff = z_on.Y.mean() - z_off.Y.mean()
X_diff = z_on.X.mean() - z_off.X.mean()
Y_diff / X_diff

3.1949660524467727

### Does it work when Z influences Y directly?

In [59]:
# Rebuild the outcome so that Z also enters Y directly (coefficient -2), in addition
# to acting through X. X, W1, W2, Z and epsilon_Y are reused from the earlier cells.
Y = 3.2 * X - 1.7 * W1 - 2 * Z + epsilon_Y  # true effect is 3.2
df = pd.DataFrame(dict(X=X, Y=Y, Z=Z, W1=W1, W2=W2))

# Same ratio of group-mean differences as before. The numerator now also contains
# Z's direct contribution to Y, not only the part transmitted through X.
z_on = df[df.Z == 1]
z_off = df[df.Z == 0]
Y_diff = z_on.Y.mean() - z_off.Y.mean()
X_diff = z_on.X.mean() - z_off.X.mean()
Y_diff / X_diff

2.197356441250059

### Does it work when Z influences W?

In [60]:
# Redraw the data with W1 depending on Z (coefficient 2), so Z reaches Y through W1
# as well as through X. There is no reseed here: the draws continue from the current
# global RNG state, and num is reused from the earlier cell.
Z = np.random.binomial(n=1, p=0.3, size=num)
W1 = 2 * Z + np.random.normal(loc=0, scale=1, size=num)
W2 = np.random.normal(loc=0, scale=1, size=num)

epsilon_X = np.random.normal(loc=0, scale=1, size=num)  # noise in the treatment
X = 2 * Z + W1 - W2 + epsilon_X

epsilon_Y = np.random.normal(loc=0, scale=1, size=num)  # noise in the outcome
Y = 3.2 * X - 1.7 * W1 + epsilon_Y  # true effect is 3.2

df = pd.DataFrame(dict(X=X, Y=Y, Z=Z, W1=W1, W2=W2))

# Ratio of group-mean differences: (mean Y at Z = 1 minus Z = 0) over the same for X.
z_on = df[df.Z == 1]
z_off = df[df.Z == 0]
Y_diff = z_on.Y.mean() - z_off.Y.mean()
X_diff = z_on.X.mean() - z_off.X.mean()
Y_diff / X_diff

2.349474884470536

# Regression Discontinuity

In [12]:
# Simulate a regression discontinuity with the cutoff at 0 on the running variable.
np.random.seed(0)
num = 100000

# The running variable is uniform on [-1, 1); the indicator switches on at the cutoff.
# Both appear in the outcome, but the coefficient on the indicator is the jump at the
# cutoff, i.e. the treatment effect being estimated.
X_running = np.random.uniform(low=-1, high=1, size=num)
X_above_cutoff = (X_running >= 0).astype(int)

epsilon_Y = np.random.normal(loc=0, scale=1, size=num)  # outcome noise
Y = 2.8 * X_above_cutoff + 0.5 * X_running + epsilon_Y  # true effect is 2.8

# Assemble the regression data.
df = pd.DataFrame(dict(running=X_running, above_cutoff=X_above_cutoff, Y=Y))

# OLS of Y on the running variable and the above-cutoff indicator.
rd_formula = 'Y ~ running + above_cutoff'
model = smf.ols(rd_formula, data=df).fit()
print(model.params)

Intercept       0.018988
running         0.516870
above_cutoff    2.782920
dtype: float64


In [13]:
# Display the current contents of df.
df

Unnamed: 0,running,above_cutoff,Y
0,0.097627,1,3.760316
1,0.430379,1,3.094727
2,0.205527,1,1.953459
3,0.089766,1,2.850583
4,-0.152690,0,1.755493
...,...,...,...
99995,0.451842,1,3.515425
99996,0.015041,1,3.468157
99997,0.620266,1,3.106923
99998,0.102065,1,3.481601


### What if there's a confounder?

In [62]:
np.random.seed(0)
num = 1000000
# Note: both X_running and X_above_cutoff enter the outcome equation, but the
# coefficient on X_above_cutoff is the jump at the cutoff, i.e. the effect of interest.
W = np.random.uniform(-1, 1, num)  # confounder: enters both the running variable and Y
X_running = np.random.uniform(-1, 1, num) + 2 * W
X_above_cutoff = 1 * (X_running >= 0)  # 1 at or above the cutoff (0), else 0
Y = 1.2 * X_above_cutoff + 0.5 * X_running - 2 * W + np.random.normal(0, 1, num)  # true effect is 1.2 (the coefficient on X_above_cutoff)

# Create DataFrame
df = pd.DataFrame({'running': X_running, 'above_cutoff': X_above_cutoff, 'W': W, 'Y': Y})

# W is stored in df but left out of the regression formula, so the fit below does not control for it.
model = smf.ols('Y ~ running + above_cutoff', data=df).fit()
print(model.params)

Intercept       0.167738
running        -0.190323
above_cutoff    0.861740
dtype: float64


### Solution: Just go slightly above and below the cutoff

In [65]:
np.random.seed(0)
num = 10000000
# Note: both X_running and X_above_cutoff enter the outcome equation, but the
# coefficient on X_above_cutoff is the jump at the cutoff, i.e. the effect of interest.
W = np.random.uniform(-1, 1, num)  # confounder: enters both the running variable and Y
X_running = np.random.uniform(-1, 1, num) + 2 * W
X_above_cutoff = 1 * (X_running >= 0)  # 1 at or above the cutoff (0), else 0
Y = 1.2 * X_above_cutoff + 0.5 * X_running - 2 * W + np.random.normal(0, 1, num)  # true effect is 1.2 (the coefficient on X_above_cutoff)

# Create DataFrame (W is not stored, so it cannot be used as a control below)
df = pd.DataFrame({'running': X_running, 'above_cutoff': X_above_cutoff, 'Y': Y})

# Keep only rows whose running variable lies strictly within 0.1 of the cutoff at 0.
df_limited = df[(df.running > -0.1) & (df.running < 0.1)]

# Same regression as before, fitted on the near-cutoff rows only.
model = smf.ols('Y ~ running + above_cutoff', data=df_limited).fit()
print(model.params)

Intercept       0.006807
running        -0.393145
above_cutoff    1.184976
dtype: float64


In [14]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Load each CSV. The paths are relative, so both files must sit in the notebook's
# working directory; otherwise pd.read_csv raises FileNotFoundError.
df_a = pd.read_csv('homework_4.2.a.csv')   # expected to have columns X, Y
df_b = pd.read_csv('homework_4.2.b.csv')   # expected to have columns X2, Y2

# Define cutoff: the score value at which the sample is split into pre and post
cutoff = 80

def slope_intercept(df, score_col, outcome_col, cutoff):
    """Fit one straight line below the cutoff and another at or above it.

    Rows with ``df[score_col] < cutoff`` form the pre-cutoff sample and rows with
    ``df[score_col] >= cutoff`` form the post-cutoff sample. Each sample gets its
    own ``LinearRegression`` of ``outcome_col`` on ``score_col``.

    Parameters
    ----------
    df : DataFrame holding both columns.
    score_col : name of the column compared against ``cutoff`` and used as regressor.
    outcome_col : name of the column being predicted.
    cutoff : threshold on ``score_col`` that separates the two samples.

    Returns
    -------
    tuple
        ``(slope_pre, int_pre, slope_post, int_post)``.
    """
    samples = (
        df[df[score_col] < cutoff],    # pre-cutoff rows
        df[df[score_col] >= cutoff],   # post-cutoff rows
    )
    fitted = []
    for sample in samples:
        line = LinearRegression().fit(sample[[score_col]], sample[outcome_col])
        # One regressor, so coef_ holds a single slope.
        fitted.extend((line.coef_[0], line.intercept_))
    return tuple(fitted)

# Pre- and post-cutoff line fits for the (X, Y) dataset.
s_pre, i_pre, s_post, i_post = slope_intercept(df_a, 'X', 'Y', cutoff)
print(
    f"(X, Y) →  pre-cutoff slope = {s_pre:.6f}, intercept = {i_pre:.6f}",
    f"          post-cutoff slope = {s_post:.6f}, intercept = {i_post:.6f}\n",
    sep="\n",
)

# Pre- and post-cutoff line fits for the (X2, Y2) dataset.
s_pre2, i_pre2, s_post2, i_post2 = slope_intercept(df_b, 'X2', 'Y2', cutoff)
print(
    f"(X2, Y2) → pre-cutoff slope = {s_pre2:.6f}, intercept = {i_pre2:.6f}",
    f"           post-cutoff slope = {s_post2:.6f}, intercept = {i_post2:.6f}",
    sep="\n",
)

FileNotFoundError: [Errno 2] No such file or directory: 'homework_4.2.a.csv'