# Instrumental Variables and Regression Discontinuity

In [41]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

# Instrumental Variables

In [54]:
# Simulated instrumental-variable data set.
#   Z      : binary instrument, Bernoulli(0.3); enters X but not Y
#   W1, W2 : standard-normal confounders; enter both X and Y
#   X      : treatment, shifted by 2 when Z == 1
#   Y      : outcome
# The draw order (Z, W1, W2, epsilon_X, epsilon_Y) fixes the random stream
# for the given seed, so it must not be rearranged.
np.random.seed(0)
num = 100000

Z = np.random.binomial(n=1, p=0.3, size=num)
W1 = np.random.normal(loc=0, scale=1, size=num)
W2 = np.random.normal(loc=0, scale=1, size=num)
epsilon_X = np.random.normal(loc=0, scale=1, size=num)
epsilon_Y = np.random.normal(loc=0, scale=1, size=num)

X = 2 * Z + W1 - W2 + epsilon_X

# The X * W1 interaction makes the slope in X equal to 3.2 + W1; W1 has mean
# zero, so the average ("true") effect is 3.2.
Y = (
    3.2 * X
    - 1.7 * W1
    + W2
    + X * W1
    + epsilon_Y
)

df = pd.DataFrame(dict(X=X, Y=Y, Z=Z, W1=W1, W2=W2))

In [57]:
# Use the full sample here (no restriction to a W1/W2 bin). A deep copy keeps
# `df` independent of anything later done to `df_bin`.
df_bin = df.copy(deep=True)

In [58]:
# Wald (ratio) IV estimate: difference in mean Y between the Z == 1 and
# Z == 0 groups, divided by the corresponding difference in mean X.
z1_rows = df_bin[df_bin.Z == 1]
z0_rows = df_bin[df_bin.Z == 0]

y_diff = z1_rows.Y.mean() - z0_rows.Y.mean()
x_diff = z1_rows.X.mean() - z0_rows.X.mean()

# Last expression of the cell, so the notebook displays the estimate.
y_diff / x_diff

3.1904479752541985

In [11]:
# Same Wald ratio, but computed only on rows whose confounders fall in a
# narrow bin: 0.4 < W1 < 0.6 and -0.3 < W2 < -0.1 (bounds exclusive).
w1_in_bin = (df.W1 > 0.4) & (df.W1 < 0.6)
w2_in_bin = (df.W2 > -0.3) & (df.W2 < -0.1)
df_bin = df[w1_in_bin & w2_in_bin]

z1_rows = df_bin[df_bin.Z == 1]
z0_rows = df_bin[df_bin.Z == 0]

# Last expression of the cell, so the notebook displays the estimate.
(z1_rows.Y.mean() - z0_rows.Y.mean()) / (z1_rows.X.mean() - z0_rows.X.mean())

3.66793187803789

In [9]:
# Self-contained IV example: simulate the data, then form the Wald ratio.
# The draw order (Z, W1, W2, epsilon_X, epsilon_Y) fixes the random stream
# for the given seed, so it must not be rearranged.
np.random.seed(0)
num = 100000

Z = np.random.binomial(n=1, p=0.3, size=num)             # binary instrument
W1 = np.random.normal(loc=0, scale=1, size=num)          # confounder
W2 = np.random.normal(loc=0, scale=1, size=num)          # confounder
epsilon_X = np.random.normal(loc=0, scale=1, size=num)   # noise in X
epsilon_Y = np.random.normal(loc=0, scale=1, size=num)   # noise in Y

X = 2 * Z + W1 - W2 + epsilon_X

# The W1 * X interaction makes the slope in X equal to 3.2 + W1; W1 has mean
# zero, so the average ("true") effect is 3.2.
Y = 3.2 * X - 1.7 * W1 + W1 * X + epsilon_Y

df = pd.DataFrame(dict(X=X, Y=Y, Z=Z, W1=W1, W2=W2))

# Group means with the instrument switched on / off.
z1_rows = df[df.Z == 1]
z0_rows = df[df.Z == 0]

# Y average at Z = 1 minus Z = 0, and the same for X.
Y_diff = z1_rows.Y.mean() - z0_rows.Y.mean()
X_diff = z1_rows.X.mean() - z0_rows.X.mean()

# Last expression of the cell, so the notebook displays the estimate.
Y_diff / X_diff

3.1949660524467727

### Does it work when Z influences Y directly?

In [59]:
# Break the exclusion restriction: Z now also enters Y directly (-2 * Z).
# X, W1, W2, Z and epsilon_Y are not redefined here; they are taken from the
# notebook's current state.
Y = 3.2 * X - 1.7 * W1 - 2 * Z + epsilon_Y  # true effect is 3.2
df = pd.DataFrame(dict(X=X, Y=Y, Z=Z, W1=W1, W2=W2))

# Group means with the instrument switched on / off.
z1_rows = df[df.Z == 1]
z0_rows = df[df.Z == 0]

# Y average at Z = 1 minus Z = 0, and the same for X. The direct -2 * Z term
# is part of Y_diff, so the ratio no longer isolates the effect of X.
Y_diff = z1_rows.Y.mean() - z0_rows.Y.mean()
X_diff = z1_rows.X.mean() - z0_rows.X.mean()

# Last expression of the cell, so the notebook displays the estimate.
Y_diff / X_diff

2.197356441250059

### Does it work when Z influences the confounder W1?

In [60]:
# Break instrument independence: the confounder W1 now depends on Z.
# NOTE: `num` is not set in this cell and the RNG is not reseeded, so both
# come from the notebook's current state. The draw order below still
# determines which values each variable receives.
Z = np.random.binomial(n=1, p=0.3, size=num)
W1 = 2 * Z + np.random.normal(loc=0, scale=1, size=num)
W2 = np.random.normal(loc=0, scale=1, size=num)
epsilon_X = np.random.normal(loc=0, scale=1, size=num)
epsilon_Y = np.random.normal(loc=0, scale=1, size=num)

X = 2 * Z + W1 - W2 + epsilon_X
Y = 3.2 * X - 1.7 * W1 + epsilon_Y  # true effect is 3.2

df = pd.DataFrame(dict(X=X, Y=Y, Z=Z, W1=W1, W2=W2))

# Group means with the instrument switched on / off.
z1_rows = df[df.Z == 1]
z0_rows = df[df.Z == 0]

# Y average at Z = 1 minus Z = 0, and the same for X.
# In expectation Z moves X by 2 + 2 = 4 (directly and through W1) and moves
# Y by 3.2 * 4 - 1.7 * 2 = 9.4, so the ratio targets 9.4 / 4 = 2.35, not 3.2.
Y_diff = z1_rows.Y.mean() - z0_rows.Y.mean()
X_diff = z1_rows.X.mean() - z0_rows.X.mean()

# Last expression of the cell, so the notebook displays the estimate.
Y_diff / X_diff

2.349474884470536

# Regression Discontinuity

In [61]:
# Sharp regression-discontinuity simulation.
# The running variable is drawn uniformly between -1 and 1; treatment
# switches on at the cutoff 0. The regression includes both the running
# variable and the indicator, and the coefficient on the indicator (the jump
# at the cutoff) is the treatment effect of interest.
np.random.seed(0)
num = 100000

X_running = np.random.uniform(low=-1, high=1, size=num)
is_above = X_running >= 0
X_above_cutoff = 1 * is_above  # 0/1 treatment indicator
epsilon_Y = np.random.normal(loc=0, scale=1, size=num)

Y = 2.8 * X_above_cutoff + 0.5 * X_running + epsilon_Y  # true effect is 2.8

# Collect the columns under the names used in the regression formula.
df = pd.DataFrame(dict(running=X_running, above_cutoff=X_above_cutoff, Y=Y))

# OLS of Y on the running variable plus the above-cutoff indicator.
rd_spec = smf.ols('Y ~ running + above_cutoff', data=df)
model = rd_spec.fit()
print(model.params)

Intercept       0.018988
running         0.516870
above_cutoff    2.782920
dtype: float64


### What if there's a confounder?

In [62]:
# Regression discontinuity with an unobserved confounder.
# W enters both the running variable (+2 * W) and the outcome (-2 * W) but is
# left out of the regression, so the full-sample linear fit need not recover
# the jump at the cutoff.
np.random.seed(0)
num = 1000000

# Draw order (W, running-variable noise, outcome noise) fixes the random
# stream for the given seed.
W = np.random.uniform(-1, 1, num)
X_running = np.random.uniform(-1, 1, num) + 2 * W
# 0/1 treatment indicator; its coefficient in Y is the treatment effect.
X_above_cutoff = 1 * (X_running >= 0)
Y = 1.2 * X_above_cutoff + 0.5 * X_running - 2 * W + np.random.normal(0, 1, num)  # true effect is 1.2

# Create DataFrame (W is stored but deliberately not used in the formula).
df = pd.DataFrame({'running': X_running, 'above_cutoff': X_above_cutoff, 'W': W, 'Y': Y})

# OLS of Y on the running variable plus the above-cutoff indicator.
model = smf.ols('Y ~ running + above_cutoff', data=df).fit()
print(model.params)

Intercept       0.167738
running        -0.190323
above_cutoff    0.861740
dtype: float64


### Solution: Use only observations just above and just below the cutoff

In [65]:
# Regression discontinuity with an unobserved confounder, fitted only on
# observations close to the cutoff.
# W enters both the running variable (+2 * W) and the outcome (-2 * W) and is
# not included in the regression.
np.random.seed(0)
num = 10000000  # large sample so that enough rows remain inside the window

# Draw order (W, running-variable noise, outcome noise) fixes the random
# stream for the given seed.
W = np.random.uniform(-1, 1, num)
X_running = np.random.uniform(-1, 1, num) + 2 * W
# 0/1 treatment indicator; its coefficient in Y is the treatment effect.
X_above_cutoff = 1 * (X_running >= 0)
Y = 1.2 * X_above_cutoff + 0.5 * X_running - 2 * W + np.random.normal(0, 1, num)  # true effect is 1.2

# Create DataFrame
df = pd.DataFrame({'running': X_running, 'above_cutoff': X_above_cutoff, 'Y': Y})

# Keep only rows with the running variable strictly inside
# (-bandwidth, +bandwidth) around the cutoff at 0.
bandwidth = 0.1
df_limited = df[(df.running > -bandwidth) & (df.running < bandwidth)]

# OLS of Y on the running variable plus the above-cutoff indicator,
# restricted to the window around the cutoff.
model = smf.ols('Y ~ running + above_cutoff', data=df_limited).fit()
print(model.params)

Intercept       0.006807
running        -0.393145
above_cutoff    1.184976
dtype: float64
