In [29]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
np.random.seed(42)

In [2]:
def simulate(A=1, B=1, C=10, D=1000):
  W = np.random.normal(0,1,D)
  X = W+np.random.normal(0,B,D)
  Y = A*X-W+np.random.normal(0,C,D)
  return Y, X, W

In [10]:
# run the sim
Y, X, W = simulate(A=1, B=1, C=10, D=1000)

# build a DataFrame, calling W “Z”
df = pd.DataFrame({
    'X': X,
    'Y': Y,
    'W': W
})

df.head()

Unnamed: 0,X,Y,W
0,0.797688,5.06231,0.758725
1,-1.03701,-17.028224,-2.127096
2,0.979631,-6.554048,0.376445
3,-1.793528,4.465855,-0.891174
4,-0.239551,0.31698,0.299636


## Question 1
Which of the following is closest to the probability of detecting a nonzero effect of $X$ on $Y$ (the t-value of $X$ is greater in absolute value than about 1.96) given A = 1, B = 1, C = 10, D = 1000? Include W in the regression.

In [None]:
# run the sim
Y, X, W = simulate(A=1, B=1, C=10, D=1000)

# build a DataFrame, calling W “Z”
df = pd.DataFrame({
    'X': X,
    'Y': Y,
    'W': W
})

df.head()

In [12]:
model = sm.OLS.from_formula('Y ~ X + W', data=df)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.013
Model:,OLS,Adj. R-squared:,0.011
Method:,Least Squares,F-statistic:,6.605
Date:,"Fri, 18 Jul 2025",Prob (F-statistic):,0.00141
Time:,20:51:05,Log-Likelihood:,-3731.9
No. Observations:,1000,AIC:,7470.0
Df Residuals:,997,BIC:,7484.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.5483,0.320,-1.712,0.087,-1.177,0.080
X,1.1371,0.316,3.593,0.000,0.516,1.758
W,-0.9515,0.448,-2.122,0.034,-1.832,-0.071

0,1,2,3
Omnibus:,0.123,Durbin-Watson:,1.991
Prob(Omnibus):,0.94,Jarque-Bera (JB):,0.066
Skew:,-0.013,Prob(JB):,0.968
Kurtosis:,3.03,Cond. No.,2.59


In [35]:
def estimate_power(nsim=1000, alpha=0.05):
    """Estimate the probability of |t| > 1.96 for X across nsim simulations."""
    detections = []
    for _ in range(nsim):
        Y, X, W = simulate(A=1, B=1, C=10, D=1000)
        df = pd.DataFrame({'X': X, 'W': W, 'Y': Y})
        
        # fit Y ~ X + W
        model = sm.OLS(df['Y'], sm.add_constant(df[['X', 'W']])).fit()
        t_x = model.tvalues['X']
        
        # record whether |t| exceeds the 1.96 threshold
        detections.append(abs(t_x) > 1.96)
        
    return np.mean(detections)

# Run the power simulation
power_estimate = estimate_power(nsim=2000)
print(f"Empirical probability of detecting X (power): {power_estimate:.3f}")

Empirical probability of detecting X (power): 0.883


## Question 2
Which of the following is closest to the skew of the estimate in that case? (You can compute this using scipy.)

In [34]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import skew

# Re‑use your simulate function
def simulate(A=1, B=1, C=10, D=1000):
    W = np.random.normal(0, 1, D)
    X = W + np.random.normal(0, B, D)
    Y = A * X - W + np.random.normal(0, C, D)
    return Y, X, W

def estimate_skew(nsim=2000, alpha=0.05):
    """Estimate the skewness of the distribution of the X coefficient estimates."""
    coefs = []
    for _ in range(nsim):
        Y, X, W = simulate(A=1, B=1, C=10, D=1000)
        df = pd.DataFrame({'X': X, 'W': W, 'Y': Y})
        
        # fit Y ~ X + W
        model = sm.OLS(df['Y'], sm.add_constant(df[['X', 'W']])).fit()
        coefs.append(model.params['X'])
        
    return skew(coefs)

# Run the skewness simulation
skew_estimate = estimate_skew(nsim=2000)
print(f"Empirical skew of the X coefficient estimates: {skew_estimate:.3f}")

Empirical skew of the X coefficient estimates: 0.049


## Question 3
With A = 1, C = 10, D = 1,000, what value of B is needed to detect that the Data Generating Process (DGP) has a nonzero coefficient for X about 50% of the time? (Choose the closest value.)

In [18]:
from scipy.stats import t

def power_for_B(B, nsim=1000, alpha=0.05):
    tcrit = t.ppf(1 - alpha/2, df=1000-3)   # approx df=D–#params
    detects = []
    for _ in range(nsim):
        Y, X, W = simulate(A=1, B=B, C=10, D=1000)
        df = pd.DataFrame({'Y':Y,'X':X,'W':W})
        m = sm.OLS(df.Y, sm.add_constant(df[['X','W']])).fit()
        detects.append(abs(m.tvalues['X']) > tcrit)
    return np.mean(detects)

# sweep over Bs to find ~50%:
for B in [0.2, 0.6, 1.8, 5.4]:
    print(B, power_for_B(B, nsim=10000))

0.2 0.0966
0.6 0.4758
1.8 0.9998
5.4 1.0


## Question 4
With B = 1, C = 10, D = 100 (note the different value of D), what value of A is needed to detect that the DGP has a nonzero coefficient for X about 50% of the time? (Choose the closest value.) 

In [20]:
def power_for_A(A, nsim=1000, alpha=0.05):
    tcrit = t.ppf(1 - alpha/2, df=1000-3)   # approx df=D–#params
    detects = []
    for _ in range(nsim):
        Y, X, W = simulate(A=A, B=1, C=10, D=100)
        df = pd.DataFrame({'Y':Y,'X':X,'W':W})
        m = sm.OLS(df.Y, sm.add_constant(df[['X','W']])).fit()
        detects.append(abs(m.tvalues['X']) > tcrit)
    return np.mean(detects)

# sweep over Bs to find ~50%:
for A in [0.5, 1.0, 2.0, 4.0]:
    print(A, power_for_A(A, nsim=10000))

0.5 0.0805
1.0 0.1726
2.0 0.5018
4.0 0.9719


In [21]:
def simulate(A=1, B=1, C=10, D=100):
    W = np.random.normal(0, 1, D)
    X = W + np.random.normal(0, B, D)
    Y = A * X - W + np.random.normal(0, C, D)
    return Y, X, W

def power_for_A(A, nsim=2000, alpha=0.05, D=100):
    df_resid = D - 3                      # 100 obs − 3 params
    tcrit    = t.ppf(1 - alpha/2, df=df_resid)
    detects  = []
    
    for _ in range(nsim):
        Y, X, W = simulate(A=A, B=1, C=10, D=D)
        df = pd.DataFrame({'Y': Y, 'X': X, 'W': W})
        m  = sm.OLS(df.Y, sm.add_constant(df[['X','W']])).fit()
        detects.append(abs(m.tvalues['X']) > tcrit)
    
    return np.mean(detects)

# Try a grid of A’s and see which gives ~50% power
for A in [0.5, 1.0, 2.0, 4.0]:
    p = power_for_A(A, nsim=5000, alpha=0.05, D=100)
    print(f"A = {A:4.2f} → power ≈ {p:.3f}")

A = 0.50 → power ≈ 0.082
A = 1.00 → power ≈ 0.165
A = 2.00 → power ≈ 0.493
A = 4.00 → power ≈ 0.972
