In [8]:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
from scipy.optimize import curve_fit

# Coding Quiz 10

## Question 1:
Use the data in homework_10.1.csv and find the fixed effect (the constant term in the regression) for each time (0 through 11). Which of these describes the pattern:
- Option A - The effects vary randomly.
- Option B - The effects decrease from month 0 to about 5 or 6 but then eventually increase again.
- Option C - The effects increase from month 0 to about 5 or 6 but then eventually decrease again.
- Option D - The effects gradually increase from month 0 to 11. 

In [3]:
df = pd.read_csv('homework_10.1.csv', index_col=0)
df

Unnamed: 0,city,time,X,y
0,0,0,0.144044,7.552716
1,0,1,1.454274,10.077829
2,0,2,0.761038,12.372731
3,0,3,0.121675,11.489263
4,0,4,0.443863,13.104833
...,...,...,...,...
355,9,7,0.160928,6.829939
356,9,8,-0.190653,5.756958
357,9,9,-0.394850,6.793439
358,9,10,-0.267734,5.386801


In [9]:
# Estimate fixed effects for each time period using dummy variables
import statsmodels.formula.api as smf

# Fit OLS with time dummies only (no city effects, just time fixed effects)
fe_time = smf.ols('y ~ C(time)', data=df).fit()

# Extract the fixed effects (intercept and time dummies)
fixed_effects = fe_time.params

# The constant is for time=0, and each C(time)[T.x] is the effect relative to time=0
fixed_effects_per_time = {t: fixed_effects['Intercept'] + fixed_effects.get(f'C(time)[T.{t}]', 0) for t in range(12)}
fixed_effects_per_time

{0: 2.560533928742988,
 1: 4.812815242679989,
 2: 6.364950616423025,
 3: 7.179399191897816,
 4: 8.053076440867905,
 5: 9.122902599339945,
 6: 8.750663383651567,
 7: 8.316077371753956,
 8: 8.075792834431955,
 9: 7.815888085291215,
 10: 5.521174535100574,
 11: 4.492187939253872}

In [11]:
# Estimate fixed effects for each city using dummy variables
fe_city = smf.ols('y ~ C(city)', data=df).fit()

# Extract the fixed effects (intercept and city dummies)
city_effects = fe_city.params

# The constant is for city=0, and each C(city)[T.x] is the effect relative to city=0
fixed_effects_per_city = {c: city_effects['Intercept'] + city_effects.get(f'C(city)[T.{c}]', 0) for c in range(10)}
fixed_effects_per_city

{0: 10.871199370503561,
 1: 5.315070941478359,
 2: 7.929660438406206,
 3: 12.874988521722786,
 4: 10.636539431330007,
 5: -0.14797355505949028,
 6: 7.979321861682254,
 7: 3.1923377883787696,
 8: 3.3690644903821134,
 9: 5.534342519037807}

In [13]:
num = 1000
X = np.clip(np.random.normal(3, 1, (num,)), 0.01, 100)
Z = np.clip(np.random.normal(3, 1, (num,)), 0.01, 100)
Y = np.log(X + Z) + np.random.normal(0, 1, (num,)) 
df = pd.DataFrame({'X': X, 'Z': Z, 'Y': Y})

In [14]:
df

Unnamed: 0,X,Z,Y
0,3.080303,3.663465,2.330887
1,3.244804,3.164956,2.070197
2,3.098013,2.005186,2.625644
3,2.659802,3.910397,0.636407
4,2.000600,3.912994,3.555588
...,...,...,...
995,2.496458,1.965517,0.465655
996,4.808224,4.196177,1.739227
997,3.853521,1.874653,0.783601
998,2.710847,1.837531,1.545752


## Question 3
For the following data, model np.exp(Y) as a function of X and Z. 

`num = 10000`

`X = np.clip(np.random.normal(3, 1, (num,)), 0.01, 100)`

`Z = np.clip(np.random.normal(3, 1, (num,)), 0.01, 100)`

`Y = np.log(X + Z) + np.random.normal(0, 1, (num,))`

With enough data, the coefficients of X and Y are closest to:

In [15]:
import numpy as np
from sklearn.linear_model import LinearRegression

# 1) simulate one dataset
n = 10_000
X = np.clip(np.random.normal(3,1,n), 0.01, 100)
Z = np.clip(np.random.normal(3,1,n), 0.01, 100)
Y = np.log(X + Z) + np.random.normal(0,1,n)

# 2) form the response
R = np.exp(Y)

# 3) fit the linear model
model = LinearRegression().fit(np.column_stack((X,Z)), R)
beta_X, beta_Z = model.coef_

In [16]:
M = 500              # number of Monte Carlo runs
betas = np.zeros((M, 2))
for m in range(M):
    # simulate X, Z, Y, form R, fit model ...
    betas[m] = LinearRegression().fit(
        np.column_stack((X,Z)), np.exp(Y)
    ).coef_
# then examine betas[:,0] and betas[:,1]
avg_beta_X = betas[:,0].mean()
avg_beta_Z = betas[:,1].mean()

In [17]:
print(f"Average beta_X: {avg_beta_X}, Average beta_Z: {avg_beta_Z}")

Average beta_X: 1.6823305043023216, Average beta_Z: 1.7489755757710623


In [28]:
import numpy as np
import statsmodels.api as sm

# simulate one dataset
num = 10_000
Z = np.random.normal(0, 1, num)
X = Z + np.random.normal(0, 1, num)
# note the heteroskedastic noise: variance = X**2
Y = 1.5*X + 2.3*Z + np.random.normal(0, X**2, num)

# fit OLS
XZ = sm.add_constant(np.column_stack((X, Z)))
model = sm.OLS(Y, XZ).fit()

# grab the (model‐based) standard error of β_X
se_model = model.bse[1]   # index 1 corresponds to X
print("Model-based SE of β_X:", se_model)

Model-based SE of β_X: 0.03376468368033493


In [23]:
import numpy as np
import statsmodels.api as sm

def one_beta():
    Z = np.random.normal(0, 1, num)
    X = Z + np.random.normal(0, 1, num)
    Y = 1.5*X + 2.3*Z + np.random.normal(0, X**2, num)
    XZ = sm.add_constant(np.column_stack((X, Z)))
    return sm.OLS(Y, XZ).fit().params[1]  # β̂_X

# run 100 MC replications
num = 10_000
M = 100
betas = np.array([one_beta() for _ in range(M)])

se_empirical = betas.std(ddof=1)
print("Empirical SE of β_X (across 100 sims):", se_empirical)

Empirical SE of β_X (across 100 sims): 0.05163886880051964
