# Homework 2

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

For questions 1 and 2: 
- Do a regression to estimate the fixed effect of each group. We assume that there is one single linear coefficient for all the data, plus the fixed effect of each group. Use the file homework_2.1.csv.  The variables G1, G2, and G3 are the outcomes and the time is the treatment.

In [2]:
df = pd.read_csv('homework_2.1.csv', index_col=0)
df

Unnamed: 0,time,G1,G2,G3
0,0,0.882026,1.441575,0.065409
1,1,0.210079,-0.163880,0.140310
2,2,0.509369,-0.115242,0.819830
3,3,1.150447,1.014698,0.607632
4,4,0.973779,-0.046562,0.610066
...,...,...,...,...
95,95,1.303287,1.364227,1.768446
96,96,0.965250,1.845895,1.258862
97,97,1.862935,1.881752,1.511477
98,98,1.043456,2.561618,1.030275


In [4]:
# 1b. Melt into long form so that each (group, time) is its own row
#    Resulting columns: [time, group, outcome]
df_long = df.melt(
    id_vars="time",
    value_vars=["G1", "G2", "G3"],
    var_name="group",
    value_name="outcome"
)
print(df_long.head())

   time group   outcome
0     0    G1  0.882026
1     1    G1  0.210079
2     2    G1  0.509369
3     3    G1  1.150447
4     4    G1  0.973779


2. Compute group‐specific means of both “outcome” and “time”

We need:
- $\bar{y}_g$ = mean outcome for each group $g\in\{G1,G2,G3\}$.
- $\bar{t}_g$ = mean “time” for each group g.

In [5]:
# 2a. Compute group means in a separate DataFrame
group_means = df_long.groupby("group").agg(
    mean_outcome = ("outcome", "mean"),
    mean_time    = ("time",    "mean")
).reset_index()

# group_means looks like:
#   group | mean_outcome | mean_time
#   ------|--------------|----------
#    "G1" |     0.524904  |   49.5
#    "G2" |     1.036006  |   49.5
#    "G3" |     0.715384  |   49.5

# 2b. Merge those group‐means back into the long DataFrame
df_long = df_long.merge(group_means, on="group")

# Now df_long columns are:
#   time | group | outcome | mean_outcome | mean_time

In [6]:
df_long

Unnamed: 0,time,group,outcome,mean_outcome,mean_time
0,0,G1,0.882026,0.524904,49.5
1,1,G1,0.210079,0.524904,49.5
2,2,G1,0.509369,0.524904,49.5
3,3,G1,1.150447,0.524904,49.5
4,4,G1,0.973779,0.524904,49.5
...,...,...,...,...,...
295,95,G3,1.768446,0.715384,49.5
296,96,G3,1.258862,0.715384,49.5
297,97,G3,1.511477,0.715384,49.5
298,98,G3,1.030275,0.715384,49.5



Because each group (“G1”, “G2”, “G3”) has exactly 100 time‐points $\{0,1,2,\dots,99 \} $, you’ll see $ \text{ meantime } = 49.5$ for all three groups.

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# 1) LOAD THE DATASET
df = pd.read_csv("homework_2.2.csv")   # columns: [X, Y, Z]

# Quick sanity check:
print(df.head())
#    X          Y         Z
# 0  0   1.182435 -0.725820
# 1  0   2.714474  0.563476
# 2  0   0.077612 -0.435632
# 3  0  -0.154449 -0.104553
# 4  0  22.298992 -2.321273
# …


# 2) SET UP BOOTSTRAP PARAMETERS
n_boot = 1000
np.random.seed(42)   # for reproducibility

# Lists to store each iteration’s estimate:
naive_effects      = []
regression_effects = []


# 3) BOOTSTRAP LOOP
for _ in range(n_boot):
    # 3a. Sample with replacement
    bs_sample = df.sample(frac=1, replace=True)
    
    # 3b. Naive mean-difference:
    mean_treated   = bs_sample.loc[bs_sample["X"] == 1, "Y"].mean()
    mean_untreated = bs_sample.loc[bs_sample["X"] == 0, "Y"].mean()
    naive_effects.append(mean_treated - mean_untreated)
    
    # 3c. Regression-adjusted effect:
    #      Refit OLS: Y ~ X + Z (with intercept)
    X_reg = sm.add_constant(bs_sample[["X", "Z"]])
    y_reg = bs_sample["Y"]
    model  = sm.OLS(y_reg, X_reg).fit()
    
    #      Extract the coefficient on “X”
    regression_effects.append(model.params["X"])


# 4) COMPUTE SUMMARIES

# 4a. Naive effect summary
naive_arr = np.array(naive_effects)
print("\nNaive mean-difference (bootstrap) :")
print(f"  • Mean estimate = {naive_arr.mean():.4f}")
print(f"  • Std. Dev.     = {naive_arr.std(ddof=1):.4f}")
ci_lo = np.quantile(naive_arr, 0.025)
ci_hi = np.quantile(naive_arr, 0.975)
print(f"  • 95% CI       = ({ci_lo:.4f}, {ci_hi:.4f})")


# 4b. Regression-adjusted effect summary
reg_arr = np.array(regression_effects)
print("\nRegression-adjusted (Y ~ X + Z) (bootstrap) :")
print(f"  • Mean estimate = {reg_arr.mean():.4f}")
print(f"  • Std. Dev.     = {reg_arr.std(ddof=1):.4f}")
ci_lo2 = np.quantile(reg_arr, 0.025)
ci_hi2 = np.quantile(reg_arr, 0.975)
print(f"  • 95% CI       = ({ci_lo2:.4f}, {ci_hi2:.4f})")


# 5) OPTIONAL: LOOK AT SKEWNESS OR HISTOGRAM
from scipy.stats import skew
print("\nSkewness:")
print(f"  • Naive distribution skew = {skew(naive_arr):.4f}")
print(f"  • Reg-adj distribution skew = {skew(reg_arr):.4f}")


# 6) OPTIONAL: PLOT BOTH HISTOGRAMS (UNCOMMENT IF YOU WANT TO SEE PLOTS)
# import matplotlib.pyplot as plt
# plt.figure(figsize=(12,4))
# plt.subplot(1,2,1)
# plt.hist(naive_arr, bins=30, edgecolor="k", alpha=0.7)
# plt.title("Bootstrap: Naive mean-difference")
# plt.xlabel("Effect estimate"); plt.ylabel("Frequency")
# 
# plt.subplot(1,2,2)
# plt.hist(reg_arr, bins=30, edgecolor="k", alpha=0.7)
# plt.title("Bootstrap: Regression-adjusted (X ~ Z)")
# plt.xlabel("Effect estimate"); plt.ylabel("Frequency")
# plt.tight_layout()
# plt.show()

   Unnamed: 0  X          Y         Z
0           0  0   1.182435 -0.725820
1           1  0   2.714474  0.563476
2           2  0   0.077612 -0.435632
3           3  0  -0.154449 -0.104553
4           4  0  22.298992 -2.321273

Naive mean-difference (bootstrap) :
  • Mean estimate = 2.9208
  • Std. Dev.     = 0.1767
  • 95% CI       = (2.5783, 3.2827)

Regression-adjusted (Y ~ X + Z) (bootstrap) :
  • Mean estimate = 2.8197
  • Std. Dev.     = 0.1694
  • 95% CI       = (2.4819, 3.1531)

Skewness:
  • Naive distribution skew = 0.0872
  • Reg-adj distribution skew = 0.0270


In [2]:
df

Unnamed: 0.1,Unnamed: 0,X,Y,Z
0,0,0,1.182435,-0.725820
1,1,0,2.714474,0.563476
2,2,0,0.077612,-0.435632
3,3,0,-0.154449,-0.104553
4,4,0,22.298992,-2.321273
...,...,...,...,...
9995,9995,0,0.019371,-0.409462
9996,9996,0,2.581533,0.545860
9997,9997,0,0.209599,-0.486216
9998,9998,0,16.829356,-2.045500
