<a href="https://colab.research.google.com/github/shatlykgurdov/3.1.2/blob/main/1d.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# -----------------------------
# Settings & true parameters
# -----------------------------
# Seed NumPy's global random state so repeated runs give the same draws.
np.random.seed(123)

# True intercept and slopes of the data-generating process.
a_true, b_true, c_true = 1.0, 2.0, 3.0
sigma_e = 1.0        # scale (std. dev.) of the error term
rho = 0.7            # correlation between X and Z

# Monte Carlo design
sample_sizes = [50, 200, 1000]
n_sim = 1000         # number of simulations per sample size

# -----------------------------
# Helper: simulate once and estimate models
# -----------------------------
def simulate_once(N, rng=None):
    """
    Simulate one dataset and estimate the full and the omitted-variable model.

    The data are generated from
        Y = a_true + b_true*X + c_true*Z + e
    where X ~ N(0, 1), Z = rho*X + sqrt(1 - rho**2)*u with u ~ N(0, 1)
    (so Z has unit variance and correlation rho with X), and
    e ~ N(0, sigma_e). Two OLS regressions are then fitted:
      - Full model:    Y ~ const + X + Z
      - Omitted model: Y ~ const + X

    Parameters:
      N   : number of observations to simulate.
      rng : optional object exposing ``normal(loc, scale, size=...)``
            (e.g. a numpy.random.Generator). When None, the global
            ``np.random`` state is used, so ``np.random.seed`` controls
            reproducibility.

    Returns a 4-tuple:
      b_hat_full : estimated coefficient on X in the full model
      b_hat_omit : estimated coefficient on X in the omitted model
      model_full : fitted statsmodels results for the full model
      model_omit : fitted statsmodels results for the omitted model
    """
    # Fall back to the module-level random state to keep the default
    # behaviour (and the seeded random stream) unchanged.
    source = np.random if rng is None else rng

    # Draw order matters for reproducibility: X first, then u, then e.
    X = source.normal(0, 1, size=N)
    # Independent noise used to build Z
    u = source.normal(0, 1, size=N)
    # Make Z correlated with X
    Z = rho * X + np.sqrt(1 - rho**2) * u
    # Error term
    e = source.normal(0, sigma_e, size=N)
    # True model
    Y = a_true + b_true * X + c_true * Z + e

    # Full model: Y ~ X + Z (add_constant prepends the intercept column,
    # so index 1 of params is the coefficient of X)
    X_full = sm.add_constant(np.column_stack([X, Z]))
    model_full = sm.OLS(Y, X_full).fit()
    b_hat_full = model_full.params[1]

    # Omitted model: Y ~ X (Z is left out on purpose)
    X_omit = sm.add_constant(X)
    model_omit = sm.OLS(Y, X_omit).fit()
    b_hat_omit = model_omit.params[1]

    return b_hat_full, b_hat_omit, model_full, model_omit

# -----------------------------
# Part 1: Single example run
# -----------------------------
# One draw with N = 200: compare both slope estimates with the true value,
# then show the regression tables.
print("=== Single example run (N = 200) ===")
b_full, b_omit, model_full, model_omit = simulate_once(200)

for caption, value in (
    ("True b:", b_true),
    ("Full model b_hat:", round(b_full, 4)),
    ("Omitted model b_hat:", round(b_omit, 4)),
):
    print(caption, value)

for heading, fitted in (
    ("\nFull model summary:", model_full),
    ("\nOmitted model summary:", model_omit),
):
    print(heading)
    print(fitted.summary())

# -----------------------------
# Part 2: Many simulations to study bias & variance
# -----------------------------
# For every sample size, repeat the simulation n_sim times and store the
# mean / standard deviation of both slope estimates plus the raw draws.
results = {}

for N in sample_sizes:
    # Only the two slope estimates are kept; the fitted models are dropped.
    estimates = [simulate_once(N)[:2] for _ in range(n_sim)]
    b_full_list = [pair[0] for pair in estimates]
    b_omit_list = [pair[1] for pair in estimates]

    results[N] = dict(
        b_full_mean=np.mean(b_full_list),
        b_full_std=np.std(b_full_list),
        b_omit_mean=np.mean(b_omit_list),
        b_omit_std=np.std(b_omit_list),
        b_full_list=b_full_list,
        b_omit_list=b_omit_list,
    )

# Report, per sample size, the mean and standard deviation of each estimator
print("\n=== Simulation summary over", n_sim, "runs ===")
print("True b:", b_true)

# (line prefix, key of the mean, key of the std) for each estimator
summary_rows = (
    ("Full model:   mean(b_hat) =", "b_full_mean", "b_full_std"),
    ("Omitted model: mean(b_hat) =", "b_omit_mean", "b_omit_std"),
)

for N in sample_sizes:
    stats = results[N]
    print(f"\nSample size N = {N}")
    for prefix, mean_key, std_key in summary_rows:
        print(prefix,
              round(stats[mean_key], 3),
              ", std(b_hat) =",
              round(stats[std_key], 3))

# -----------------------------
# Part 3: Plot distribution for one N
# -----------------------------
# Overlay the histograms of both estimators for one sample size and mark
# the true coefficient with a dashed vertical line.
N_plot = 200   # choose which N to visualize (must be a key of results)
b_full_list = results[N_plot]["b_full_list"]
b_omit_list = results[N_plot]["b_omit_list"]

# Same binning and transparency for both histograms so they can overlap.
hist_kwargs = {"bins": 30, "alpha": 0.5}
plt.hist(b_full_list, label="Full model b_hat", **hist_kwargs)
plt.hist(b_omit_list, label="Omitted model b_hat", **hist_kwargs)
plt.axvline(b_true, linestyle="--", label="True b")

plt.title(f"Distribution of b_hat (N = {N_plot})")
plt.xlabel("b_hat")
plt.ylabel("Frequency")
plt.legend()
plt.show()
