<a href="https://colab.research.google.com/github/sokrypton/7.571/blob/main/L2/CLT_Bootstrap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Normal Distribution

In [None]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Parameters of the normal population we sample from
mu = 2.0     # population mean (passed as loc)
sigma = 4.0  # population standard deviation (passed as scale)

# Seed NumPy's global generator so that Restart & Run All reproduces the same
# numbers; change or remove the seed to see a different random sample.
np.random.seed(0)

# Draw ONE sample of n observations and look at its histogram
n = 100
samples = np.random.normal(loc=mu, scale=sigma, size=n)
plt.hist(samples, bins=10)
plt.xlabel('Value')
plt.ylabel('Count')
plt.title(f'One sample of n = {n} from Normal(μ = {mu}, σ = {sigma})')
plt.show()

In [None]:
# Point estimate of μ: the arithmetic mean of the sample drawn above
x_hat = samples.mean()
print(x_hat)

In [None]:
# Estimated standard error of the mean: s / sqrt(n).
# ddof=1 makes np.std return the sample standard deviation s (divides by
# n - 1); the default ddof=0 divides by n and gives a slightly smaller value.
SE = np.std(samples, ddof=1) / np.sqrt(n)
print(SE)

In [None]:
# Repeat the experiment many times.
# All experiments are drawn in one call as a (num_experiments, n) array, one
# experiment per row, and each row is averaged to get that experiment's mean.
# Drawing into a separate name (instead of looping and reassigning `samples`
# and `x_hat`) leaves the single-sample results from the cells above intact.
num_experiments = 1000
repeated_samples = np.random.normal(loc=mu, scale=sigma, size=(num_experiments, n))
sample_means = repeated_samples.mean(axis=1)

In [None]:
# Histogram of the sample means, with the true mean and ±1 theoretical
# standard error (σ/√n) marked as vertical lines
theoretical_se = sigma / np.sqrt(n)

fig, ax = plt.subplots()
ax.hist(sample_means, bins=30, edgecolor='white')
ax.axvline(mu, color='red', linestyle='--', label=f'μ = {mu}')
ax.axvline(mu - theoretical_se, color='orange', linestyle='--', label='μ ± σ/√n')
ax.axvline(mu + theoretical_se, color='orange', linestyle='--')
ax.set_xlabel('Sample Mean')
ax.set_title('Distribution of Sample Means')
ax.legend()
plt.show()

# Compare the observed spread of the sample means with the σ/√n prediction
observed_se = np.std(sample_means)
print("Verifying CLT")
print("=" * 40)
print(f"Observed std of sample means:  {observed_se:.4f}")
print(f"Theoretical σ/√n:              {theoretical_se:.4f}")

## Exponential Distribution

In [None]:
# Same experiment, but drawing from an exponential population instead
lam = 0.5  # rate λ; np.random.exponential is parameterized by scale = 1/λ
n = 100
samples = np.random.exponential(scale=1 / lam, size=n)

# Histogram of this single sample
fig, ax = plt.subplots()
ax.hist(samples, bins=10)
ax.set_title('Exponential Distribution (NOT normal!)')
plt.show()

In [None]:
# Repeat the exponential experiment many times.
# All experiments are drawn in one call as a (num_experiments, n) array, one
# experiment per row, and each row is averaged to get that experiment's mean.
# Using a separate name keeps `samples` (the sample plotted above) unchanged.
num_experiments = 1000
repeated_samples = np.random.exponential(scale=1/lam, size=(num_experiments, n))
sample_means = repeated_samples.mean(axis=1)

In [None]:
# An exponential with rate λ has mean 1/λ and standard deviation 1/λ
mu = sigma = 1 / lam

# CLT prediction for the spread of the sample mean
theoretical_se = sigma / np.sqrt(n)

# Histogram of the sample means, with μ and μ ± σ/√n marked
fig, ax = plt.subplots()
ax.hist(sample_means, bins=30, edgecolor='white')
ax.axvline(mu, color='red', linestyle='--', label=f'μ = {mu}')
ax.axvline(mu - theoretical_se, color='orange', linestyle='--', label='μ ± σ/√n')
ax.axvline(mu + theoretical_se, color='orange', linestyle='--')
ax.set_xlabel('Sample Mean')
ax.set_title('Distribution of Sample Means (normal!)')
ax.legend()
plt.show()

# Compare the observed spread of the sample means with the σ/√n prediction
observed_se = np.std(sample_means)
print("Verifying CLT")
print("=" * 40)
print(f"Observed std of sample means:  {observed_se:.4f}")
print(f"Theoretical σ/√n:              {theoretical_se:.4f}")

## Bootstrap

In [None]:
## Step 1: The Problem

# In real life, you only have ONE sample
# And you DON'T know the true μ or σ

lam = 0.5  # In real life, you wouldn't know this!
n = 100
my_sample = np.random.exponential(scale=1/lam, size=n)

# ddof=1 gives the sample standard deviation s (divides by n - 1), which is
# what the "s" label refers to; np.std's default ddof=0 divides by n instead.
print("In real life, all you have is your sample:")
print(f"Sample mean (x̄): {np.mean(my_sample):.3f}")
print(f"Sample std (s):  {np.std(my_sample, ddof=1):.3f}")
print(f"Sample size (n): {n}")

In [None]:
## Step 2: CLT Approach Works for the Mean

# Estimate the SE of the mean with the CLT formula s / sqrt(n).
# ddof=1 makes np.std return the sample standard deviation s (divides by
# n - 1) rather than the default divide-by-n version.
SE = np.std(my_sample, ddof=1) / np.sqrt(n)
print(f"SE of mean (using formula): {SE:.4f}")

In [None]:
## Step 3: But What About Other Statistics?

# The median itself is easy to compute from the sample...
sample_median = np.median(my_sample)
print(f"Sample median: {sample_median:.3f}")

# ...but there is no simple s/√n-style formula to plug in for its SE
print("SE of median:  ???")

In [None]:
## Step 4: The Bootstrap Idea

# Key insight: the sample is our best available picture of the population,
# so we draw new samples FROM the sample itself.

# Resampling WITH REPLACEMENT: n draws from my_sample, where the same
# observation may be picked more than once
resample = np.random.choice(my_sample, size=n, replace=True)

# Show the first ten values of each, rounded to two decimals
print("Original sample (first 10):", np.round(my_sample[:10], 2))
print("One resample (first 10):   ", np.round(resample[:10], 2))


In [None]:
## Step 5: Do It Many Times

# Draw many resamples and record the mean of each one
num_resamples = 1000
bootstrap_means = np.empty(num_resamples)  # one slot per resample

for i in range(num_resamples):
    resample = np.random.choice(my_sample, size=n, replace=True)
    bootstrap_means[i] = resample.mean()

In [None]:
## Step 6: The Bootstrap SE

# The std of the bootstrap means ≈ SE of the mean
bootstrap_SE = np.std(bootstrap_means)

# Formula estimate s / sqrt(n); ddof=1 makes np.std return the sample
# standard deviation s (divides by n - 1), matching the "s/√n" label below.
formula_SE = np.std(my_sample, ddof=1) / np.sqrt(n)

print("SE of the Mean")
print("=" * 40)
print(f"Bootstrap SE:     {bootstrap_SE:.4f}")
print(f"Formula SE (s/√n): {formula_SE:.4f}")
print("\nThey match! Bootstrap works.")

In [None]:
## Step 7: Visualize It

# Histogram of the bootstrap means, with the original sample mean marked
fig, ax = plt.subplots()
ax.hist(bootstrap_means, bins=30, edgecolor='white')
ax.axvline(np.mean(my_sample), color='red', linestyle='--', label='Sample mean')
ax.set_xlabel('Bootstrap Sample Mean')
ax.set_title('Bootstrap Distribution of the Mean')
ax.legend()
plt.show()

In [None]:
## Step 8: Now the Magic — SE of the Median!

# Same recipe as for the mean, with np.median as the statistic: the bootstrap
# needs no s/√n-style formula for the statistic it is applied to
bootstrap_medians = np.empty(num_resamples)  # one slot per resample

for i in range(num_resamples):
    resample = np.random.choice(my_sample, size=n, replace=True)
    bootstrap_medians[i] = np.median(resample)

# The spread of the bootstrap medians is the bootstrap SE of the median
print("SE of the Median")
print("=" * 40)
print(f"Sample median:  {np.median(my_sample):.4f}")
print(f"Bootstrap SE:   {np.std(bootstrap_medians):.4f}")

In [None]:
## Step 8b: Check That Bootstrap Gives the Right SE for the Median

# To check the bootstrap we need a reference value for the "true" SE.
# Do the real experiment many times (just like we did for the CLT): draw
# num_experiments fresh samples from the population, one per row, and take
# the median of each row.

num_experiments = 1000
repeated_samples = np.random.exponential(scale=1/lam, size=(num_experiments, n))
sample_medians = np.median(repeated_samples, axis=1)

# Spread of the median across independent experiments
true_SE_median = np.std(sample_medians)

# The experiment count in the label is taken from num_experiments so the
# printed text stays correct if that number is changed.
print("Verifying Bootstrap for the Median")
print("=" * 45)
print(f"True SE (from {num_experiments} experiments):  {true_SE_median:.4f}")
print(f"Bootstrap SE (from ONE sample):   {np.std(bootstrap_medians):.4f}")
print()
print("Bootstrap estimates the SE using only ONE sample!")