<a href="https://colab.research.google.com/github/sokrypton/7.571/blob/main/L2/CLT_Bootstrap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Central Limit Theorem & Bootstrap

**Goals:**
1. Understand the Central Limit Theorem (CLT) - a closed-form solution to estimate standard error
2. Understand Bootstrap - a computational approach to estimate standard error

---

In [None]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt

# The Problem: Comparing Two Populations

Imagine you're comparing gene expression between control and treatment groups.

You measure samples from each group and calculate the means. But how do you know if the difference is **real** or just **sampling noise**?

In [None]:
# True means of the two populations being compared, plus their shared spread
mu_control, mu_treatment = 10.0, 12.0
sigma = 4.0  # both groups have the same variability

# Draw n observations per group (control first, then treatment)
n = 50
control_sample = np.random.normal(mu_control, sigma, n)
treatment_sample = np.random.normal(mu_treatment, sigma, n)

# Point estimates: the average of each sample
control_mean = control_sample.mean()
treatment_mean = treatment_sample.mean()

# Show both means and their difference, then pose the motivating question
print(
    f"Control mean:   {control_mean:.2f}",
    f"Treatment mean: {treatment_mean:.2f}",
    f"Difference:     {treatment_mean - control_mean:.2f}",
    "",
    "But wait... how do we know this difference is real?",
    "What if we just got unlucky with our samples?",
    sep="\n",
)

## This is why we need Standard Error!

**Standard Error (SE)** tells us how much our estimate might vary if we took a different sample.

Formula: **SE = s / sqrt(n)**

In [None]:
# Standard error of each group's mean: SE = s / sqrt(n).
# ddof=1 makes np.std return the sample standard deviation s (divisor n - 1).
# The np.std default (ddof=0) is the population formula, which slightly
# underestimates the spread when it is computed from a sample.
control_SE = np.std(control_sample, ddof=1) / np.sqrt(n)
treatment_SE = np.std(treatment_sample, ddof=1) / np.sqrt(n)

# Report each mean together with its standard error
print(f"Control:   {control_mean:.2f} +/- {control_SE:.2f}")
print(f"Treatment: {treatment_mean:.2f} +/- {treatment_SE:.2f}")
print()
print("Now we can see how confident we are in each estimate!")

## But where does this formula come from?

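**The Central Limit Theorem** tells us that the mean of n independent samples is approximately normally distributed, with standard deviation sigma / sqrt(n) - and that standard deviation is the SE.

Let's verify it by simulation!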

---

# Part 1: Central Limit Theorem (CLT)

## Normal Distribution

In [None]:
# True parameters of the population we draw from
mu, sigma = 2.0, 4.0  # population mean, population std

# A single experiment: n independent draws from Normal(mu, sigma)
n = 100
samples = np.random.normal(mu, sigma, n)

# Histogram of the raw observations in that one sample
plt.hist(samples, bins=10)
plt.title('One Sample from Normal Distribution')
plt.show()

In [None]:
# Point estimate of mu: the average of the observations in our one sample
x_hat = samples.mean()
print(f"Sample mean: {x_hat:.4f}")

In [None]:
# Estimated standard error from the single sample: SE = s / sqrt(n).
# ddof=1 gives the sample standard deviation s (divisor n - 1); the np.std
# default (ddof=0) is the population formula.
SE = np.std(samples, ddof=1) / np.sqrt(n)
print(f"Standard Error: {SE:.4f}")

### Let's verify CLT by repeating the experiment many times

In [None]:
# Simulate many independent experiments, keeping the mean of each one
num_experiments = 1000
sample_means = np.empty(num_experiments)

for i in range(num_experiments):
    # one experiment = n fresh draws from the same Normal(mu, sigma)
    samples = np.random.normal(mu, sigma, n)
    x_hat = samples.mean()
    sample_means[i] = x_hat

In [None]:
# Histogram of the sample means collected from the repeated experiments,
# with the true population mean marked in red
plt.hist(sample_means, bins=30, edgecolor='white')
plt.axvline(mu, color='red', linestyle='--', label=f'mu = {mu}')

# Mark one theoretical standard error on either side of mu
theoretical_se = sigma / np.sqrt(n)
plt.axvline(mu - theoretical_se, color='orange', linestyle='--', label='mu +/- sigma/sqrt(n)')
plt.axvline(mu + theoretical_se, color='orange', linestyle='--')

plt.xlabel('Sample Mean')
plt.title('Distribution of Sample Means')
plt.legend()
plt.show()

# Draw one fresh sample and estimate the SE from it alone: s / sqrt(n).
# ddof=1 gives the sample standard deviation s (divisor n - 1); the np.std
# default (ddof=0) is the population formula.
one_sample = np.random.normal(loc=mu, scale=sigma, size=n)
estimated_se = np.std(one_sample, ddof=1) / np.sqrt(n)

# Compare the simulated spread of the means with both SE values
print("Verifying CLT")
print("=" * 50)
print(f"Observed std of sample means:   {np.std(sample_means):.4f}")
print(f"Theoretical SE (sigma/sqrt(n)): {theoretical_se:.4f}  <- uses true sigma")
print(f"Estimated SE (s/sqrt(n)):       {estimated_se:.4f}  <- uses sample s")
print()
print("In real life, we don't know sigma, so we use s/sqrt(n)")

## Exponential Distribution

CLT works for ANY distribution with finite variance - let's try one that is clearly NOT normal!

In [None]:
# Switch to a skewed population: exponential with rate lam (scale = 1/lam)
lam = 0.5
n = 100
samples = np.random.exponential(1 / lam, n)

# Histogram of the raw observations in one sample
plt.hist(samples, bins=10)
plt.title('Exponential Distribution (NOT normal!)')
plt.show()

In [None]:
# Simulate many independent experiments, keeping the mean of each one
num_experiments = 1000
sample_means = np.empty(num_experiments)

for i in range(num_experiments):
    # one experiment = n fresh draws from the exponential population
    samples = np.random.exponential(1 / lam, n)
    sample_means[i] = samples.mean()

In [None]:
# For an exponential with rate lambda, the mean and the standard deviation
# are both 1/lambda
mu = 1/lam
sigma = 1/lam

# Histogram of the sample means collected from the repeated experiments,
# with the true population mean marked in red
plt.hist(sample_means, bins=30, edgecolor='white')
plt.axvline(mu, color='red', linestyle='--', label=f'mu = {mu}')

# Mark one theoretical standard error on either side of mu
theoretical_se = sigma / np.sqrt(n)
plt.axvline(mu - theoretical_se, color='orange', linestyle='--', label='mu +/- sigma/sqrt(n)')
plt.axvline(mu + theoretical_se, color='orange', linestyle='--')

plt.xlabel('Sample Mean')
plt.title('Distribution of Sample Means (normal!)')
plt.legend()
plt.show()

# Draw one fresh sample and estimate the SE from it alone: s / sqrt(n).
# ddof=1 gives the sample standard deviation s (divisor n - 1); the np.std
# default (ddof=0) is the population formula.
one_sample = np.random.exponential(scale=1/lam, size=n)
estimated_se = np.std(one_sample, ddof=1) / np.sqrt(n)

# Compare the simulated spread of the means with both SE values
print("Verifying CLT")
print("=" * 50)
print(f"Observed std of sample means:   {np.std(sample_means):.4f}")
print(f"Theoretical SE (sigma/sqrt(n)): {theoretical_se:.4f}  <- uses true sigma")
print(f"Estimated SE (s/sqrt(n)):       {estimated_se:.4f}  <- uses sample s")
print()
print("In real life, we don't know sigma, so we use s/sqrt(n)")

## So What Does This Mean?

We ran 1000 simulated experiments to verify that the CLT works.

**But in real research, you only get ONE sample!**

CLT tells us:
- We can estimate standard error from just one sample using: `SE = s / sqrt(n)`
- This tells us how much to trust our estimate of the mean

**But what if we want SE of something other than the mean?**

---
# Part 2: Bootstrap

## Step 1: The Problem

In [None]:
# Mimic a real study: one sample is all we get, and the population's
# true mu and sigma are hidden from us.

lam = 0.5  # true rate -- unknown in a real study, used here only to generate data
n = 100
my_sample = np.random.exponential(1 / lam, n)

# Everything we can report comes from the sample itself
print(
    "In real life, all you have is your sample:",
    f"Sample mean: {my_sample.mean():.3f}",
    f"Sample std:  {my_sample.std():.3f}",
    f"Sample size: {n}",
    sep="\n",
)

## Step 2: CLT Approach Works for the Mean

In [None]:
# Estimate the SE of the mean with the CLT formula: SE = s / sqrt(n).
# ddof=1 gives the sample standard deviation s (divisor n - 1); the np.std
# default (ddof=0) is the population formula.
SE = np.std(my_sample, ddof=1) / np.sqrt(n)
print(f"SE of mean (using formula): {SE:.4f}")

## Step 3: But What About Other Statistics?

In [None]:
# The median itself is easy to compute from the sample...
sample_median = np.median(my_sample)
print(f"Sample median: {sample_median:.3f}")
# ...but s / sqrt(n) does not apply to it, so its SE is left as a placeholder
print("SE of median:  ???")

## Step 4: The Bootstrap Idea

Key insight: **Your sample is your best estimate of the population.**

So... what if we resample FROM our sample?

In [None]:
# One bootstrap resample: draw n values from my_sample WITH replacement,
# so a given observation may be picked several times or not at all
resample = np.random.choice(my_sample, replace=True, size=n)

# Show the first few values of the original and the resample side by side
print("Original sample (first 10):", np.round(my_sample[:10], 2))
print("One resample (first 10):   ", np.round(resample[:10], 2))
print("\nNotice: some values repeat, some are missing!")

## Step 5: Do It Many Times

In [None]:
# Build the bootstrap distribution of the mean: one mean per resample
num_resamples = 1000
bootstrap_means = np.empty(num_resamples)

for i in range(num_resamples):
    # each resample has the same size as the original sample
    resample = np.random.choice(my_sample, replace=True, size=n)
    bootstrap_means[i] = resample.mean()

## Step 6: The Bootstrap SE

In [None]:
# The spread (std) of the bootstrap means is the bootstrap estimate of the
# SE of the mean
bootstrap_SE = np.std(bootstrap_means)

# CLT formula for comparison: SE = s / sqrt(n).
# ddof=1 gives the sample standard deviation s (divisor n - 1); the np.std
# default (ddof=0) is the population formula.
formula_SE = np.std(my_sample, ddof=1) / np.sqrt(n)

print("SE of the Mean")
print("=" * 40)
print(f"Bootstrap SE:        {bootstrap_SE:.4f}")
print(f"Formula SE (s/sqrt(n)): {formula_SE:.4f}")
print("\nThey match! Bootstrap works.")

## Step 7: Visualize It

In [None]:
# Histogram of the bootstrap means, with the original sample mean marked in red
plt.hist(bootstrap_means, bins=30, edgecolor='white')
plt.axvline(my_sample.mean(), color='red', linestyle='--', label='Sample mean')

plt.title('Bootstrap Distribution of the Mean')
plt.xlabel('Bootstrap Sample Mean')
plt.legend()
plt.show()

## Step 8: Now the Magic - SE of the Median!

No simple formula exists (the textbook one needs the unknown population density at the median), but bootstrap doesn't care!

In [None]:
# Same resampling recipe as for the mean, but record the median each time
bootstrap_medians = np.empty(num_resamples)

for i in range(num_resamples):
    resample = np.random.choice(my_sample, replace=True, size=n)
    bootstrap_medians[i] = np.median(resample)

# The std of the bootstrap medians is the bootstrap SE of the median
print("SE of the Median")
print("=" * 40)
print(f"Sample median:  {np.median(my_sample):.4f}")
print(f"Bootstrap SE:   {bootstrap_medians.std():.4f}")
print("\nNo formula needed!")

## Step 9: Prove Bootstrap Gives Correct SE for Median

To verify bootstrap works, let's compare to the "true" SE (from many experiments).

In [None]:
# Reference value: repeat the real experiment many times (fresh samples from
# the population, as in the CLT section) and record each sample's median
num_experiments = 1000
sample_medians = np.empty(num_experiments)

for i in range(num_experiments):
    samples = np.random.exponential(1 / lam, n)
    sample_medians[i] = np.median(samples)

# Spread of the medians across experiments = the "true" SE of the median
true_SE_median = sample_medians.std()

print("Verifying Bootstrap for the Median")
print("=" * 45)
print(f"True SE (from 1000 experiments):  {true_SE_median:.4f}")
print(f"Bootstrap SE (from ONE sample):   {bootstrap_medians.std():.4f}")
print()

---
# Summary

## Central Limit Theorem (CLT)
- **What:** Distribution of sample means approaches normal, regardless of population shape
- **Formula:** SE = sigma / sqrt(n)  (estimate with s / sqrt(n))
- **Limitation:** Only works for the mean

## Bootstrap
- **What:** Computational method to estimate SE of ANY statistic
- **How:** Resample with replacement -> Calculate statistic -> Repeat many times -> Take std
- **Advantage:** Works for mean, median, or any statistic you can compute!

| Method | SE Formula | Works For |
|--------|------------|----------|
| CLT | s / sqrt(n) | Mean only |
| Bootstrap | std(bootstrap statistics) | Anything! |

# BONUS: Anscombe's Quartet + Bootstrap

**The question:** Four datasets have the same summary statistics... but should we trust them equally?

In [None]:
import numpy as np
import matplotlib.pyplot as plt

## Anscombe's Quartet

Four datasets with nearly identical summary statistics (means, correlation, regression slope) that look very different when plotted.

In [None]:
# Anscombe's quartet as (label, x values, y values) rows.
# Datasets I-III list the same x values; dataset IV has x = 8 everywhere
# except for a single point at x = 19.
_quartet_rows = [
    ('I',
     [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5],
     [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]),
    ('II',
     [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5],
     [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74]),
    ('III',
     [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5],
     [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73]),
    ('IV',
     [8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8],
     [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89]),
]

# Dataset label -> {'x': array, 'y': array}; every entry gets its own arrays
anscombe = {
    label: {'x': np.array(xs), 'y': np.array(ys)}
    for label, xs, ys in _quartet_rows
}

## They all have the same statistics!

In [None]:
def calc_correlation(x, y):
    """Return the Pearson correlation coefficient between x and y."""
    # np.corrcoef yields the 2x2 correlation matrix of (x, y); the
    # off-diagonal entry is the x-y correlation
    corr_matrix = np.corrcoef(x, y)
    return corr_matrix[0, 1]

def calc_slope(x, y):
    """Return the least-squares slope of y regressed on x.

    The slope is cov(x, y) / var(x). Both terms are read from the same
    np.cov matrix so that they share one normalisation (divisor n - 1).
    Dividing np.cov's covariance by np.var(x), whose default divisor is n,
    would inflate the slope by a factor of n / (n - 1).

    Parameters
    ----------
    x, y : 1-D sequences of numbers with the same length.

    Returns
    -------
    The slope as a float. If x is constant its variance is zero and the
    result is not a finite number.
    """
    cov_matrix = np.cov(x, y)  # [[var(x), cov(x, y)], [cov(x, y), var(y)]]
    return cov_matrix[0, 1] / cov_matrix[0, 0]

# Table header
print("Summary Statistics for All Four Datasets")
print("=" * 50)
print(f"{'Dataset':<10} {'Mean X':<10} {'Mean Y':<10} {'Correlation':<12} {'Slope'}")
print("-" * 50)

# One row per dataset: means of x and y, correlation, regression slope
for name, data in anscombe.items():
    mx = data['x'].mean()
    my = data['y'].mean()
    corr = calc_correlation(data['x'], data['y'])
    slope = calc_slope(data['x'], data['y'])
    print(f"{name:<10} {mx:<10.2f} {my:<10.2f} {corr:<12.3f} {slope:.3f}")

print("\nThey're all (nearly) identical!")

## But they look completely different!

In [None]:
# 2x2 grid of panels, one per dataset, walked in reading order
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
axes = axes.ravel()

for i, (name, data) in enumerate(anscombe.items()):
    ax = axes[i]
    ax.scatter(data['x'], data['y'], s=60)

    # Least-squares line: it passes through the point (mean x, mean y)
    slope = calc_slope(data['x'], data['y'])
    intercept = np.mean(data['y']) - slope * np.mean(data['x'])
    x_line = np.linspace(3, 20, 100)
    ax.plot(x_line, slope * x_line + intercept, 'r--', alpha=0.7)

    # Identical axis ranges on every panel so the shapes can be compared
    ax.set_title(f"Dataset {name}\nr = {calc_correlation(data['x'], data['y']):.3f}")
    ax.set(xlabel='x', ylabel='y', xlim=(3, 20), ylim=(3, 14))

plt.tight_layout()
plt.show()

print("Same correlation (0.816), but very different relationships!")

## Bootstrap: How confident should we be?

Let's bootstrap the correlation for each dataset and see how stable it is.

In [None]:
def bootstrap_correlation(x, y, num_resamples=1000):
    """Return bootstrap replicates of the correlation between x and y.

    Each replicate resamples (x, y) pairs with replacement, adds a small
    Gaussian jitter (std 0.001) to both coordinates, and computes the
    correlation with calc_correlation.

    Returns a 1-D array holding num_resamples correlation values.
    """
    n_points = len(x)
    jitter_sd = 0.001
    replicates = np.empty(num_resamples)

    for k in range(num_resamples):
        # Pick row indices with replacement so x and y stay paired
        picks = np.random.choice(n_points, size=n_points, replace=True)

        # Tiny noise keeps a resample from having exactly zero variance
        x_boot = x[picks] + np.random.normal(0, jitter_sd, size=n_points)
        y_boot = y[picks] + np.random.normal(0, jitter_sd, size=n_points)

        replicates[k] = calc_correlation(x_boot, y_boot)

    return replicates

In [None]:
# Bootstrap replicates of the correlation, keyed by dataset label
bootstrap_results = {
    name: bootstrap_correlation(data['x'], data['y'])
    for name, data in anscombe.items()
}

# Table header
print("Bootstrap Results")
print("=" * 60)
print(f"{'Dataset':<10} {'Correlation':<15} {'Bootstrap SE':<15} {'95% CI'}")
print("-" * 60)

# One row per dataset: observed correlation, bootstrap SE, percentile interval
for name, boot_corrs in bootstrap_results.items():
    original_corr = calc_correlation(anscombe[name]['x'], anscombe[name]['y'])
    se = boot_corrs.std()
    # 95% interval = 2.5th to 97.5th percentile of the replicates
    ci_low = np.percentile(boot_corrs, 2.5)
    ci_high = np.percentile(boot_corrs, 97.5)
    print(f"{name:<10} {original_corr:<15.3f} {se:<15.3f} [{ci_low:.3f}, {ci_high:.3f}]")

## Visualize the bootstrap distributions

In [None]:
# 2x2 grid of panels, one bootstrap histogram per dataset
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
axes = axes.ravel()

for i, (name, boot_corrs) in enumerate(bootstrap_results.items()):
    ax = axes[i]
    ax.hist(boot_corrs, bins=30, edgecolor='white', alpha=0.7)

    # Red dashed line: correlation measured on the original data
    original_corr = calc_correlation(anscombe[name]['x'], anscombe[name]['y'])
    ax.axvline(original_corr, color='red', linestyle='--', linewidth=2)

    # Title carries the bootstrap SE; the x-range is the same on every panel
    se = boot_corrs.std()
    ax.set_title(f"Dataset {name}\nSE = {se:.3f}")
    ax.set_xlabel('Correlation')
    ax.set_xlim(-1.0, 1.1)

plt.tight_layout()
plt.show()