<a href="https://colab.research.google.com/github/sokrypton/7.571/blob/main/L2/Anscombe_Bootstrap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Anscombe's Quartet + Bootstrap

**The question:** Four datasets have the same summary statistics... but should we trust them equally?

In [None]:
import numpy as np
import matplotlib.pyplot as plt

## Anscombe's Quartet

Four datasets with nearly identical summary statistics (means, correlation, regression line) that look very different when plotted.

In [None]:
# Anscombe's quartet, keyed by dataset label ('I' through 'IV').
# Each entry maps 'x' and 'y' to NumPy arrays of 11 paired observations.
anscombe = {
    label: {'x': np.array(xs), 'y': np.array(ys)}
    for label, xs, ys in (
        (
            'I',
            [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5],
            [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68],
        ),
        (
            'II',
            [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5],
            [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74],
        ),
        (
            'III',
            [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5],
            [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73],
        ),
        (
            'IV',
            [8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8],
            [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89],
        ),
    )
}

## They all have the same statistics!

In [None]:
def calc_correlation(x, y):
    """Return the Pearson correlation coefficient between x and y."""
    # np.corrcoef gives the 2x2 correlation matrix of (x, y); the
    # off-diagonal entry is the x-y correlation.
    corr_matrix = np.corrcoef(x, y)
    return corr_matrix[0, 1]

def calc_slope(x, y):
    """Calculate the slope of the least-squares regression line of y on x.

    The slope is cov(x, y) / var(x). Both terms are read from the same
    covariance matrix so they share one normalization. Mixing np.cov
    (divides by n - 1 by default) with np.var (divides by n by default)
    would inflate the slope by a factor of n / (n - 1).

    Args:
        x: 1-D sequence of predictor values.
        y: 1-D sequence of response values, the same length as x.

    Returns:
        The slope of the fitted line.
    """
    cov_matrix = np.cov(x, y)
    # [0, 1] is cov(x, y); [0, 0] is var(x) under the same normalization.
    return cov_matrix[0, 1] / cov_matrix[0, 0]

# Table header for the per-dataset summary statistics.
print(
    "Summary Statistics for All Four Datasets",
    "=" * 50,
    f"{'Dataset':<10} {'Mean X':<10} {'Mean Y':<10} {'Correlation':<12} {'Slope'}",
    "-" * 50,
    sep="\n",
)

# One row per dataset: mean of x, mean of y, Pearson r, regression slope.
for label in anscombe:
    xs, ys = anscombe[label]['x'], anscombe[label]['y']
    row = (
        f"{label:<10} {np.mean(xs):<10.2f} {np.mean(ys):<10.2f} "
        f"{calc_correlation(xs, ys):<12.3f} {calc_slope(xs, ys):.3f}"
    )
    print(row)

print("\nThey're all (nearly) identical!")

## But they look completely different!

In [None]:
# Draw each dataset in its own panel of a 2x2 grid, with a fitted line.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
axes = axes.flatten()

# Every panel draws its line over the same x-range.
x_line = np.linspace(3, 20, 100)

for idx, label in enumerate(anscombe):
    xs = anscombe[label]['x']
    ys = anscombe[label]['y']
    panel = axes[idx]
    panel.scatter(xs, ys, s=60)

    # Line with the computed slope through the point of means (mean x, mean y).
    slope = calc_slope(xs, ys)
    intercept = np.mean(ys) - slope * np.mean(xs)
    panel.plot(x_line, slope * x_line + intercept, 'r--', alpha=0.7)

    r_value = calc_correlation(xs, ys)
    panel.set_title(f"Dataset {label}\nr = {r_value:.3f}")
    panel.set_xlabel('x')
    panel.set_ylabel('y')
    # Shared axis limits make the four panels directly comparable.
    panel.set_xlim(3, 20)
    panel.set_ylim(3, 14)

plt.tight_layout()
plt.show()

print("Same correlation (0.816), but very different relationships!")

## Bootstrap: How confident should we be?

Let's bootstrap the correlation for each dataset and see how stable it is.

In [None]:
def bootstrap_correlation(x, y, num_resamples=1000, jitter=0.001, seed=None):
    """Bootstrap the Pearson correlation coefficient of paired data.

    Each resample draws len(x) (x, y) pairs with replacement, adds a small
    amount of Gaussian noise to both coordinates, and records the Pearson
    correlation of the result.

    Args:
        x: 1-D array-like of values.
        y: 1-D array-like of values paired with x (same shape).
        num_resamples: Number of bootstrap resamples to draw.
        jitter: Standard deviation of the Gaussian noise added to each
            resampled value. The noise keeps the correlation defined when a
            resample has zero variance in x or y; in that case the reported
            correlation is determined by the noise alone.
        seed: None to draw from NumPy's global random state (so a prior
            np.random.seed(...) call controls the result), or an int or
            np.random.Generator for a self-contained reproducible stream.

    Returns:
        A float array of length num_resamples holding one correlation per
        resample.

    Raises:
        ValueError: If x and y do not have the same shape.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if x.shape != y.shape:
        raise ValueError(
            f"x and y must have the same shape, got {x.shape} and {y.shape}"
        )

    # With no seed, keep using the module-level functions so results still
    # follow np.random.seed(...) exactly as they did before.
    rng = np.random if seed is None else np.random.default_rng(seed)

    n = len(x)
    bootstrap_corrs = np.empty(num_resamples)

    for i in range(num_resamples):
        # Resample indices with replacement; the same indices are used for
        # x and y so pairs stay together.
        idx = rng.choice(n, size=n, replace=True)

        # Add tiny noise to avoid zero variance issues
        x_resample = x[idx] + rng.normal(0, jitter, size=n)
        y_resample = y[idx] + rng.normal(0, jitter, size=n)

        # Same computation as calc_correlation, inlined so this function
        # is self-contained.
        bootstrap_corrs[i] = np.corrcoef(x_resample, y_resample)[0, 1]

    return bootstrap_corrs

In [None]:
# Run the bootstrap once per dataset and keep the resampled correlations.
bootstrap_results = {
    label: bootstrap_correlation(values['x'], values['y'])
    for label, values in anscombe.items()
}

# Table header.
print(
    "Bootstrap Results",
    "=" * 60,
    f"{'Dataset':<10} {'Correlation':<15} {'Bootstrap SE':<15} {'95% CI'}",
    "-" * 60,
    sep="\n",
)

# One row per dataset: observed correlation, the standard deviation of the
# bootstrap correlations (SE), and the 2.5th-97.5th percentile interval.
for label, samples in bootstrap_results.items():
    observed = calc_correlation(anscombe[label]['x'], anscombe[label]['y'])
    spread = np.std(samples)
    lower, upper = np.percentile(samples, [2.5, 97.5])
    print(f"{label:<10} {observed:<15.3f} {spread:<15.3f} [{lower:.3f}, {upper:.3f}]")

## Visualize the bootstrap distributions

In [None]:
# Histogram of bootstrap correlations for each dataset, one panel apiece.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
axes = axes.flatten()

for idx, label in enumerate(bootstrap_results):
    samples = bootstrap_results[label]
    panel = axes[idx]
    panel.hist(samples, bins=30, edgecolor='white', alpha=0.7)

    # Dashed red line marks the correlation of the original (unresampled) data.
    observed = calc_correlation(anscombe[label]['x'], anscombe[label]['y'])
    panel.axvline(observed, color='red', linestyle='--', linewidth=2)

    spread = np.std(samples)
    panel.set_title(f"Dataset {label}\nSE = {spread:.3f}")
    panel.set_xlabel('Correlation')
    # Shared x-limits make the spread comparable across panels.
    panel.set_xlim(-0.2, 1.1)

plt.tight_layout()
plt.show()

## The Lesson

All four datasets have the **same correlation (0.816)**.

But bootstrap reveals we shouldn't trust them equally!

| Dataset | What's happening | Bootstrap SE | Trust the correlation? |
|---------|------------------|--------------|------------------------|
| I | Real linear relationship | ~0.10 | Yes |
| II | Curved, not linear | ~0.12 | Misleading! |
| III | One outlier at x=13 | ~0.09 | Mostly |
| IV | One point creates all correlation | ~0.47 | **No!** |

Dataset IV's 95% CI spans from **negative to positive** correlation!

**Bootstrap SE quantifies how stable your estimate is — even when point estimates look the same!**