# 06 - Statistics Advanced (Data Engineering)

Apply statistical techniques to monitoring, experiments, and anomaly detection.


## 1. Sampling and the Central Limit Theorem


In [1]:
import numpy as np

# Seeded generator so the synthetic population and every sample are reproducible.
rng = np.random.default_rng(42)
population = rng.normal(loc=150, scale=20, size=10000)

# Repeatedly draw 50 values without replacement and keep each draw's mean;
# the spread of these means is what the Central Limit Theorem describes.
sample_means = []
for _ in range(1000):
    drawn = rng.choice(population, size=50, replace=False)
    sample_means.append(drawn.mean())

print(f'Population mean: {population.mean():.2f}')
print(f'Sample means mean: {np.mean(sample_means):.2f}')
print(f'Sample means std: {np.std(sample_means):.2f}')


Population mean: 149.80
Sample means mean: 149.89
Sample means std: 2.75


## 2. Bootstrap confidence interval


In [2]:
def bootstrap_mean_ci(values, num_samples=3000, ci=0.95, seed=7):
    """Return a percentile-bootstrap confidence interval for the mean of ``values``.

    ``values`` is resampled with replacement ``num_samples`` times (each resample
    has the same length as ``values``); the interval is the central ``ci`` fraction
    of the resulting resample means.

    Parameters
    ----------
    values : sequence or 1-D array of numbers
        Observed data. Must contain at least one element.
    num_samples : int, default 3000
        Number of bootstrap resamples. Must be at least 1.
    ci : float, default 0.95
        Confidence level as a fraction in [0, 1].
    seed : int, default 7
        Seed for the local random generator, so results are reproducible.

    Returns
    -------
    tuple
        ``(lower, upper)`` percentile bounds of the bootstrap means.

    Raises
    ------
    ValueError
        If ``values`` is empty, ``num_samples`` is below 1, or ``ci`` is
        outside [0, 1].
    """
    if len(values) == 0:
        raise ValueError("values must contain at least one observation")
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")
    # A negative ci would silently swap the percentiles and return lower > upper.
    if not 0.0 <= ci <= 1.0:
        raise ValueError("ci must be a fraction between 0 and 1")

    rng = np.random.default_rng(seed)
    # One row per bootstrap resample, each the same size as the input.
    samples = rng.choice(values, size=(num_samples, len(values)), replace=True)
    means = samples.mean(axis=1)
    lower = np.percentile(means, (1 - ci) / 2 * 100)
    upper = np.percentile(means, (1 + ci) / 2 * 100)
    return lower, upper

# Bootstrap the mean using only the first 200 population values.
ci_bounds = bootstrap_mean_ci(population[:200])
lower_ci, upper_ci = ci_bounds
print(f'95% CI for mean: {lower_ci:.2f} to {upper_ci:.2f}')


95% CI for mean: 147.05 to 151.77


## 3. Permutation test for A/B experiments


In [None]:
rng = np.random.default_rng(99)

# Simulated A/B data: 60 observations per arm drawn from two normal distributions.
control = rng.normal(loc=120, scale=10, size=60)
treatment = rng.normal(loc=126, scale=12, size=60)

observed_diff = treatment.mean() - control.mean()
combined = np.concatenate([control, treatment])

n_permutations = 2000
n_control = len(control)

# Build the null distribution by relabelling the pooled observations at random.
# rng.permutation returns a shuffled copy, so `combined` keeps its original
# control-then-treatment order and re-running later cells sees unmodified data.
perm_diffs = []
for _ in range(n_permutations):
    shuffled = rng.permutation(combined)
    perm_control = shuffled[:n_control]
    perm_treatment = shuffled[n_control:]
    perm_diffs.append(perm_treatment.mean() - perm_control.mean())

# Two-sided Monte Carlo p-value. Adding 1 to the numerator and denominator counts
# the observed labelling as one of the permutations, so the estimate can never be
# exactly zero when only a random subset of permutations is evaluated.
n_extreme = np.sum(np.abs(perm_diffs) >= abs(observed_diff))
p_value = (n_extreme + 1) / (n_permutations + 1)
print(f'Observed diff: {observed_diff:.2f}')
print(f'Permutation p-value: {p_value:.3f}')


## 4. Effect size (practical significance)


In [None]:
# Cohen's d scales the mean difference by the pooled *within-group* standard
# deviation. Taking the std of the merged sample instead would fold the gap
# between the group means into the denominator and shrink the effect size.
n_control, n_treatment = len(control), len(treatment)
pooled_var = (
    (n_control - 1) * control.var(ddof=1)
    + (n_treatment - 1) * treatment.var(ddof=1)
) / (n_control + n_treatment - 2)
pooled_std = np.sqrt(pooled_var)
effect_size = observed_diff / pooled_std
print(f"Effect size (Cohen's d): {effect_size:.2f}")


## 5. Moving average control limits


In [None]:
rng = np.random.default_rng(123)
time_series = rng.normal(loc=100, scale=5, size=30)

# Equal-weight 5-point window; 'valid' keeps only positions where the window
# fully overlaps the series, so the result is shorter than the input.
window = 5
kernel = np.ones(window) / window
moving_average = np.convolve(time_series, kernel, mode='valid')

# Limits sit two standard deviations either side of the smoothed series' mean.
center = moving_average.mean()
band = 2 * moving_average.std()
upper_limit = center + band
lower_limit = center - band

print(f'Control limits: {lower_limit:.2f} to {upper_limit:.2f}')


## Next Steps

Continue your learning with:
- **07_data_quality_checks.ipynb** - Data profiling and validation
- **08_time_series_basics.ipynb** - Time series fundamentals
