In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

## Table of common statistical tests and their corresponding sampling versions

| Test          | Null Hypothesis | Stat Test                      | What to Sample                                   |
|---------------|-----------------|--------------------------------|--------------------------------------------------|
| 1 mean        | $\mu = \mu_0$   | t-test                         | Bootstrap samples (use confidence interval)      |
| 2 means       | $\mu_1 = \mu_2$ | 2 sample t-test                | Shuffle labels, i.e. permutation test            |
| 1 proportion  | $p = p_0$       | proportions test               | Directly from $p_0$                              |
| 2 proportions | $p_1 = p_2$     | Difference in proportions test | From $(p_1 + p_2) / 2$                           |

### 1 Mean test

Your friend insists that the average height for males in the US is 72 inches. You have data for 50 males randomly sampled from the US population, with an average height of 70.41 inches. Can you conclude that your friend is wrong?

In [8]:
np.random.seed(10)
data = np.random.normal(loc=70, scale=4, size=(50))
print(f"Mean of data: {np.round(np.mean(data), 2)}")

# Get results of one-sample t-test
results = stats.ttest_1samp(data, popmean=72)
print(f"CI from t-test: {results.confidence_interval()}")

# Draw bootstrap samples
B = 10000
bootstrap_means = []
for _ in range(B):
    bootstrap_data = np.random.choice(data, size=50, replace=True)
    bootstrap_means.append(np.mean(bootstrap_data))

bootstrap_ci = np.quantile(bootstrap_means, [0.025, 0.975])
print(f"CI from bootstrap: {bootstrap_ci}")


Mean of data: 70.41
CI from t-test: ConfidenceInterval(low=69.37534638293424, high=71.44454337840453)
CI from bootstrap: [69.40739916 71.41187632]


### 2 Means Test: Permutation Test

Your friend now insists that black men are taller than white men on average. Of your 50 samples, 15 are black and 35 are white. Are their average heights significantly different?

In [36]:
np.random.seed(11)
n_white = 75
n_black = 25

white_heights = np.random.normal(loc=70, scale=4, size=(n_white))
black_heights = np.random.normal(loc=72, scale=4, size=(n_black))
heights_df = pd.DataFrame({
    "race": ["white"]*n_white + ["black"]*n_black,
    "height": np.concatenate([white_heights, black_heights]),
})

means = heights_df.groupby("race")["height"].mean()
obs_means_diff = means["black"] - means["white"]
print("Means from data")
print(means.values)
print(f"Difference in means: {obs_means_diff}")

# Get results of two-sample t-test
tstat, pval = stats.ttest_ind(white_heights, black_heights, equal_var=False)
print(f"p-value from t-test: {pval}")

# Draw random permutations, calculate difference in means for each permutation
np.random.seed(11)
B = 10000
mean_diffs = []
for _ in range(B):
    heights_df["shuffled_race"] = np.random.choice(heights_df["race"], size=n_white+n_black, replace=False)
    shuffled_means = heights_df.groupby("shuffled_race")["height"].mean()
    mean_diffs.append(shuffled_means["black"] - shuffled_means["white"])


empirical_pval = np.mean(np.logical_or(mean_diffs <= -np.abs(obs_means_diff), mean_diffs >= np.abs(obs_means_diff)))
print(f"Permutation Test p value: {empirical_pval}")

Means from data
[71.76811616 70.13872506]
Difference in means: 1.6293911038630284
p-value from t-test: 0.07981920313858169
Permutation Test p value: 0.0626


### 1 Proportion Test

From [Data8 Ch 11](https://inferentialthinking.com/chapters/11/1/Assessing_a_Model.html). Suppose that of a jury of 100 people, claimed to be selected randomly, only 16 were black, whereas 26\% of the population in the area is black. Was the jury selection biased?

In [44]:
np.random.seed(10)
obs_prop = 0.16

# Get p value from one-sample proportions test
result = stats.binomtest(k=int(obs_prop*100), n=100, p=0.26)
print(f"p-value from one-sample proportions test: {result.pvalue}")

# Sample races according to null hypothesis distribution
races = ["black", "not black"]
probs = [0.26, 1 - 0.26]
B = 10000
sample_props = np.array([None]*B)
for i in range(B):
    sample = np.random.choice(races, p=probs, size=100)
    sample_props[i] = np.mean(sample == "black")
empirical_pval = np.mean(sample_props <= obs_prop)
print(f"p-value from sampling: {empirical_pval}")



p-value from one-sample proportions test: 0.022247315521301575
p-value from sampling: 0.0124


### 2 Proportion Test

A software company implements a new features and wants to test click through rate. 100 people are shown version A, with 62 clicking, and another 100 people are shown version B, with 74 clicking. Should the company go through with changing from layout A to layout B?

In [54]:
np.random.seed(10)
p_A = 0.62
p_B = 0.74
p_pooled = 0.5*(p_A + p_B)
obs_diff = p_B - p_A

# Get p value from one-sample proportions test
from statsmodels.stats.proportion import proportions_ztest
zstat, pval = proportions_ztest(count=[62, 74], nobs=[100, 100])
print(f"p-value from two-sample proportions test: {pval}")

# Sample races according to null hypothesis distribution
actions = ["clicked", "did not click"]
B = 10000
sample_diffs = np.array([None]*B)
for i in range(B):
    sample_1 = np.random.choice(actions, p=[p_pooled, 1-p_pooled], size=100)
    sample_2 = np.random.choice(actions, p=[p_pooled, 1-p_pooled], size=100)
    sample_diffs[i] = np.mean(sample_1 == "clicked") - np.mean(sample_2 == "clicked")

empirical_pval = np.mean(np.abs(sample_diffs) >= obs_diff)
print(f"p-value from sampling: {empirical_pval}")



p-value from two-sample proportions test: 0.06890880788626995
p-value from sampling: 0.0825
