In [1]:
from scipy import stats
import numpy as np

### Hypothesis testing with one sample

#### mean - known population standard deviation - use normal

In [2]:
# Left and right tail tests for the mean.
# When the population standard deviation is known, the sample mean under the
# null is modelled as normal with mean h and scale sigma / sqrt(n), and the
# p-values are read directly off that distribution.

h = 15  # null hypothesis, mu <= h use right tail test,
        #                  mu >= h use left tail test

n = 10  # number of samples
x_bar = 15.2  # Observed average
sigma = 0.5  # Population standard deviation

std_err = sigma / np.sqrt(n)  # standard error of the mean, shared by all tests below

pval = stats.norm.cdf(x_bar, h, std_err)  # Left tail test (H0: mu >= h)
pval_righttail1 = 1 - stats.norm.cdf(x_bar, h, std_err)  # Right tail test (H0: mu <= h)
pval_righttail2 = stats.norm.sf(x_bar, h, std_err)  # Right tail via the survival function:
                                                    # avoids the 1 - cdf cancellation for tiny p-values

# The original names called these right-tail values "lefttail"; the old names
# are kept as aliases so anything that still refers to them keeps working.
pval_lefttail1 = pval_righttail1
pval_lefttail2 = pval_righttail2

print(pval)  # left tail: use if the null hypothesis were mu >= h
print(pval_righttail1)  # right tail: use if the null hypothesis were mu <= h
print(pval_righttail2)  # right tail again, computed with sf

0.897048394633965
0.10295160536603498
0.10295160536603498


#### proportion - binomial, approximated by a normal distribution

In [3]:
# Two-tailed test for a single proportion

h = 0.50  # hypothesised proportion

n = 100   # number of samples
p = 0.53  # observed proportion


# Standard deviation of a sample proportion under the null (binomial model)
sigma = np.sqrt(h * (1 - h) / n)

# Explanatory path to the p-value:
# reflect the observation about h, then add the area beyond the larger of the
# two points (right tail) to the area below the smaller one (left tail).
mirror = h + (h - p)
upper, lower = (p, mirror) if h < p else (mirror, p)
right_tail = stats.norm.sf(upper, h, sigma)
left_tail = stats.norm.cdf(lower, h, sigma)
pval = right_tail + left_tail
print(pval)

# One-line path to the p-value: by symmetry, double the tail beyond h + |h - p|
pval = 2 * stats.norm.sf(h + np.abs(h - p), h, sigma)

print(pval)

0.5485062355001469
0.5485062355001469


#### unknown population standard deviation - use t-test

In [4]:
data = [1.11, 1.07, 1.11, 1.07, 1.12, 1.08, 0.98, 0.98, 1.02, 0.95, 0.95]
h = 1.00  # null hypothesis: mean <= 1, so this is a right-tailed test

n = len(data)
x_bar = np.mean(data)
s = np.std(data, ddof=1)  # sample std so ddof = 1
print(x_bar)

# Right-tail p-value by hand: t distribution with n - 1 degrees of freedom,
# located at h and scaled by the estimated standard error of the mean.
pval = stats.t.sf(x_bar, n - 1, h, s / np.sqrt(n))

print(pval)

# Or just use the scipy method, computed once and reused below.
#  Note: returns the test statistic and the 2-sided p-value
ttest_result = stats.ttest_1samp(data, h)
print(ttest_result)
t_stat, p_two_sided = ttest_result[0], ttest_result[1]

# The right-tail p-value is half the two-tailed one only when the statistic is
# positive (sample mean above h). When the sample mean is at or below h the
# right tail is the *large* side, 1 - p/2, i.e. hypothesis mean <= 1 will never
# be rejected by a sample mean <= 1.
if t_stat > 0:
    pval_right = p_two_sided / 2
else:
    pval_right = 1 - p_two_sided / 2
print(pval_right)

# If you really want the one tail for the opposite side (H0: mean >= 1)
pval_left = 1 - pval_right
print(pval_left)


1.0399999999999998
0.03586064860607561
Ttest_1sampResult(statistic=2.0137774303955394, pvalue=0.07172129721215122)
0.03586064860607561
0.9641393513939244


### Hypothesis testing with two independent samples

In [11]:
# Two populations, compared from summary statistics (two-tailed Welch test)

n_x, x_bar, s_x = 9, 2, 0.866   # x: sample size, sample mean, sample std
n_y, y_bar, s_y = 16, 3.2, 1    # y: sample size, sample mean, sample std

# Estimated variance of each sample mean
var_x = s_x ** 2 / n_x
var_y = s_y ** 2 / n_y

# DOF - Welch Test (Welch-Satterthwaite approximation)
dof = ((var_x + var_y) ** 2 /
       (1 / (n_x - 1) * var_x ** 2 +
        1 / (n_y - 1) * var_y ** 2))
print(dof)

# Standard Error of the difference between the two means
std_err = np.sqrt(var_x + var_y)

# Two-tailed p-value: area beyond +|difference| plus area below -|difference|
gap = np.abs(x_bar - y_bar)
pval = (stats.t.sf(gap, dof, 0, std_err) +
        stats.t.cdf(-1 * gap, dof, 0, std_err))

print(pval)

# If the raw data is available scipy.stats.ttest_ind(a, b, equal_var=False) can be used;
# scipy.stats.ttest_ind_from_stats accepts summary statistics like the ones above.
# Note: if equal_var: perform a Student's independent 2 sample test
#       (assumes equal population variances; sample sizes may differ)
#       if not equal_var: perform Welch's t-test

18.84659125336577
0.005401921297382211


#### right-tailed test + Cohen's d

In [17]:
# H0: mean of X <= mean of Y, so evidence against it sits in the right tail

n_x, x_bar, s_x = 11, 4, 1.5   # x: sample size, sample mean, sample std
n_y, y_bar, s_y = 9, 3.5, 1    # y: sample size, sample mean, sample std

# Estimated variance of each sample mean
var_x = s_x ** 2 / n_x
var_y = s_y ** 2 / n_y

# DOF - Welch Test
dof = ((var_x + var_y) ** 2 /
       (1 / (n_x - 1) * var_x ** 2 +
        1 / (n_y - 1) * var_y ** 2))

# Standard error of the difference between the two means
std_err = np.sqrt(var_x + var_y)

# Right-tail p-value of the observed difference under a zero-centred t distribution
pval = stats.t.sf(x_bar - y_bar, dof, 0, std_err)

print(pval)

# Cohen's d: difference in means measured in units of the pooled standard deviation

ss_x = (n_x - 1) * s_x ** 2   # (n - 1)-weighted variance of x
ss_y = (n_y - 1) * s_y ** 2   # (n - 1)-weighted variance of y
s_pooled = np.sqrt((ss_x + ss_y) / (n_x + n_y - 2))

cohen_d = (x_bar - y_bar) / s_pooled

print(cohen_d)


0.1928185434187067
0.3841106397986879


#### Two Independent Population Proportions

In [39]:
x_a = 20   # successes in sample a
n_a = 200  # size of sample a
x_b = 12   # successes in sample b
n_b = 200  # size of sample b

p_a = (x_a / n_a)
p_b = (x_b / n_b)
p_c = (x_a + x_b) / (n_a + n_b)  # pooled proportion

# Standard error of the difference in proportions under H0: p_a == p_b
sigma = np.sqrt(p_c * (1 - p_c) * (1 / n_a + 1 / n_b))

# Two-tailed p-value. Work with the absolute difference instead of swapping
# p_a and p_b, so both names keep describing their own sample after this cell.
diff = np.abs(p_a - p_b)
p_val = (stats.norm.sf(diff, 0, sigma) +
         stats.norm.cdf(0 - diff, 0, sigma))

print(p_val)

# scipy has no one-liner for this; statsmodels' proportions_ztest([x_a, x_b], [n_a, n_b])
# looks like the equivalent pooled two-sided test - TODO confirm against its docs


0.1403686607716731
