### Here we perform hypothesis testing using one-way ANOVA with the statsmodels library

- Generate synthetic dataset
- Find variance between different groups
- Find variance within different groups
- Find f statistic
- Find p-value
- Hypothesis test conclusion

In [23]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats

In [24]:
# Synthetic dataset: how long (in months) a car part lasted, per vendor it was bought from
v1 = [5, 8, 7, 3, 2]
v2 = [12, 8, 10, 10, 15]
v3 = [7, 8, 5, 4, 6]

# Reshape to long format: one row per observation, labelled with its vendor
groups = {'v1': v1, 'v2': v2, 'v3': v3}
vendor_ids = [label for label, durations in groups.items() for _ in durations]
all_durations = [months for durations in groups.values() for months in durations]

data = pd.DataFrame({'vendor': vendor_ids, 'duration': all_durations})
data

Unnamed: 0,vendor,duration
0,v1,5
1,v1,8
2,v1,7
3,v1,3
4,v1,2
5,v2,12
6,v2,8
7,v2,10
8,v2,10
9,v2,15


In [25]:
# Average duration per vendor: a first look at how far apart the group means are
data.groupby(by='vendor').mean()

Unnamed: 0_level_0,duration
vendor,Unnamed: 1_level_1
v1,5.0
v2,11.0
v3,6.0


In [26]:
# Regress duration on vendor (a string column, so the formula treats it as categorical),
# then build the ANOVA table from the fitted model
vendor_model = ols(formula='duration ~ vendor', data=data)
lm = vendor_model.fit()
table = sm.stats.anova_lm(lm)
print(table)

            df      sum_sq    mean_sq       F   PR(>F)
vendor     2.0  103.333333  51.666667  9.6875  0.00313
Residual  12.0   64.000000   5.333333     NaN      NaN


In [27]:
# Between-group (explained) variability: mean square = sum of squares / degrees of freedom
ss_explained = table.loc['vendor', 'sum_sq']
df_explained = table.loc['vendor', 'df']
ms_explained = ss_explained / df_explained

# Within-group (residual) variability: residual mean square
ss_residual = table.loc['Residual', 'sum_sq']
df_residual = table.loc['Residual', 'df']
ms_residual = ss_residual / df_residual

# F statistic is the ratio of the two mean squares; the 'F' column of the table reports the same value
f_stat = ms_explained / ms_residual

print(f'ms_explained: {ms_explained}\nms_residual: {ms_residual}\nf_stat: {f_stat}')

ms_explained: 51.666666666666615
ms_residual: 5.333333333333333
f_stat: 9.687499999999991


### Hypothesis testing:
- Null Hypothesis H0: Mean car part duration is equal across all vendors; any difference observed between the sample means is due to random chance
- Alternative Hypothesis H1: Mean car part duration differs for at least one vendor

In [31]:
# p-value of the F test for the vendor effect, read from the ANOVA table
p_value = table.loc['vendor', 'PR(>F)']
p_value

0.003130293571603806

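- The p-value we got is 0.003, which is lower than the chosen significance level of 0.05, so we reject the null hypothesis
- That implies the mean duration is not the same for all vendors, and the difference is statistically significant rather than due to random chance. Vendor 2 has the highest sample mean (11.0 months), but the ANOVA alone does not tell us which vendors differ; confirming that vendor 2 outlasts the others requires a post-hoc test such as Tukey's HSD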