### Here we perform hypothesis testing using 1-way ANOVA from scratch

- Generate synthetic dataset
- Compute the variance between groups (mean square explained)
- Compute the variance within groups (mean square residual)
- Compute the F-statistic as the ratio of the two
- Find p-value
- Hypothesis test conclusion

In [1]:
import pandas as pd
import scipy.stats

In [17]:
# Dataset: lifetime (in months) of a car part, per vendor it was bought from
v1 = [5, 8, 7, 3, 2]
v2 = [12, 8, 10, 10, 15]
v3 = [7, 8, 5, 4, 6]

# Flatten the per-vendor samples into two parallel lists (long format):
# one vendor label per observation and the matching duration.
samples_by_vendor = {'v1': v1, 'v2': v2, 'v3': v3}
all_durations = [
    months
    for durations in samples_by_vendor.values()
    for months in durations
]
vendor_ids = [
    vendor
    for vendor, durations in samples_by_vendor.items()
    for _ in durations
]

data = pd.DataFrame({'vendor': vendor_ids, 'duration': all_durations})
data

Unnamed: 0,vendor,duration
0,v1,5
1,v1,8
2,v1,7
3,v1,3
4,v1,2
5,v2,12
6,v2,8
7,v2,10
8,v2,10
9,v2,15


In [18]:
# compute overall (grand) mean of all durations
overall_mean = data['duration'].mean()
print(overall_mean)

# compute Sum of Squares Total: squared deviation of every observation
# from the overall mean
data['overall_mean'] = overall_mean
ss_total = sum((data['duration'] - data['overall_mean'])**2)
print(ss_total)

# compute group means
# Select the columns explicitly so that re-running this cell after later
# cells have added columns to `data` (e.g. 'group_mean') still yields exactly
# these two columns instead of a duplicate 'group_mean' column.
group_means = data.groupby('vendor')[['duration', 'overall_mean']].mean()
group_means = group_means.rename(columns = {'duration': 'group_mean'})
print(group_means)

7.333333333333333
167.33333333333334
        group_mean  overall_mean
vendor                          
v1             5.0      7.333333
v2            11.0      7.333333
v3             6.0      7.333333


In [20]:
# add group means to the original data frame
# Built without in-place mutation and with errors='ignore' so the cell can be
# re-run: a second run neither fails on the already-dropped 'overall_mean'
# column nor produces 'group_mean_x'/'group_mean_y' from a repeated merge.
group_means = group_means.drop(columns=['overall_mean'], errors='ignore')
data = (
    data
    .drop(columns=['group_mean'], errors='ignore')
    .merge(group_means, left_on = 'vendor', right_index = True)
)

# compute Sum of Squares Residual (within groups): deviation of each
# observation from its own group's mean
ss_residual = sum((data['duration'] - data['group_mean'])**2)
print('ss_residual', ss_residual)

# compute Sum of Squares Explained (between groups): deviation of each
# observation's group mean from the overall mean
ss_explained = sum((data['overall_mean'] - data['group_mean'])**2)
print('ss_explained', ss_explained)

# compute Mean Square Residual
n_groups = data['vendor'].nunique()
n_obs = len(data)
df_residual = n_obs - n_groups
ms_residual = ss_residual / df_residual
print('ms_residual', ms_residual)

# compute Mean Square Explained
df_explained = n_groups - 1
ms_explained = ss_explained / df_explained
print('ms_explained', ms_explained)

# compute F-Value: ratio of between-group to within-group variance
f_stat = ms_explained / ms_residual
print('f_stat', f_stat)

ss_residual 64.0
ss_explained 103.33333333333329
ms_residual 5.333333333333333
ms_explained 51.66666666666664
f_stat 9.687499999999996


In [22]:
# compute p-value: probability of an F at least this large under the null
# hypothesis. The survival function sf(x) equals 1 - cdf(x) but avoids the
# loss of precision that the subtraction suffers when the p-value is tiny.
p_value = scipy.stats.f.sf(f_stat, df_explained, df_residual)
p_value

0.0031302935716037705

- The p-value is about 0.003, which is below the 0.05 significance level, so we reject the null hypothesis that all vendors have the same mean duration
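- That implies at least one vendor's mean duration differs from the others by more than random chance would explain. The group means (11.0 for vendor 2 vs. 5.0 and 6.0) suggest vendor 2 lasts longer, but one-way ANOVA alone does not identify which group differs; a post-hoc test (e.g. Tukey's HSD) is needed to confirm that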