# 1-Way ANOVA by hand (from scratch)

In [1]:
import pandas as pd

In [2]:
# Raw scores for the three groups compared in this ANOVA.
# The groups are unbalanced: 7, 6 and 8 observations respectively.
A = [
    12.6, 12, 11.8, 11.9, 13, 12.5, 14,
]
B = [
    10, 10.2, 10, 12, 14, 13,
]
C = [
    10.1, 13, 13.4, 12.9, 8.9, 10.7, 13.6, 12,
]

In [3]:
# Flatten to long format: every score in A, B, C order, plus a parallel list
# holding the company label of each score so the two line up element by element.
all_scores = [*A, *B, *C]
company_names = [
    label
    for label, scores in (('A', A), ('B', B), ('C', C))
    for _ in scores
]

In [4]:
# Long-format frame: one row per observation, labelled with its company.
data = pd.DataFrame(dict(company=company_names, score=all_scores))

In [5]:
# Show the assembled frame: one row per score together with its company label.
data

Unnamed: 0,company,score
0,A,12.6
1,A,12.0
2,A,11.8
3,A,11.9
4,A,13.0
5,A,12.5
6,A,14.0
7,B,10.0
8,B,10.2
9,B,10.0


In [6]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

  import pandas.util.testing as tm


In [7]:
# Reference result: fit score as a function of company with statsmodels and
# print its ANOVA table (df, sum_sq, mean_sq, F, p) for comparison with the
# quantities computed step by step in the cells that follow.
lm = ols(formula='score ~ company', data=data).fit()
table = sm.stats.anova_lm(lm)
print(table)

            df     sum_sq   mean_sq         F    PR(>F)
company    2.0   3.606905  1.803452  0.821297  0.455683
Residual  18.0  39.525476  2.195860       NaN       NaN


In [8]:
# F = MS_between / MS_within = (SS_between / df_between) / (SS_within / df_within)

In [9]:
# compute overall mean
# This is the grand mean: every score pooled together, regardless of company.
overall_mean = data['score'].mean()
overall_mean  # XbarG (grand mean)

11.980952380952381

In [10]:
# Sum of Squares Total: squared deviation of every score from the grand mean.
# The grand mean is stored as a constant column so it sits next to each score.
data['overall_mean'] = overall_mean
ss_total = sum((data['score'] - data['overall_mean']).pow(2))
ss_total  # SS_total = SS_between + SS_within (not SS_between on its own)

43.132380952380956

In [11]:
# Per-company means (Xbar1, Xbar2, Xbar3), indexed by company.
# .mean() averages every remaining column, so the constant 'overall_mean'
# column is carried into the result next to 'group_mean'.
group_means = (
    data
    .groupby('company')
    .mean()
    .rename(columns={'score': 'group_mean'})
)
group_means

Unnamed: 0_level_0,group_mean,overall_mean
company,Unnamed: 1_level_1,Unnamed: 2_level_1
A,12.542857,11.980952
B,11.533333,11.980952
C,11.825,11.980952


In [12]:
# Attach each row's group mean (and the overall mean) to the observations.
# The left side is rebuilt from the base columns and the scalar overall mean
# on every run, so re-executing this cell yields the same columns instead of
# merging onto an already-merged frame.
# 'overall_mean' is present on both sides of the merge, which is why the
# result carries 'overall_mean_x' (left) and 'overall_mean_y' (right).
data = (
    data[['company', 'score']]
    .assign(overall_mean=overall_mean)
    .merge(group_means, left_on='company', right_index=True)
)
data

Unnamed: 0,company,score,overall_mean_x,group_mean,overall_mean_y
0,A,12.6,11.980952,12.542857,11.980952
1,A,12.0,11.980952,12.542857,11.980952
2,A,11.8,11.980952,12.542857,11.980952
3,A,11.9,11.980952,12.542857,11.980952
4,A,13.0,11.980952,12.542857,11.980952
5,A,12.5,11.980952,12.542857,11.980952
6,A,14.0,11.980952,12.542857,11.980952
7,B,10.0,11.980952,11.533333,11.980952
8,B,10.2,11.980952,11.533333,11.980952
9,B,10.0,11.980952,11.533333,11.980952


In [13]:
# Sum of Squares Residual (SS_within): squared deviation of each score (Xi)
# from the mean of its own company.
ss_residual = sum((data['score'] - data['group_mean']).pow(2))
ss_residual

39.52547619047619

In [15]:
# Sum of Squares Model (SS_between): for every observation, the squared gap
# between the grand mean (XbarG) and its company's mean (Xbar1 .. Xbar3).
# Summing over rows weights each company by its number of observations.
# The scalar grand mean is used directly, so this does not depend on the
# suffixed 'overall_mean_x' column name that the merge happens to produce.
ss_explained = sum((overall_mean - data['group_mean']) ** 2)
ss_explained

3.6069047619047776

In [16]:
# Mean Square Residual: MS_within = SS_within / df_within, with df_within = N - K.
n_obs = len(data)                         # N: total number of observations
n_groups = len(data['company'].unique())  # K: number of distinct companies
df_residual = n_obs - n_groups            # df_within
ms_residual = ss_residual / df_residual
ms_residual

2.1958597883597886

In [17]:
# compute Mean Square Explained (MS_between)
df_explained = n_groups - 1  # df_between = K - 1
ms_explained = ss_explained / df_explained  # MS_between = SS_between / df_between
ms_explained

1.8034523809523888

In [18]:
# compute F-Value
# F is the ratio of the between-group mean square to the within-group mean square.
f = ms_explained / ms_residual  # F = MS_between / MS_within = (SS_between / df_between) / (SS_within / df_within)
f

0.8212966923081592

In [19]:
# compute p-value
# Upper-tail probability of the F statistic under an F distribution with
# (df_between, df_within) degrees of freedom.
# The survival function is used instead of 1 - cdf: it is the same quantity,
# but it avoids the cancellation that loses precision when the p-value is small.
import scipy.stats
p_value = scipy.stats.f.sf(f, df_explained, df_residual)
p_value

0.4556832940515221