In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.stats as stats
%matplotlib inline

### One way Anova

**Assumptions for one way Anova:** 

* The populations are normally distributed
* Samples are independent simple random samples
* The variances between groups should be similar.

**Example 1:** A restaurant chain manager wants to compare customer satisfaction ratings across four different branches in the city.

In [2]:
#Data
branch_A = [7.2, 7.5, 6.8, 8.0, 7.9, 7.3, 7.1, 6.9, 7.6, 7.4]
branch_B = [6.8, 7.0, 7.2, 7.5, 6.9, 7.3, 6.7, 6.5, 7.1, 6.6]
branch_C = [8.2, 8.1, 8.5, 8.3, 8.0, 7.9, 8.2, 8.4, 8.3, 8.5]
branch_D = [7.0, 7.2, 6.8, 7.1, 7.4, 6.9, 7.0, 6.8, 7.1, 6.7]

Testing the null hypothesis

>$H_0:\mu_1=\mu_2=\mu_3$

against the alternate hypothesis

>$H_1:$ At least one branch is different

In [3]:
#checking the assumptions
#Normality
norm_A=stats.shapiro(branch_A)
norm_B=stats.shapiro(branch_B)
norm_C=stats.shapiro(branch_C)
norm_D=stats.shapiro(branch_D)
print(norm_A)
print(norm_B)
print(norm_C)
print(norm_D)

ShapiroResult(statistic=0.9695002436637878, pvalue=0.8861736059188843)
ShapiroResult(statistic=0.9808780550956726, pvalue=0.9696930646896362)
ShapiroResult(statistic=0.9540717601776123, pvalue=0.7167389392852783)
ShapiroResult(statistic=0.9673692584037781, pvalue=0.8654400110244751)


Since the p values of all the branches is greater than 0.5 so we fail to reject the null hypothesis hence we conclude that data follows the normal distribution

In [4]:
#checking for homogenity of variances
levene_test=stats.levene(branch_A, branch_B, branch_C, branch_D)
levene_test

LeveneResult(statistic=2.1405919661733583, pvalue=0.11213898260203213)

since the p value is greater than the 0.5 so it satisfies the assumption of homogenity of variances

In [5]:
#checking the hypothesis using p value
f_stat,p_value=stats.f_oneway(branch_A, branch_B, branch_C, branch_D)
p_value

9.98794356190338e-12

As the p-value is much less than the significance level, we can reject the null hypothesis. Hence, we do have enough statistical significance to conclude that at least one branch is different from the rest at 5% significance level.

In [32]:
#checking the hypothesis using critical value
alpha=0.05
df_between = 3  # 4 branches - 1
df_within = len(branch_A) + len(branch_B) + len(branch_C) + len(branch_D) - 4
critical_value = stats.f.ppf(1 - alpha, df_between, df_within)
critical_value

2.866265550940178

In [7]:
f_stat

41.17351893816766

since f stat is greater than the critical value we reject the null hypothesis

**Example 2:** A company wants to determine if sales volume differs among various sales regions (north, south, east, west) for a specific product.

In [8]:
# Data
region_North =[510, 520, 500, 495, 510, 505, 498, 520, 515, 500]
region_South =[480, 470, 475, 465, 490, 485, 480, 475, 470, 460]
region_East = [515, 510, 520, 505, 500, 510, 515, 520, 510, 505]
region_West = [490, 495, 500, 485, 490, 485, 480, 475, 490, 485]

Testing the null hypothesis

>$H_0:\mu_1=\mu_2=\mu_3=\mu_4$

against the alternate hypothesis

>$H_1:$ Atleast one region is different

In [9]:
#checking the assumptions
#normality
norm_north=stats.shapiro(region_North)
norm_south=stats.shapiro(region_South)
norm_east=stats.shapiro(region_East)
norm_west=stats.shapiro(region_West)
print(norm_north)
print(norm_south)
print(norm_east)
print(norm_west)

ShapiroResult(statistic=0.921709418296814, pvalue=0.371433287858963)
ShapiroResult(statistic=0.9837162494659424, pvalue=0.9819299578666687)
ShapiroResult(statistic=0.941914439201355, pvalue=0.5745060443878174)
ShapiroResult(statistic=0.968424379825592, pvalue=0.875864565372467)


Since the p values of all the regions is greater than 0.5 so we fail to reject the null hypothesis hence we conclude that data follows the normal distribution

In [10]:
#checking for homogenity of variances
levene_test=stats.levene(region_North,region_South,region_East,region_West)
levene_test

LeveneResult(statistic=0.7927031509121063, pvalue=0.5059804441707614)

since the p value is greater than the 0.5 so it satisfies the assumption of homogenity of variances

In [11]:
#checking the hypothesis using p value
f_stat,p_value=stats.f_oneway(region_North,region_South,region_East,region_West)
p_value

3.817833379579858e-12

since p value is less than alpha we reject the null hypothesis and hence we conclude that we do have enough statistical evidence that atleast one region is different from the others


In [12]:
#checking the hypothesis using critical value
alpha=0.05
df_between=3
#total no of observations - no of groups
df_within=len(region_North)+len(region_South)+len(region_East)+len(region_West)-4
critical_value=stats.f.ppf(1-alpha,df_between,df_within)
critical_value


2.866265550940178

In [13]:
f_stat

44.11376990547559

since f stat is greater than the critical value we reject the null hypothesis

**Example 3:** A teacher wants to check if different study methods (self-study, group study, tutoring, online course) impact student test scores in an exam.

In [14]:
# Data
self_study = [65, 70, 68, 72, 66, 71, 69, 68, 70, 67]
group_study =[75, 78, 72, 76, 77, 74, 75, 73, 76, 75]
tutoring = [85, 82, 87, 84, 86, 83, 85, 86, 84, 82]
online_course = [78, 79, 77, 76, 78, 80, 79, 78, 77, 78]

Testing the null hypothesis

>$H_0:\mu_1=\mu_2=\mu_3=\mu_4$

against the alternate hypothesis

>$H_1:$ Atleast one method is different

In [15]:
#checking the assumptions
#normality
norm_self=stats.shapiro(self_study)
norm_group=stats.shapiro(group_study)
norm_tutoring=stats.shapiro(tutoring)
norm_online=stats.shapiro(online_course)
print(norm_self)
print(norm_group)
print(norm_tutoring)
print(norm_online)

ShapiroResult(statistic=0.9779828190803528, pvalue=0.9534648656845093)
ShapiroResult(statistic=0.9755175113677979, pvalue=0.9367681741714478)
ShapiroResult(statistic=0.9433293342590332, pvalue=0.5906188488006592)
ShapiroResult(statistic=0.9528768062591553, pvalue=0.7025895118713379)


Since the p values of all the methods is greater than 0.5 so we fail to reject the null hypothesis hence we conclude that data follows the normal distribution

In [16]:
#checking for homogenity of variances
levene_test=stats.levene(self_study,group_study,tutoring,online_course)
levene_test

LeveneResult(statistic=1.659400544959128, pvalue=0.19298856149806995)

since p value is greater than 0.5 we can conlcude that it satisfies the assumption of homogenity of variances

In [17]:
#checking the hypothesis using p value
f_stat,p_value=stats.f_oneway(self_study,group_study,tutoring,online_course)
p_value

7.862567981806783e-20

since p value is less than alpha we reject the null hypothesis and hence we conclude that we do have enough statistical evidence that atleast one method is different from the others

In [18]:
#checking the hypothesis using critical value
alpha=0.05
df_between=3
df_within=len(self_study)+len(group_study)+len(tutoring)+len(online_course)-4
critical_value=stats.f.ppf(1-alpha,df_between,df_within)
critical_value

2.866265550940178

In [19]:
f_stat

138.6150402864816

since f stat is greater than the critical value we reject the null hypothesis

**Example 4:** An agricultural scientist investigates whether three different fertilizers (A, B, C) have different effects on plant growth.

In [20]:
#Data
fertilizer_A = [25, 28, 26, 27, 29, 30, 25, 28, 27, 26]
fertilizer_B = [35, 32, 34, 36, 33, 31, 33, 34, 35, 32]
fertilizer_C = [40, 42, 41, 39, 40, 38, 41, 39, 40, 41]

Testing the null hypothesis

>$H_0:\mu_1=\mu_2=\mu_3=\mu_4$

against the alternate hypothesis

>$H_1$: At least one fertilizer type is different

In [21]:
#checking the assumptions
#normality
norm_a=stats.shapiro(fertilizer_A)
norm_b=stats.shapiro(fertilizer_B)
norm_c=stats.shapiro(fertilizer_C)
print(norm_a)
print(norm_b)
print(norm_c)

ShapiroResult(statistic=0.9480838179588318, pvalue=0.6458873152732849)
ShapiroResult(statistic=0.9657219052314758, pvalue=0.8485973477363586)
ShapiroResult(statistic=0.9519404172897339, pvalue=0.6914891600608826)


Since the p values of all the fertilizers is greater than 0.5 so we fail to reject the null hypothesis hence we conclude that data follows the normal distribution

In [22]:
#checking the homogenity of variances
levene_test=stats.levene(fertilizer_A,fertilizer_B,fertilizer_C)
levene_test

LeveneResult(statistic=0.7741935483870969, pvalue=0.47104188740573744)

since p value is greater than 0.5 we can conlcude that it satisfies the assumption of homogenity of variances

In [23]:
#checking the hypothesis using p value
f_stat,p_value=stats.f_oneway(fertilizer_A,fertilizer_B,fertilizer_C)
p_value

1.3096430809752792e-16

since p value is less than alpha we reject the null hypothesis and hence we conclude that we do have enough statistical evidence that atleast one fertilizer is different from the others

In [24]:
#checking the hypothesis using crtical value
alpha=0.05
df_between=2
df_within=len(fertilizer_A)+len(fertilizer_B)+len(fertilizer_C)-3
critical_value=stats.f.ppf(1-alpha,df_between,df_within)
critical_value

3.3541308285291986

In [25]:
f_stat

189.19402985074603

since f_stat is greater than critical value we reject the null hypothesis

**Example 5:** An HR manager wants to test if there are productivity differences between employees in different departments (sales, marketing, IT, finance).

In [26]:
# Data
hr = [50, 55, 60, 58, 53, 52, 54, 51, 56, 59]
it = [70, 75, 78, 72, 76, 73, 74, 71, 77, 75]
marketing = [65, 67, 69, 66, 68, 64, 62, 63, 66, 65]
sales = [85, 88, 90, 92, 89, 87, 88, 91, 90, 89]

Testing the null hypothesis

>$H_0:\mu_1=\mu_2=\mu_3=\mu_4$

against the alternate hypothesis

>$H_1:$ Atleast one department is different

In [27]:
#checking the assumptions
#normality
norm_hr=stats.shapiro(hr)
norm_it=stats.shapiro(it)
norm_market=stats.shapiro(marketing)
norm_sales=stats.shapiro(sales)
print(norm_hr)
print(norm_it)
print(norm_market)
print(norm_sales)

ShapiroResult(statistic=0.9590273499488831, pvalue=0.7747271656990051)
ShapiroResult(statistic=0.9752339720726013, pvalue=0.9346861243247986)
ShapiroResult(statistic=0.9839820861816406, pvalue=0.9828903079032898)
ShapiroResult(statistic=0.9776149988174438, pvalue=0.951137900352478)


Since the p values of all the fertilizers is greater than 0.05 so we fail to reject the null hypothesis hence we conclude that data follows the normal distribution

In [28]:
#checking the homogenity of variances
levene_test=stats.levene(hr,it,marketing,sales)
levene_test

LeveneResult(statistic=1.5884718498659514, pvalue=0.2090884487096653)

since p value is greater than 0.05 we can conlcude that it satisfies the assumption of homogenity of variances

In [29]:
#checking the hypothesis p value
f_stat,p_value=stats.f_oneway(it,hr,marketing,sales)
p_value

1.3069119759518921e-25

since p value is less than alpha we reject the null hypothesis and hence we conclude that we do have enough statistical evidence that atleast one department is different from the others

In [30]:
#checking the hypothesis critical value
alpha=0.05
df_between=3
df_within=len(it)+len(hr)+len(marketing)+len(sales)
critical_value=stats.f.ppf(1-alpha,df_between,df_within)
critical_value

2.8387453980206443

In [31]:
f_stat

303.82472549817044

since f_stat is greater than critical value we reject the null hypothesis

**Example 6:** A school wants to test if three different teaching methods (Traditional, Online, Blended) have a significant effect on student performance in a final exam.

In [34]:
# Data
traditional = [65, 70, 75, 68, 64, 66, 72]
online = [80, 82, 85, 83, 79, 84, 81]
blended = [75, 78, 80, 76, 74, 79, 77]

Testing the null hypothesis

>$H_0:\mu_1=\mu_2=\mu_3$

against the alternate hypothesis

>$H_1:$ Atleast one teaching method is different

In [36]:
#checking the assumptions
#normality
norm_traditional=stats.shapiro(traditional)
norm_online=stats.shapiro(online)
norm_blended=stats.shapiro(blended)
print(norm_traditional)
print(norm_online)
print(norm_blended)

ShapiroResult(statistic=0.9508326649665833, pvalue=0.7372733354568481)
ShapiroResult(statistic=0.9780016541481018, pvalue=0.9492886662483215)
ShapiroResult(statistic=0.9780016541481018, pvalue=0.9492886662483215)


Since the p values of all the fertilizers is greater than 0.5 so we fail to reject the null hypothesis hence we conclude that data follows the normal distribution

In [38]:
#checking for the homogenity of variances
levene_test=stats.levene(traditional,online,blended)
levene_test

LeveneResult(statistic=1.9607843137254901, pvalue=0.16967110914936154)

since p value is greater than 0.05 we can conlcude that it satisfies the assumption of homogenity of variances

In [39]:
#checking the hypothesis using p value
f_stat,p_value=stats.f_oneway(traditional,online,blended)
p_value

3.2977350858444203e-07

since p value is less than alpha we reject the null hypothesis and hence we conclude that we do have enough statistical evidence that atleast one teaching method is different from the others

In [40]:
#checking the hypothesis using critical value
alpha=0.05
df_between=2
df_within=len(traditional)+len(online)+len(blended)-3
critical_value=stats.f.ppf(1-alpha,df_between,df_within)
critical_value

3.554557145661787

In [41]:
f_stat

38.25423728813558

since f stat is greater than the critical value we reject the null hypothesis

**Example 7:** A market research company wants to compare coffee consumption preferences (e.g., Black, Espresso, Latte) among three different age groups: 18-30, 31-50, and 51+.

In [43]:
# Data
age_18_30 = [30, 32, 35, 34, 31]
age_31_50 = [50, 52, 55, 51, 53]
age_51_plus =[45, 44, 46, 47, 45]

Testing the null hypothesis

>$H_0:\mu_1=\mu_2=\mu_3$

against the alternate hypothesis

>$H_1:$ Atleast one age group is different.

In [45]:
#checking the assumptions
#normality
norm18_30=stats.shapiro(age_18_30)
norm31_50=stats.shapiro(age_31_50)
norm51_plus=stats.shapiro(age_51_plus)
print(norm18_30)
print(norm31_50)
print(norm51_plus)

ShapiroResult(statistic=0.9523513913154602, pvalue=0.753972589969635)
ShapiroResult(statistic=0.9787160754203796, pvalue=0.9276362061500549)
ShapiroResult(statistic=0.9608590006828308, pvalue=0.8139519691467285)


Since the p values of all the fertilizers is greater than 0.5 so we fail to reject the null hypothesis hence we conclude that data follows the normal distribution

In [46]:
#checking the homogenity of variances
levene_test=stats.levene(age_18_30,age_31_50,age_51_plus)
levene_test


LeveneResult(statistic=0.7878787878787876, pvalue=0.4769831436572204)

since p value is greater than 0.05 we can conlcude that it satisfies the assumption of homogenity of variances

In [47]:
#checking the hypothesis using p value
f_stat,p_value=stats.f_oneway(norm18_30,norm31_50,norm51_plus)
p_value

0.6554672203185187

since p value is greater than alpha we reject the null hypothesis and hence we conclude that we don't have enough statistical evidence that atleast one age group is different from the others

In [48]:
#checking the hypothesis critical value
alpha=0.05
df_between=2
df_within=len(age_18_30)+len(age_31_50)+len(age_51_plus)-3
critical_value=stats.f.ppf(1-alpha,df_between,df_within)
critical_value

3.8852938346523946

In [49]:
f_stat

0.4878820276037316

since f_stat is less than the critical value we fail to reject the null hypothesis

**Example 8:** A clinical trial is testing three types of diets (Low-fat, Mediterranean, Keto) to see if they have different effects on cholesterol levels.

In [50]:
# Data
low_fat = [190, 195, 200, 180, 185]
mediterranean = [170, 175, 165, 180, 185]
keto = [160, 155, 150, 145, 160]

testing the null hypothesis$

>$H_0:\mu_1=\mu_2=\mu_3$

against the alternate hypothesis

>$H_1:$ Atleast one cholesterol level is different

In [51]:
# Assumptions check
#normality
norm_low_fat=stats.shapiro(low_fat)
norm_mediterranean=stats.shapiro(mediterranean)
norm_keto= stats.shapiro(keto)
print(norm_low_fat)
print(norm_mediterranean)
print(norm_keto)

ShapiroResult(statistic=0.9867621660232544, pvalue=0.9671739339828491)
ShapiroResult(statistic=0.9867621660232544, pvalue=0.9671739339828491)
ShapiroResult(statistic=0.9020196199417114, pvalue=0.4211485683917999)


Since the p values of all the fertilizers is greater than 0.5 so we fail to reject the null hypothesis hence we conclude that data follows the normal distribution

In [52]:
#checking homogenity of variances
levene_test=stats.levene(low_fat, mediterranean, keto)
levene_test

LeveneResult(statistic=0.10526315789473684, pvalue=0.9009095127015454)

since p value is greater than 0.05 we can conlcude that it satisfies the assumption of homogenity of variances

In [53]:
#checking the hypothesis using p value
f_stat, p_value = stats.f_oneway(low_fat, mediterranean, keto)
p_value

2.4180845670048005e-05

since p value is less than alpha we reject the null hypothesis and hence we conclude that we do have enough statistical evidence that atleast one cholesterol level is different from the others

In [54]:
#checking the hypothesis using critical value
alpha = 0.05
df_between = 2
df_within = len(low_fat) + len(mediterranean) + len(keto) - 3
critical_value = stats.f.ppf(1 - alpha, df_between, df_within)
critical_value

3.8852938346523946

In [55]:
f_stat

29.283582089552237

since f stat is greater than the critical value we reject the null hypothesis