In [None]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
def ANOVA(groups, alpha):
    """Run a one-way ANOVA F-test, print the verdict and plot the F distribution.

    Parameters
    ----------
    groups : sequence of array-likes
        One collection of numeric observations per group. Group sizes may
        differ; the grand mean is taken over all observations, not over the
        group means.
    alpha : float
        Significance level, strictly between 0 and 1.

    Returns
    -------
    None
        The critical value, the F statistic and the decision are printed, and
        a matplotlib figure is shown.

    Raises
    ------
    ValueError
        If alpha is not in (0, 1), fewer than two groups are given, a group is
        empty, or there are no within-group degrees of freedom (N <= k).
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")

    samples = [np.asarray(group, dtype=float).ravel() for group in groups]
    k = len(samples)
    if k < 2:
        raise ValueError("ANOVA needs at least two groups")
    if any(sample.size == 0 for sample in samples):
        raise ValueError("every group must contain at least one observation")
    N = sum(sample.size for sample in samples)
    if N <= k:
        raise ValueError("need more observations than groups to estimate within-group variance")

    group_mean = [sample.mean() for sample in samples]
    # Mean of every observation: averaging the group means instead would
    # weight small and large groups equally and distort SSB for unequal sizes.
    grand_mean = np.concatenate(samples).mean()

    SSB = sum(sample.size * (mean - grand_mean)**2
              for sample, mean in zip(samples, group_mean))
    SSW = sum(((sample - mean)**2).sum()
              for sample, mean in zip(samples, group_mean))
    dfb = k - 1
    dfw = N - k
    MSB = SSB/dfb
    MSW = SSW/dfw
    F = MSB/MSW
    critical = stats.f.ppf(1-alpha, dfb, dfw)

    print("Critical Value =", critical)
    print("F Statistic =", F)
    if F > critical:
        print("Reject Null Hypothesis")
    else:
        # A non-significant result does not prove the null hypothesis.
        print("Fail to Reject Null Hypothesis")

    # Stretch the x-axis so the density curve always reaches both the critical
    # value and the observed statistic (6 is kept as the minimum width).
    x_max = max(6.0, 1.25 * critical)
    if np.isfinite(F):
        x_max = max(x_max, 1.25 * F)
    x = np.linspace(0, x_max, 1000)
    y = stats.f.pdf(x, dfb, dfw)
    plt.figure(figsize = (10, 6))
    plt.plot(x, y, color = 'blue', label = 'F Distribution')
    plt.axvline(x = F, color = 'green', linestyle = '--', label = 'F Statistic')
    plt.fill_between(x, y, color = 'red', where = (x > critical), alpha = 0.5, label = 'Critical Region')

    plt.xlabel('F Score')
    plt.ylabel('Probability Density')
    plt.title('One Way ANOVA Test')
    plt.legend()
    plt.show()

<b>Implement random sampling and demonstrate one-way ANOVA.
Is there a significant difference in DiabetesPedigreeFunction levels among young adults (20-30), middle-aged adults (31-50), and older adults (over 50) diagnosed with diabetes?
</b>

In [None]:
# Load the dataset from a CSV file located in the notebook's working directory.
df = pd.read_csv('2_ANOVA.csv')
# Print the (rows, columns) shape of the loaded frame.
print(df.shape)
# Last expression of the cell: displays the first rows as a preview.
df.head()

In [None]:
def categorize(age):
    """Map a numeric age to an age-band label.

    Returns "young" for 20 <= age <= 30, "middle-aged" for 30 < age <= 50,
    "old" for age > 50, and None for anything else (ages below 20, or NaN,
    for which every comparison is false).
    """
    # Check the bands from oldest to youngest so each test needs one bound.
    if age > 50:
        return "old"
    if age > 30:
        return "middle-aged"
    if age >= 20:
        return "young"
    return None

# Label every row with its age band, then preview the columns used by the test.
df['Age_Category'] = df['Age'].map(categorize)
df.loc[:, ['DiabetesPedigreeFunction', 'Age', 'Age_Category']].head(10)

In [None]:
# Fixed seed so that Restart & Run All reproduces the same samples and verdict.
SEED = 42
SAMPLE_SIZE = 50
rng = random.Random(SEED)

# One list of DiabetesPedigreeFunction values per age band (rows whose
# Age_Category is None are dropped by groupby).
# NOTE(review): the question above restricts to patients diagnosed with
# diabetes, but no such filter is applied to df here -- confirm whether the
# CSV is already restricted or a filter on the diagnosis column is missing.
dpf_by_age = df.groupby('Age_Category')['DiabetesPedigreeFunction'].apply(list)

# Sample without replacement so no observation is duplicated (ANOVA assumes
# independent observations); cap at the group size so small groups do not fail.
groups = [rng.sample(values, k = min(SAMPLE_SIZE, len(values))) for values in dpf_by_age]
alpha = 0.05
ANOVA(groups, alpha)