In [45]:
import numpy as np
from scipy.stats import f, chi2_contingency, chi2

import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols


# ANOVA: one-way

In [11]:
# Simulate scores for three groups drawn from normal distributions that share
# a standard deviation but have different means.
n_groups = 3
n_per_group = 30  # observations per group
rng = np.random.default_rng(42)  # fixed seed so Restart & Run All reproduces the same samples
group1 = rng.normal(loc=60, scale=10, size=n_per_group)  # Group 1 scores
group2 = rng.normal(loc=70, scale=10, size=n_per_group)  # Group 2 scores
group3 = rng.normal(loc=65, scale=10, size=n_per_group)  # Group 3 scores

In [12]:
# Within-group sum of squares: squared deviations of each observation from
# its own group's mean, accumulated per group and then summed.
SSW_g1 = np.square(group1 - group1.mean()).sum()
SSW_g2 = np.square(group2 - group2.mean()).sum()
SSW_g3 = np.square(group3 - group3.mean()).sum()

SSW = SSW_g1 + SSW_g2 + SSW_g3

In [13]:
# Between-group sum of squares: each group's size times the squared distance
# of its mean from the grand mean.
groups = (group1, group2, group3)

# The grand mean is the mean of ALL observations. Averaging the three group
# means only gives the same number when every group has the same size, so
# pool the observations to stay correct for unbalanced designs.
mean_all_groups = np.mean(np.concatenate(groups))
SSB = sum(len(g) * (np.mean(g) - mean_all_groups) ** 2 for g in groups)


In [22]:
# Between-group mean square: SSB divided by its degrees of freedom (k - 1).
n_groups = 3
dof_between = n_groups - 1
MSB = SSB / dof_between

In [23]:
# Within-group mean square: SSW divided by its degrees of freedom (N - k),
# where N is the total number of observations across all groups.
n_total = sum(len(g) for g in (group1, group2, group3))
dof_within = n_total - n_groups
MSW = SSW / dof_within

In [28]:
# F statistic: ratio of between-group to within-group mean squares.
F_stat = MSB / MSW
print(F_stat)

5.801765858038019


In [25]:
# p-value: upper-tail probability (survival function) of the F distribution
# at the observed statistic.
p_value = f.sf(F_stat, dfn=dof_between, dfd=dof_within)
print(p_value)

0.004312856641228354


In [27]:
# Confirm the hand-computed F statistic and p-value with statsmodels.
group_samples = {'Group 1': group1, 'Group 2': group2, 'Group 3': group3}

# Long-format frame: one row per observation. Labels are repeated according to
# each group's actual length rather than a hard-coded count, so the frame stays
# aligned if the group sizes change.
data = pd.DataFrame({
    'Score': np.concatenate(list(group_samples.values())),
    'Group': np.repeat(list(group_samples), [len(g) for g in group_samples.values()]),
})

# Fit Score as a function of the categorical Group factor.
model = ols('Score ~ Group', data=data).fit()

anova_results = sm.stats.anova_lm(model, typ=1)  # Type I SS

# Display the ANOVA table
print(anova_results)


            df        sum_sq     mean_sq         F    PR(>F)
Group      2.0   1461.365846  730.682923  5.801766  0.004313
Residual  87.0  10956.907924  125.941470       NaN       NaN


# Chi-Square Test of Independence

In [70]:
# Random 2x2 table of observed counts, each drawn from [10, 100).
# A seeded local generator keeps the table identical across kernel restarts.
chi_rng = np.random.default_rng(42)
data = chi_rng.integers(10, 100, size=(2, 2))

# Cast the integer counts to float.
data = data.astype(float)
data

array([[83., 69.],
       [13., 24.]])

In [71]:
# Expected count under independence, with N = total count:
#   E[i, j] = P(row i) * P(col j) * N
#           = (row_total_i / N) * (col_total_j / N) * N
#           = row_total_i * col_total_j / N
# np.outer forms every row_total * col_total product at once, so this works
# for any r x c table rather than only 2x2.
row_totals = data.sum(axis=1)
col_totals = data.sum(axis=0)
expected_if_indep = np.outer(row_totals, col_totals) / data.sum()

In [72]:
# Display the expected counts computed under the independence assumption.
expected_if_indep

array([[77.20634921, 74.79365079],
       [18.79365079, 18.20634921]])

In [73]:
# Per-cell chi-square contribution: squared difference between the observed
# counts and the counts expected under independence, normalized by the
# expected counts.
diff_sq = np.square(data - expected_if_indep)
diff_sq_norm = np.divide(diff_sq, expected_if_indep)

In [74]:
# The chi-square statistic is the sum of the per-cell contributions.
chi_sq_stat = diff_sq_norm.sum()
chi_sq_stat

4.513262035114488

In [75]:
# p-value from the chi-square statistic: upper-tail probability with
# (rows - 1) * (cols - 1) degrees of freedom, read from the table's shape
# instead of being hard-coded for the 2x2 case.
n_rows, n_cols = data.shape
chi_sq_dof = (n_rows - 1) * (n_cols - 1)
chi2.sf(chi_sq_stat, chi_sq_dof)

0.03363303943575132

In [85]:
# Cross-check the manual computation against scipy's chi2_contingency.
contingency_table = pd.DataFrame(data, columns=['Option A', 'Option B'])

# NOTE: `correction` defaults to True and takes effect only when dof == 1.
# It applies Yates' continuity correction, which moves each observed count
# 0.5 toward its expected count. It is disabled here so the result matches
# the uncorrected statistic computed by hand.
lib_result = chi2_contingency(contingency_table, correction=False)
chi2_, p, dof, expected = lib_result

for label, value in (
    ("Chi-square Statistic:", chi2_),
    ("Degrees of Freedom:", dof),
    ("Expected Frequencies:\n", expected),
    ("P-value:", p),
):
    print(label, value)


Chi-square Statistic: 4.513262035114488
Degrees of Freedom: 1
Expected Frequencies:
 [[77.20634921 74.79365079]
 [18.79365079 18.20634921]]
P-value: 0.03363303943575132


## Purpose of Yates' Correction
Yates' correction is used to prevent overestimating statistical significance for small datasets in a 2x2 contingency table. The correction moves each observed frequency slightly toward its expected frequency (by at most 0.5), which acts as a continuity correction and reduces the Chi-square value. This is particularly useful for reducing Type I errors (false positives).