# Chi-Squared Testing For Association, Goodness of Fit, and Homogeneity

## Chi Square Tests for 
1) Association between categorical variables
2) Goodness of fit between actual distribution and expected/theoretical distribution  
3) Homogeneity (sameness) of groups

In [10]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, chisquare
#Seed NumPy's global random generator so the np.random.choice samples drawn in the cells below are repeatable
np.random.seed(1663)

In [24]:
#Association data - 100 simulated respondents, each with a Gender and a pet Preference
#The two columns are sampled independently of each other.
#Gender is drawn before Preference; keep this order, since it determines the seeded output.
genders = np.random.choice(['Male', 'Female'], size=100, p=[0.5, 0.5])
preferences = np.random.choice(['Dogs', 'Cats'], size=100, p=[0.4, 0.6])
df = pd.DataFrame({'Gender': genders, 'Preference': preferences})
#Contingency table of counts: rows = Gender, columns = Preference
table = pd.crosstab(index=df['Gender'], columns=df['Preference'])
table

Preference,Cats,Dogs
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,32,15
Male,30,23


In [26]:
#Chi-Squared Test of Association / Independence
#Null hypothesis: the two categorical variables in the table are independent of one another
#Reject the null only when the p-value is below the chosen alpha
#Note: for a 2x2 table (dof=1) chi2_contingency applies Yates' continuity correction by default,
#so the reported statistic is smaller than the uncorrected Pearson chi-squared value
association_result = chi2_contingency(table)
print(association_result)

Chi2ContingencyResult(statistic=0.949019185536522, pvalue=0.3299690808943076, dof=1, expected_freq=array([[29.14, 17.86],
       [32.86, 20.14]]))


In [28]:
#Goodness of fit data - Dice rolls - Do actual outcomes resemble the expected (uniform) distribution?
n_rolls = 100
faces = [1, 2, 3, 4, 5, 6]
rolls = np.random.choice(faces, size=n_rolls, p=[1/6]*len(faces))
#bincount indexes by value, so bin 0 is always empty for dice faces and is dropped with [1:].
#minlength pads the result to bins 0..6; without it, if the highest face is never rolled the
#array would have fewer than 6 entries and would no longer line up with expected_counts.
observed_counts = np.bincount(rolls, minlength=len(faces) + 1)[1:]
#Under a fair die every face is expected n_rolls / 6 times
expected_counts = np.full(len(faces), n_rolls / len(faces))

print(observed_counts)
print(expected_counts)

[23 19 20 12  8 18]
[16.66666667 16.66666667 16.66666667 16.66666667 16.66666667 16.66666667]


In [30]:
#Chi-Squared Test for Goodness of Fit
#Null hypothesis: the observed counts follow the expected (theoretical) distribution
#Reject the null (and conclude the dice are unfair) only when the p-value is below alpha
chisquare(f_obs=observed_counts, f_exp=expected_counts)

Power_divergenceResult(statistic=9.32, pvalue=0.09696125366335304)

In [34]:
#Homogeneity data - favorite fruit, sampled separately for each gender group
fruits = ['Apples', 'Bananas', 'Cherries']
group_size = 50
#Male is drawn before Female; keep this order, since it determines the seeded output.
#The two groups use different fruit probabilities.
Male = np.random.choice(fruits, size=group_size, p=[0.3, 0.5, 0.2])
Female = np.random.choice(fruits, size=group_size, p=[0.3, 0.4, 0.3])
#Labels are laid out in the same order as the concatenated responses (all Male, then all Female)
group_labels = ['Male'] * group_size + ['Female'] * group_size
responses = np.concatenate([Male, Female])
homogeneity_data = pd.DataFrame({'Group': group_labels, 'Response': responses})
#Contingency table of counts: rows = Group, columns = Response
table = pd.crosstab(index=homogeneity_data['Group'], columns=homogeneity_data['Response'])
table

Response,Apples,Bananas,Cherries
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,17,16,17
Male,19,22,9


In [36]:
#Chi-Squared Test for Homogeneity
#Null hypothesis: the groups are homogeneous (they share the same response distribution)
#Reject the null (and conclude the groups differ significantly) only when the p-value is less than alpha
chi2_contingency(observed=table)

Chi2ContingencyResult(statistic=3.5200179937022043, pvalue=0.1720433159679907, dof=2, expected_freq=array([[18., 19., 13.],
       [18., 19., 13.]]))

In [38]:
#Critical Values - the chi-squared statistic must exceed this threshold to reject the null
import scipy.stats as stats
confidence_level = 0.95  #95% confidence level, i.e. alpha = 0.05
#DoF=5 - presumably the 6-category dice test above (6 - 1); confirm before reusing for other tests
degrees_of_freedom = 5
critical_value = stats.chi2.ppf(confidence_level, degrees_of_freedom)
critical_value

11.070497693516351