Chi-Square Tests for 1) association between categorical variables, 2) goodness of fit between an observed distribution and an expected/theoretical distribution, and 3) homogeneity (sameness) of groups

In [96]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, chisquare
# Seed NumPy's global random state so the np.random.* draws in the cells below
# are reproducible when the notebook is run top to bottom.
np.random.seed(1663)

In [98]:
# Association data: 100 simulated respondents, each with a randomly drawn
# gender and pet preference (the two columns are drawn separately).
df = pd.DataFrame(
    {'Gender': np.random.choice(['Male', 'Female'], size=100, p=[0.5, 0.5])}
)
df['Preference'] = np.random.choice(['Dogs', 'Cats'], size=100, p=[0.4, 0.6])

# Contingency table of counts: rows are Gender, columns are Preference.
table = pd.crosstab(index=df['Gender'], columns=df['Preference'])
table

Preference,Cats,Dogs
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,34,21
Male,26,19


In [100]:
# Chi-squared test of independence
# Null hypothesis (H0): the two categorical variables in `table` are independent of one another.
# Reject H0 only if the p-value is below the chosen significance level (alpha).
# NOTE: when the table has one degree of freedom (2x2), chi2_contingency applies
# Yates' continuity correction by default (correction=True).
chi2_contingency(table)

Chi2ContingencyResult(statistic=0.04208754208754209, pvalue=0.8374529913100172, dof=1, expected_freq=array([[33., 22.],
       [27., 18.]]))

In [102]:
# Goodness-of-fit data: 100 simulated rolls of a fair six-sided die.
faces = [1, 2, 3, 4, 5, 6]
rolls = np.random.choice(faces, size=100, p=[1 / len(faces)] * len(faces))

# Count how often each face came up. `minlength` guarantees one bin per face
# even if the highest face(s) never occur; without it bincount stops at
# max(rolls) and the result could be shorter than `expected_counts`.
# The [1:] slice drops the bin for the value 0, which is not a face.
observed_counts = np.bincount(rolls, minlength=max(faces) + 1)[1:]

# Under a fair die every face is expected equally often; derive the expectation
# from the actual number of rolls so the two vectors always sum to the same total.
expected_counts = np.full(len(faces), rolls.size / len(faces))

print(observed_counts)
print(expected_counts)

[17 14 15 17 19 18]
[16.66666667 16.66666667 16.66666667 16.66666667 16.66666667 16.66666667]


In [104]:
# Chi-squared goodness-of-fit test
# Null hypothesis (H0): the observed counts follow the expected (theoretical) distribution.
# Reject H0 and conclude the die is unfair only if the p-value is below alpha.
chisquare(observed_counts, expected_counts)

Power_divergenceResult(statistic=1.04, pvalue=0.9592754057471847)

In [106]:
# Homogeneity data: favorite fruit by gender.
# Each group is drawn separately with its own response probabilities.
fruits = ['Apples', 'Bananas', 'Cherries']
male = np.random.choice(fruits, size=50, p=[0.3, 0.5, 0.2])
female = np.random.choice(fruits, size=50, p=[0.3, 0.4, 0.3])

# Long-format frame: one row per respondent. The group labels are sized from
# the arrays themselves so they always line up with the concatenated responses.
homogeneity_data = pd.DataFrame({
    'Group': ['Male'] * len(male) + ['Female'] * len(female),
    'Response': np.concatenate([male, female]),
})

# Contingency table of counts: rows are Group, columns are Response.
table = pd.crosstab(homogeneity_data['Group'], homogeneity_data['Response'])
table

Response,Apples,Bananas,Cherries
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,19,14,17
Male,17,24,9


In [108]:
# Chi-squared test of homogeneity
# Null hypothesis (H0): the groups are homogeneous, i.e. responses are distributed the same way in every group.
# Reject H0 and conclude there is a significant difference between groups only if the p-value is below alpha.
chi2_contingency(table)

Chi2ContingencyResult(statistic=5.204228520017994, pvalue=0.07411671044624908, dof=2, expected_freq=array([[18., 19., 13.],
       [18., 19., 13.]]))