**Table of contents**<a id='toc0_'></a>    
1.1. [Load Data](#toc1_1_)    
1.2. [Basic Summary Statistics](#toc1_2_)    
1.3. [Data Visualization](#toc1_3_)    
1.4. [One Proportion Test](#toc1_4_)    
1.4.1. [Normality](#toc1_4_1_)    
1.4.2. [Robustness](#toc1_4_2_)    

<!-- vscode-jupyter-toc-config
	numbering=true
	anchor=true
	flat=true
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import scipy.stats as stats
import ipywidgets as widgets

# force reload modules
%load_ext autoreload
%autoreload

## 1.1. <a id='toc1_1_'></a>[Load Data](#toc0_)

In [2]:
# Read the 'cat_univar' sheet of the Excel workbook into a DataFrame and preview its first rows
df = pd.read_excel("data/data.xlsx", sheet_name='cat_univar')
df.head()

Unnamed: 0,Opinion
0,yes
1,yes
2,yes
3,yes
4,no


In [3]:
# Tally how many rows hold each distinct value
df.value_counts()

Opinion
no         71
yes        68
Name: count, dtype: int64

## 1.2. <a id='toc1_2_'></a>[Basic Summary Statistics](#toc0_)

In [4]:
# Work on a copy so the loaded frame `df` is left untouched
data = df.copy()

In [5]:
# Synthetic three-category sample analysed in the rest of the notebook:
# 25 'yes', 15 'no' and 5 'neut' answers (45 observations in total).
# The earlier assignments to X (flattened `data`, an unseeded random draw) and
# `n = 50` were overwritten before use, so only the effective definition is kept.
X = np.array(['yes'] * 25 + ['no'] * 15 + ['neut'] * 5, dtype='object')

In [6]:
# Summarise the sample with describe(); the cell displays the resulting Series
summary_stats = pd.Series(X).describe()
summary_stats

count      45
unique      3
top       yes
freq       25
dtype: object

## 1.3. <a id='toc1_3_'></a>[Data Visualization](#toc0_)

In [7]:
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.stats.stattools import medcouple

def plot_univariate_cat(X, y_stats='proportion'):
	"""Draw a bar chart of a categorical sample, one bar per category.

	Parameters
	----------
	X : array-like of category labels
		The sample to plot. Bars follow the sorted order returned by ``np.unique``.
	y_stats : str, default 'proportion'
		Statistic shown on the y-axis; forwarded to ``sns.countplot`` as ``stat``.

	Each x tick is labelled ``# <category> = <count>``. The labels are built from
	the same category order that is passed to ``countplot``, so they stay aligned
	with the bars for any number of categories.
	"""
	# One pass gives both the bar order and the per-category counts
	categories, counts = np.unique(X, return_counts=True)
	plt.figure(figsize=(5,3))
	# Plot the category frequencies with seaborn
	sns.countplot(x=X, order=categories, palette='pastel', hue=X, edgecolor='black', stat=y_stats, width=0.5)
	# Categorical bars sit at positions 0..k-1 in `order`; label each with its count
	labels = [f'# {category} = {count}' for category, count in zip(categories, counts)]
	plt.xticks(ticks=np.arange(len(categories)), labels=labels)
	plt.title('Categorical Variable Distribution')

# Interactive view of the plot: X is held fixed while the dropdown chooses the
# value passed to plot_univariate_cat as y_stats. The trailing ';' suppresses
# the cell's return-value repr.
widgets.interact(
    plot_univariate_cat,
    X=widgets.fixed(X),
    y_stats=widgets.Dropdown(options=['count', 'proportion', 'probability', 'percent'], description='Stats:')
    );


interactive(children=(Dropdown(description='Stats:', options=('count', 'proportion', 'probability', 'percent')…

## 1.4. <a id='toc1_4_'></a>[One Proportion Test](#toc0_)

#### 2 Categories

"""
Guidance on which test to use for a one-sample proportion:
- Exact binomial test (`scipy.stats.binomtest`)
- Normal-approximation z-test (`statsmodels.stats.proportion.proportions_ztest`)

Heuristics used:
1. If n is small -> prefer binomtest
2. If the expected counts (n * p0 or n * (1 - p0)) are small -> prefer binomtest
3. If the hypothesized proportion p0 is very close to 0 or 1 (skewed) -> prefer binomtest
Otherwise -> the z-test is fine.
"""

In [19]:
# Data
count = np.sum(X == 'yes')  # Number of successes ('yes' answers) observed
n = len(X)  # Total number of observations
p = count / n  # Observed sample proportion (p-hat), not the hypothesized value
# Hypothesized population proportion under H0. Kept at 0.3 so the z-test and the
# exact binomial test evaluate the same null hypothesis on a fresh top-to-bottom run.
p0 = 0.3

In [22]:
from statsmodels.stats.proportion import proportions_ztest



# Two-sided one-sample z-test of H0: true proportion == p0.
# NOTE(review): no prop_var is passed, so the standard error appears to be based on
# the sample proportion rather than p0 — confirm this is the intended variant.
stat, p_value = proportions_ztest(count, n, value=p0, alternative='two-sided')

# Report the inputs followed by the test results
print(f"Successes: {count}")
print(f"Total n: {n}")
print(f"Hypothesized proportion p0 : {p0}")
print("Z-statistic:", stat)
print("p-value:", p_value)

# Decision at the 5% significance level
alpha = 0.05
verdict = (
    "Reject H0: The true proportion is significantly different from"
    if p_value < alpha
    else "Fail to reject H0: No evidence that the true proportion differs from"
)
print(verdict, p0)

Successes: 25
Total n: 45
Hypothesized proportion p0 : 0.3
Z-statistic: 3.4500000000000006
p-value: 0.0005605865536323534
Reject H0: The true proportion is significantly different from 0.3


In [23]:
# Exact two-sided binomial test of H0: true proportion == p0
p0 = 0.3
result = stats.binomtest(count, n, p=p0, alternative='two-sided')
print(f"P-value: {result.pvalue}")
print(f"Test statistic (proportion estimate): {result.statistic}")

# 95% confidence interval for the true proportion
ci = result.proportion_ci(confidence_level=0.95)
print(f"95% Confidence Interval: [{ci.low:.2f}, {ci.high:.2f}]")

# Decision at the 5% significance level
alpha = 0.05
verdict = (
    "Reject H0: The true proportion is significantly different from"
    if result.pvalue < alpha
    else "Fail to reject H0: No evidence that the true proportion differs from"
)
print(verdict, p0)

P-value: 0.00045518593496529967
Test statistic (proportion estimate): 0.5555555555555556
95% Confidence Interval: [0.40, 0.70]
Reject H0: The true proportion is significantly different from 0.3


#### 3 or more Categories

In [28]:
# Observed frequency of each category, in sorted category order.
# np.unique(..., return_counts=True) is used instead of np.unique_counts so the
# cell also runs on NumPy versions older than 2.0.
observed = np.unique(X, return_counts=True)[1]
n_cat = len(observed)
# Total number of observations
n = observed.sum()

# Expected frequencies under H0: the same count in every category
expected = np.full(n_cat, n / n_cat)

# Perform chi-square goodness-of-fit test
chi2_stat, p_value = stats.chisquare(f_obs=observed, f_exp=expected)

print("Observed counts:", observed)
print("Expected counts:", expected)
print("Chi-square statistic:", chi2_stat)
print("p-value:", p_value)

# Simple interpretation at alpha = 0.05; the message reports the actual number
# of categories instead of assuming there are three
alpha = 0.05
if p_value < alpha:
    print(f"Reject H0: The distribution is not uniform across the {n_cat} categories.")
else:
    print(f"Fail to reject H0: No evidence against a uniform distribution across the {n_cat} categories.")

Observed counts: [ 5 15 25]
Expected counts: [15. 15. 15.]
Chi-square statistic: 13.333333333333334
p-value: 0.0012726338013398079
Reject H0: The distribution is not uniform across the 3 categories.


* **Post-Hoc Analysis**: Pairwise chi-square proportion test

In [36]:
import numpy as np
import pandas as pd
from scipy.stats import chisquare
from itertools import combinations
from collections import Counter

if not (p_value < alpha):
    print("No need to do Post-Hoc Analysis because p-value is not significant")
else:
    print("__________Need Post-Hoc Analysis___________")

    # Frequency of each category, keyed in order of first appearance in X
    counts = Counter(X)
    alpha = 0.05

    # Every unordered pair of categories is one comparison
    category_pairs = list(combinations(counts.keys(), 2))
    n_comparisons = len(category_pairs)

    # Bonferroni correction for multiple comparisons (optional but recommended):
    # the significance level is split evenly across the comparisons
    use_correction = True
    alpha_corrected = alpha / n_comparisons
    threshold = alpha_corrected if use_correction else alpha

    # Chi-square test on each pair's two counts (chisquare's default expected
    # frequencies are used, as no f_exp is given)
    results = []
    for cat1, cat2 in category_pairs:
        chi2, pval = chisquare([counts[cat1], counts[cat2]])
        row = {
            'group1': cat1,
            'group2': cat2,
            'Chi2': chi2,
            'p-value': pval,
            'p-value_corrected': min(pval * n_comparisons, 1.0),
            'significant': pval < threshold,
        }
        results.append(row)

    df_results = pd.DataFrame(results)
    print(f"Alpha: {alpha}, Bonferroni-corrected alpha: {alpha_corrected:.4f} (n_comparisons={n_comparisons})\n")
    display(df_results)

__________Need Post-Hoc Analysis___________
Alpha: 0.05, Bonferroni-corrected alpha: 0.0167 (n_comparisons=3)



Unnamed: 0,group1,group2,Chi2,p-value,p-value_corrected,significant
0,yes,no,2.5,0.113846,0.341539,False
1,yes,neut,13.333333,0.000261,0.000782,True
2,no,neut,5.0,0.025347,0.076042,False
