> # Chi-Square Test of Independence

In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import KBinsDiscretizer
from scipy.stats import chi2_contingency

In [2]:
# Load scikit-learn's iris dataset, then pull out the feature matrix (X)
# and the class-label vector (y) in one step.
data = load_iris()
X, y = data.data, data.target

In [3]:
# This example works with a single feature: column 0 (sepal length).
feature_index = 0
sepal_length = X[:, feature_index]

In [4]:
# Turn the continuous feature into a categorical one by binning it with
# KBinsDiscretizer (uniform strategy, ordinal bin codes).
n_bins = 5  # how many bins the feature is split into
discretizer = KBinsDiscretizer(
    n_bins=n_bins,
    strategy='uniform',
    encode='ordinal',
)
# The discretizer is fed a 2-D single-column array, hence the reshape.
sepal_length_column = sepal_length.reshape(-1, 1)
sepal_length_discrete = discretizer.fit_transform(sepal_length_column)

In [5]:
# Collapse the discretizer output to one dimension and cast the bin codes
# to integers so they can be compared against / used as integer labels.
sepal_length_discrete = np.ravel(sepal_length_discrete).astype(int)

In [6]:
# Build the (bin x class) contingency table: cell [j, i] holds the number of
# samples whose discretized sepal length is bin j and whose target is class i.
n_classes = len(data.target_names)
contingency_table = np.zeros((n_bins, n_classes))

# Count every (bin, class) pair in one vectorized pass instead of rescanning
# all samples once per cell. np.add.at accumulates repeated index pairs,
# which a plain fancy-indexed `+=` would not.
np.add.at(contingency_table, (sepal_length_discrete, y), 1)

# Sanity check: every sample must have been counted in exactly one cell.
assert contingency_table.sum() == len(y), "contingency table does not cover all samples"

In [7]:
# Run the chi-square test of independence on the contingency table.
# `expected` (the expected frequencies) is kept but not printed below.
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Report each result on its own line as "<label> <value>".
for label, value in (
    ("Chi-square statistic:", chi2),
    ("P-value:", p),
    ("Degrees of freedom:", dof),
):
    print(label, value)

Chi-square statistic: 112.098649825784
P-value: 1.409589192040307e-20
Degrees of freedom: 8
