In [34]:
import pandas as pd
from scipy.stats import ttest_ind, chi2_contingency

In [35]:
import sys
import os

# Make the project-local `src` package importable by adding the directory one
# level above the current working directory to sys.path.
project_root = os.path.abspath("..")
if project_root not in sys.path:  # guard keeps the cell idempotent on re-run
    sys.path.append(project_root)

# These imports must come after the sys.path tweak above, otherwise `src`
# may not be resolvable.
from src.utils.paths import PROCESSED_DATA_DIR
from src.utils.io import load_csv

# Load the dataset that the significance tests in this notebook operate on.
df = load_csv(PROCESSED_DATA_DIR / "validated_df.csv")

In [37]:
# Feature groups used by the significance tests in this notebook.

# Numeric features: compared between churn groups with a t-test.
numeric_cols = ["tenure", "MonthlyCharges", "TotalCharges"]

# Categorical features: tested against the target with a chi-square test.
# The order here is the order in which results are printed.
categorical_cols = [
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
]

# Name of the target column; its values are compared against 'Yes' / 'No'.
target = "Churn"

In [38]:
# Convert TotalCharges to a numeric dtype; with errors='coerce', any value that
# cannot be parsed as a number becomes NaN instead of raising.
df['TotalCharges'] = df['TotalCharges'].pipe(pd.to_numeric, errors='coerce')

##### **1. t-test for Numeric Features**

In [23]:
# Two-sample t-test per numeric feature: churned ('Yes') vs. retained ('No').
# A negative t-statistic means the churned group has the lower mean.
# NOTE(review): ttest_ind defaults to equal_var=True (Student's t-test);
# consider equal_var=False (Welch) if the group variances differ — confirm.
print('--- T-test for Numerical Columns ---')
is_churned = df[target] == 'Yes'
is_retained = df[target] == 'No'
for feature in numeric_cols:
    t_stat, p_value = ttest_ind(
        df.loc[is_churned, feature],
        df.loc[is_retained, feature],
        nan_policy='omit',  # ignore NaN values when computing the statistic
    )
    print(f'{feature}: T-stat={t_stat:.2f}, p-value={p_value:.4f}')

--- T-test for Numerical Columns ---
tenure: T-stat=-31.58, p-value=0.0000
MonthlyCharges: T-stat=16.54, p-value=0.0000
TotalCharges: T-stat=-17.07, p-value=0.0000


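All three numeric features differ significantly between churned and retained customers (p < 0.05): churned customers have lower tenure and TotalCharges, but higher MonthlyCharges, on average.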

##### **2. Chi Square Tests for Categorical Columns**

In [27]:
# Chi-square test of independence between each categorical feature and the target.
print('--- Chi Square Tests for Categorical Columns ---')
for feature in categorical_cols:
    # Observed counts for every (feature category, target value) combination.
    observed = pd.crosstab(df[feature], df[target])

    # chi2_contingency returns, in order:
    #   statistic -> chi-square statistic (strength of association)
    #   p-value   -> significance of the association
    #   dof       -> degrees of freedom used in the chi-square calculation
    #   expected  -> expected frequencies if there were no association
    # Only the first two are reported here.
    chi2_stat, p_value, _dof, _expected = chi2_contingency(observed)
    print(f'{feature}: Chi2={chi2_stat:.2f}, p-value={p_value:.4f}')

--- Chi Square Tests for Categorical Columns ---
gender: Chi2=0.48, p-value=0.4866
SeniorCitizen: Chi2=159.43, p-value=0.0000
Partner: Chi2=158.73, p-value=0.0000
Dependents: Chi2=189.13, p-value=0.0000
PhoneService: Chi2=0.92, p-value=0.3388
MultipleLines: Chi2=11.33, p-value=0.0035
InternetService: Chi2=732.31, p-value=0.0000
OnlineSecurity: Chi2=850.00, p-value=0.0000
OnlineBackup: Chi2=601.81, p-value=0.0000
DeviceProtection: Chi2=558.42, p-value=0.0000
TechSupport: Chi2=828.20, p-value=0.0000
StreamingTV: Chi2=374.20, p-value=0.0000
StreamingMovies: Chi2=375.66, p-value=0.0000
Contract: Chi2=1184.60, p-value=0.0000
PaperlessBilling: Chi2=258.28, p-value=0.0000
PaymentMethod: Chi2=648.14, p-value=0.0000


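- Gender and PhoneService are not statistically significant (p > 0.05), i.e. there is no evidence that they are associated with churn. All other categorical features are significantly associated with churn.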