In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, chi2_contingency
from IPython.display import display
import sys
import os
sys.path.append(os.path.abspath("..")) 
from utils.variables import bin_vars, cat_vars, numeric_vars

In [2]:
# Load the dataset
df = pd.read_csv('../data/student_preprocessed.csv')

In [3]:
# 1. T-tests between groups for numeric variables

# Collect results
results = []

for var in bin_vars:
    for col in numeric_vars:
        group0 = df[df[var] == 0][col]
        group1 = df[df[var] == 1][col]
        
        # Calculate variance ratio
        var0 = np.var(group0, ddof=1)
        var1 = np.var(group1, ddof=1)
        ratio = max(var0, var1) / min(var0, var1)
        
        # Choose t-test type
        if ratio > 1.5:
            t_stat, p_val = ttest_ind(group0, group1, equal_var=False)
            test_type = "Welch's t-test"
        else:
            t_stat, p_val = ttest_ind(group0, group1, equal_var=True)
            test_type = "Student's t-test"
        
        results.append({
            "Group Variable": var,
            "Numeric Variable": col,
            "Test Type": test_type,
            "t-statistic": round(t_stat, 3),
            "p-value": round(p_val, 5),
            "Variance Ratio": round(ratio, 2)
        })

# Convert to DataFrame
t_results_df = pd.DataFrame(results)

# Sort by p-value (ascending)
t_results_df = t_results_df.sort_values(by="p-value", ascending=True).reset_index(drop=True)

# Display nicely
display(t_results_df)  # show top 20 smallest p-values


Unnamed: 0,Group Variable,Numeric Variable,Test Type,t-statistic,p-value,Variance Ratio
0,romantic,age,Student's t-test,-4.623,0.00000,1.17
1,higher,G1,Welch's t-test,-12.434,0.00000,2.01
2,sex,Dalc,Welch's t-test,-6.850,0.00000,2.99
3,sex,Walc,Welch's t-test,-8.187,0.00000,1.78
4,internet,Fedu,Student's t-test,-4.748,0.00000,1.28
...,...,...,...,...,...,...
203,nursery,health,Student's t-test,-0.043,0.96550,1.05
204,Pstatus,G3,Student's t-test,0.019,0.98471,1.01
205,schoolsup,failures,Student's t-test,0.019,0.98488,1.10
206,activities,failures,Student's t-test,-0.014,0.98862,1.37


In [4]:
# 2. Chi-square tests for categorical variables

# Store results
results = []

# Loop over unique pairs
for i in range(len(cat_vars)):
    for j in range(i+1, len(cat_vars)):
        var1 = cat_vars[i]
        var2 = cat_vars[j]
        
        # Contingency table
        table = pd.crosstab(df[var1], df[var2])
        
        # Chi-square test
        chi2, p, dof, expected = chi2_contingency(table)
        
        # Cramer's V effect size
        n = table.sum().sum()
        min_dim = min(table.shape) - 1
        cramers_v = np.sqrt(chi2 / (n * min_dim)) if min_dim > 0 else np.nan
        
        # Append to results
        results.append({
            'Variable 1': var1,
            'Variable 2': var2,
            'Chi2': chi2,
            'p-value': p,
            'dof': dof,
            "Cramer's V": cramers_v
        })

# Convert to DataFrame
chi2_df = pd.DataFrame(results)

# Sort by p-value or effect size if desired
chi2_df = chi2_df.sort_values('p-value')

# Display
pd.set_option('display.max_rows', None)
display(chi2_df)



Unnamed: 0,Variable 1,Variable 2,Chi2,p-value,dof,Cramer's V
27,school,school,644.601517,3.335663e-142,1,0.996606
378,Dalc,Walc,442.393095,4.575918e-84,16,0.412812
286,Medu,Fedu,388.485086,9.405231000000001e-73,16,0.386843
296,Medu,Mjob,378.367555,1.231884e-70,16,0.381772
311,Fedu,Fjob,200.765958,5.5522349999999995e-34,16,0.278095
361,freetime,goout,144.610236,8.986106e-23,16,0.236019
371,goout,Walc,137.689487,2.039735e-21,16,0.230302
396,Mjob,Fjob,134.382083,9.016148e-21,16,0.227519
1,school,address,79.958681,3.823217e-19,1,0.351003
80,address,school,79.958681,3.823217e-19,1,0.351003
