In [None]:
# Standard library
import os
import sys

# Third-party
import numpy as np
import pandas as pd
from IPython.display import display
from scipy.stats import chi2_contingency, ttest_ind

# Put the parent of the current working directory on the import path so the
# local `utils` package resolves; this must run before the import below.
sys.path.append(os.path.abspath(".."))
from utils.variables import bin_vars, cat_vars, numeric_vars

# Show every row of displayed tables instead of pandas' truncated preview.
pd.set_option("display.max_rows", None)

In [None]:
# Read the student CSV into `df`; the path is relative to the notebook's
# working directory.
data_path = "../data/student_preprocessed.csv"
df = pd.read_csv(data_path)

In [None]:
# 1. T-tests between groups for numeric variables
#
# For every (binary grouping variable, numeric variable) pair, the numeric
# variable is compared between the rows where the grouping variable equals 0
# and the rows where it equals 1 with an independent two-sample t-test.
# NOTE(review): the p-values are not adjusted for the number of tests run.

# Welch's t-test replaces Student's when the larger sample variance exceeds
# the smaller one by more than this factor.
VARIANCE_RATIO_THRESHOLD = 1.5

RESULT_COLUMNS = [
    "Group Variable",
    "Numeric Variable",
    "Test Type",
    "t-statistic",
    "p-value",
    "Variance Ratio",
]


def _two_group_t_test(group0, group1, ratio_threshold=VARIANCE_RATIO_THRESHOLD):
    """Compare two samples with Student's or Welch's independent t-test.

    Parameters
    ----------
    group0, group1 : pandas.Series
        The two samples. Missing values are dropped first so the variance
        ratio and the t-test are computed on the same observations.
    ratio_threshold : float
        Welch's test (``equal_var=False``) is used when the ratio of the
        larger to the smaller sample variance is strictly above this value;
        otherwise Student's test (``equal_var=True``) is used.

    Returns
    -------
    tuple
        ``(test_type, t_stat, p_val, ratio)``. ``ratio`` is NaN when a
        variance is undefined (fewer than two observations) or both are zero,
        in which case Student's test is selected.
    """
    group0 = group0.dropna()
    group1 = group1.dropna()

    # np.max / np.min propagate NaN whatever the argument order; the built-in
    # max / min give an order-dependent answer when one variance is NaN.
    variances = [group0.var(ddof=1), group1.var(ddof=1)]
    with np.errstate(divide="ignore", invalid="ignore"):
        # A zero denominator yields inf (or NaN for 0/0) without a warning.
        ratio = np.max(variances) / np.min(variances)

    use_welch = bool(ratio > ratio_threshold)
    t_stat, p_val = ttest_ind(group0, group1, equal_var=not use_welch)
    test_type = "Welch's t-test" if use_welch else "Student's t-test"
    return test_type, t_stat, p_val, ratio


# Collect results (unrounded, so the sort below sees the real p-values)
results = []

for var in bin_vars:
    # The group masks depend only on the grouping variable: build them once.
    in_group0 = df[var] == 0
    in_group1 = df[var] == 1

    for col in numeric_vars:
        test_type, t_stat, p_val, ratio = _two_group_t_test(
            df.loc[in_group0, col], df.loc[in_group1, col]
        )
        results.append({
            "Group Variable": var,
            "Numeric Variable": col,
            "Test Type": test_type,
            "t-statistic": t_stat,
            "p-value": p_val,
            "Variance Ratio": ratio,
        })

# Convert to DataFrame, sort by p-value (ascending), then round for display.
# Sorting happens BEFORE rounding: rounding first would collapse every very
# small p-value to 0.0 and leave their relative order arbitrary. Explicit
# columns keep the sort valid even when there are no results at all.
t_results_df = (
    pd.DataFrame(results, columns=RESULT_COLUMNS)
    .sort_values(by="p-value", ascending=True, kind="stable")
    .reset_index(drop=True)
    .round({"t-statistic": 3, "p-value": 5, "Variance Ratio": 2})
)

# Display
display(t_results_df)


In [None]:
# 2. Chi-square tests for categorical variables
#
# Every unordered pair of categorical variables is tested for independence
# with a chi-square test on their contingency table, and Cramér's V is
# reported as the effect size.
# NOTE(review): the p-values are not adjusted for the number of pairs tested.

CHI2_COLUMNS = ['Variable 1', 'Variable 2', 'Chi2', 'p-value', 'dof', "Cramer's V"]

# Store results (one record per variable pair)
results = []

# Loop over unique pairs: each variable is paired only with the ones after it
for i, row_var in enumerate(cat_vars):
    for col_var in cat_vars[i + 1:]:
        # Contingency table of observed counts
        table = pd.crosstab(df[row_var], df[col_var])

        if table.size == 0:
            # No row has a value for both variables. chi2_contingency raises
            # ValueError on an empty table, so record the pair as untestable
            # instead of aborting the whole loop.
            chi2, p, dof, cramers_v = np.nan, np.nan, np.nan, np.nan
        else:
            # Chi-square test (Yates' continuity correction is applied by
            # default when dof == 1, i.e. for 2x2 tables)
            chi2, p, dof, _expected = chi2_contingency(table)

            # Cramér's V effect size. It is derived from the UNCORRECTED
            # statistic: using the Yates-corrected value would bias V downward
            # for 2x2 tables only, making V incomparable across pairs. For
            # those pairs V therefore differs from sqrt(Chi2 / (n * min_dim))
            # computed from the reported 'Chi2' column.
            if dof == 1:
                chi2_for_v = chi2_contingency(table, correction=False)[0]
            else:
                chi2_for_v = chi2
            n = table.to_numpy().sum()
            min_dim = min(table.shape) - 1
            cramers_v = np.sqrt(chi2_for_v / (n * min_dim)) if min_dim > 0 else np.nan

        # Append to results
        results.append({
            'Variable 1': row_var,
            'Variable 2': col_var,
            'Chi2': chi2,
            'p-value': p,
            'dof': dof,
            "Cramer's V": cramers_v
        })

# Convert to DataFrame and sort by p-value. Explicit columns keep the sort
# valid when fewer than two categorical variables are available; the index is
# reset so it reflects the sorted order.
chi2_df = (
    pd.DataFrame(results, columns=CHI2_COLUMNS)
    .sort_values('p-value', kind='stable')
    .reset_index(drop=True)
)

# Display every row without changing the global pandas option
with pd.option_context('display.max_rows', None):
    display(chi2_df)

