In [5]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, chi2_contingency
from IPython.display import display
import sys
import os
sys.path.append(os.path.abspath("..")) 
from utils.variables import bin_vars, cat_vars, numeric_vars
pd.set_option('display.max_rows', None) 

In [2]:
# Load the dataset
df = pd.read_csv('../data/student_preprocessed.csv')

In [6]:
# 1. T-tests between groups for numeric variables

# Collect results
results = []

for var in bin_vars:
    for col in numeric_vars:
        group0 = df[df[var] == 0][col]
        group1 = df[df[var] == 1][col]
        
        # Calculate variance ratio
        var0 = np.var(group0, ddof=1)
        var1 = np.var(group1, ddof=1)
        ratio = max(var0, var1) / min(var0, var1)
        
        # Choose t-test type
        if ratio > 1.5:
            t_stat, p_val = ttest_ind(group0, group1, equal_var=False)
            test_type = "Welch's t-test"
        else:
            t_stat, p_val = ttest_ind(group0, group1, equal_var=True)
            test_type = "Student's t-test"
        
        results.append({
            "Group Variable": var,
            "Numeric Variable": col,
            "Test Type": test_type,
            "t-statistic": round(t_stat, 3),
            "p-value": round(p_val, 5),
            "Variance Ratio": round(ratio, 2)
        })

# Convert to DataFrame
t_results_df = pd.DataFrame(results)

# Sort by p-value (ascending)
t_results_df = t_results_df.sort_values(by="p-value", ascending=True).reset_index(drop=True)

# Display nicely
display(t_results_df) 


Unnamed: 0,Group Variable,Numeric Variable,Test Type,t-statistic,p-value,Variance Ratio
0,higher_binary,Fedu,Welch's t-test,-6.459,0.0,1.96
1,sex_binary,Walc,Welch's t-test,-8.187,0.0,1.78
2,sex_binary,Dalc,Welch's t-test,-6.85,0.0,2.99
3,internet_binary,Medu,Student's t-test,-7.02,0.0,1.13
4,internet_binary,Fedu,Student's t-test,-4.748,0.0,1.28
5,higher_binary,studytime,Student's t-test,-4.876,0.0,1.24
6,higher_binary,failures,Welch's t-test,5.062,0.0,3.73
7,sex_binary,studytime,Student's t-test,5.361,0.0,1.06
8,higher_binary,Medu,Welch's t-test,-7.016,0.0,1.81
9,romantic_binary,age,Student's t-test,-4.623,0.0,1.17


In [4]:
# 2. Chi-square tests for categorical variables

# Store results
results = []

# Loop over unique pairs
for i in range(len(cat_vars)):
    for j in range(i+1, len(cat_vars)):
        var1 = cat_vars[i]
        var2 = cat_vars[j]
        
        # Contingency table
        table = pd.crosstab(df[var1], df[var2])
        
        # Chi-square test
        chi2, p, dof, expected = chi2_contingency(table)
        
        # Cramer's V effect size
        n = table.sum().sum()
        min_dim = min(table.shape) - 1
        cramers_v = np.sqrt(chi2 / (n * min_dim)) if min_dim > 0 else np.nan
        
        # Append to results
        results.append({
            'Variable 1': var1,
            'Variable 2': var2,
            'Chi2': chi2,
            'p-value': p,
            'dof': dof,
            "Cramer's V": cramers_v
        })

# Convert to DataFrame
chi2_df = pd.DataFrame(results)

# Sort by p-value or effect size if desired
chi2_df = chi2_df.sort_values('p-value')

# Display
pd.set_option('display.max_rows', None)
display(chi2_df)



Unnamed: 0,Variable 1,Variable 2,Chi2,p-value,dof,Cramer's V
187,Dalc,Walc,442.393095,4.575918e-84,16,0.412812
135,Medu,Fedu,388.485086,9.405231000000001e-73,16,0.386843
180,freetime,goout,144.610236,8.986106e-23,16,0.236019
185,goout,Walc,137.689487,2.039735e-21,16,0.230302
17,sex_binary,Walc,75.112528,1.886326e-15,4,0.3402
105,higher_binary,failures,67.073982,1.805605e-14,3,0.321481
16,sex_binary,Dalc,57.773635,8.513228e-12,4,0.298361
113,internet_binary,Medu,48.732837,6.638317e-10,4,0.274024
11,sex_binary,studytime,43.276444,2.149879e-09,3,0.258228
104,higher_binary,studytime,34.300357,1.712027e-07,3,0.229894
