In [1]:
import pandas as pd
import os
from scipy.stats import ks_2samp, chi2_contingency

In [2]:
# Switch the working directory to the parent of the current one so that the
# relative data paths used further down resolve from there (presumably the
# project root -- confirm against the repository layout).
# NOTE: the parent is computed from the *current* working directory, so running
# this cell a second time climbs one more level.
cwd = os.getcwd()
parent_dir = os.path.abspath(os.path.join(cwd, os.pardir))
os.chdir(parent_dir)

In [3]:
# Path of the credit-risk workbook, relative to the current working directory.
credit_risk_dir = os.path.join("data", "credit_risk")
credit_risk_file = os.path.join(credit_risk_dir, "dataproject2024.xlsx")

# Fail fast with an explicit message when the workbook is missing. Silently
# skipping the load would leave `df` undefined and surface later as an
# unrelated NameError.
if not os.path.exists(credit_risk_file):
    raise FileNotFoundError(
        f"Credit risk dataset not found at '{credit_risk_file}' "
        f"(current working directory: '{os.getcwd()}')."
    )

# Read only the listed columns from the workbook.
df = pd.read_excel(
    credit_risk_file,
    usecols=[
        "Job tenure", "Age",
        "Car price", "Funding amount", "Down payment",
        "Loan duration", "Monthly payment", "Credit event",
        "Married", "Homeowner", "Default (y)",
    ],
)

In [4]:
# Preview the first rows (DataFrame.head defaults to five) of the loaded frame.
df.head()

Unnamed: 0,Job tenure,Age,Car price,Funding amount,Down payment,Loan duration,Monthly payment,Credit event,Married,Homeowner,Default (y)
0,34,55,4875,3087,0,36,0.047895,0,1,1,0
1,5,29,13000,13000,0,60,0.091667,0,0,0,1
2,14,38,17190,14190,0,60,0.088235,0,0,0,0
3,16,37,22773,23568,0,48,0.110084,0,1,1,0
4,1,61,7700,8526,0,48,0.123404,0,1,0,1


In [5]:
def test_heterogeneity_within_dataset(df, subgroup_col, target_col, numeric_cols, categorical_cols):
    subgroups = df[subgroup_col].dropna().unique()
    results = {'numeric': {}, 'categorical': {}}

    print(f"Comparing subgroups within '{subgroup_col}' for heterogeneity in feature distributions...\n")

    # Numerical features
    for col in numeric_cols:
        print(f"{col} (KS Test)")
        for i in range(len(subgroups)):
            for j in range(i+1, len(subgroups)):
                g1 = df[df[subgroup_col] == subgroups[i]][col].dropna()
                g2 = df[df[subgroup_col] == subgroups[j]][col].dropna()
                stat, p = ks_2samp(g1, g2)
                print(f"{subgroups[i]} vs {subgroups[j]}: KS = {stat:.4f}, p = {p:.4f}")
                results['numeric'][(col, subgroups[i], subgroups[j])] = (stat, p)

    # Categorical features
    print("\nCategorical features (Chi-Square Test)")
    for col in categorical_cols:
        ct = pd.crosstab(df[col], df[subgroup_col])
        chi2, p, dof, _ = chi2_contingency(ct)
        print(f"{col}: Chi2 = {chi2:.4f}, p = {p:.4f}")
        results['categorical'][col] = (chi2, p)

    # Target distribution check
    print(f"\nTarget variable ('{target_col}') distribution by subgroup:")
    print(df.groupby(subgroup_col)[target_col].value_counts(normalize=True).unstack().round(3))

    return results


In [6]:
target_col = "Default (y)"
numeric_cols = ["Job tenure", "Age", "Car price", "Funding amount", "Loan duration", "Monthly payment"]
categorical_cols = ["Down payment", "Credit event", "Married", "Homeowner"]

# Use each categorical feature in turn as the grouping variable and test all
# remaining features against it. The results of every run are kept, keyed by
# grouping column; `results` alone would only hold the last iteration's output.
results_by_subgroup = {}
for subgroup_col in categorical_cols:
    # The grouping column itself is excluded from the chi-square tests.
    categorical_cols_limited = [c for c in categorical_cols if c != subgroup_col]

    results = test_heterogeneity_within_dataset(df, subgroup_col, target_col, numeric_cols, categorical_cols_limited)
    results_by_subgroup[subgroup_col] = results

Comparing subgroups within 'Down payment' for heterogeneity in feature distributions...

Job tenure (KS Test)
0 vs 1: KS = 0.1089, p = 0.0000
Age (KS Test)
0 vs 1: KS = 0.1452, p = 0.0000
Car price (KS Test)
0 vs 1: KS = 0.1173, p = 0.0000
Funding amount (KS Test)
0 vs 1: KS = 0.4593, p = 0.0000
Loan duration (KS Test)
0 vs 1: KS = 0.4000, p = 0.0000
Monthly payment (KS Test)
0 vs 1: KS = 0.1355, p = 0.0000

Categorical features (Chi-Square Test)
Credit event: Chi2 = 6.4234, p = 0.0113
Married: Chi2 = 5.7279, p = 0.0167
Homeowner: Chi2 = 16.7095, p = 0.0000

Target variable ('Default (y)') distribution by subgroup:
Default (y)       0      1
Down payment              
0             0.787  0.213
1             0.935  0.065
Comparing subgroups within 'Credit event' for heterogeneity in feature distributions...

Job tenure (KS Test)
0 vs 1: KS = 0.0449, p = 0.8884
Age (KS Test)
0 vs 1: KS = 0.0434, p = 0.9097
Car price (KS Test)
0 vs 1: KS = 0.1504, p = 0.0012
Funding amount (KS Test)
0 vs