In [3]:
import os
import time
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SequentialFeatureSelector

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from scipy.stats import pearsonr

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


In [4]:

csv_path = "/Users/satvikkesarwani/Downloads/loan_dataset_20000.csv"
df = pd.read_csv(csv_path)
print("Loaded shape:", df.shape)
df.head()


Loaded shape: (20000, 22)


Unnamed: 0,age,gender,marital_status,education_level,annual_income,monthly_income,employment_status,debt_to_income_ratio,credit_score,loan_amount,...,loan_term,installment,grade_subgrade,num_of_open_accounts,total_credit_limit,current_balance,delinquency_history,public_records,num_of_delinquencies,loan_paid_back
0,59,Male,Married,Master's,24240.19,2020.02,Employed,0.074,743,17173.72,...,36,581.88,B5,7,40833.47,24302.07,1,0,1,1
1,72,Female,Married,Bachelor's,20172.98,1681.08,Employed,0.219,531,22663.89,...,60,573.17,F1,5,27968.01,10803.01,1,0,3,1
2,49,Female,Single,High School,26181.8,2181.82,Employed,0.234,779,3631.36,...,60,76.32,B4,2,15502.25,4505.44,0,0,0,1
3,35,Female,Single,High School,11873.84,989.49,Employed,0.264,809,14939.23,...,36,468.07,A5,7,18157.79,5525.63,4,0,5,1
4,63,Other,Single,Other,25326.44,2110.54,Employed,0.26,663,16551.71,...,60,395.5,D5,1,17467.56,3593.91,2,0,2,1


In [5]:
def map_target_to_binary(series):
    s = series.copy()
    if s.dtype == object or s.dtype.name == "category":
        s_lower = s.astype(str).str.strip().str.lower()
        pos = {"yes","y","true","t","paid","paid_back","1","1.0","accepted"}
        neg = {"no","n","false","f","not_paid","0","0.0","rejected"}

        mapped = s_lower.map(lambda v: 1 if v in pos else (0 if v in neg else np.nan))

        if mapped.isna().any():
            try:
                numeric = pd.to_numeric(s, errors="coerce")
                mapped = mapped.fillna((numeric != 0).astype(int))
            except:
                cats = list(pd.Categorical(s).categories)
                if len(cats) == 2:
                    mapped = mapped.fillna(series.map({cats[0]:0, cats[1]:1}))
                else:
                    mapped = mapped.fillna(0)
        return mapped.astype(int)
    else:
        try:
            num = pd.to_numeric(s, errors="coerce").fillna(0)
            if set(np.unique(num)) <= {0,1}:
                return num.astype(int)
            else:
                return (num != 0).astype(int)
        except:
            return (s != 0).astype(int)

y = map_target_to_binary(df["loan_paid_back"])
df = df.loc[y.notnull()].reset_index(drop=True)
y = y.loc[y.notnull()].reset_index(drop=True)
X = df.drop(columns=["loan_paid_back"])


In [6]:
def detect_columns(df):
    numeric_guess = [
        "age", "annual_income", "monthly_income", "debt_to_income_ratio", "credit_score",
        "loan_amount", "interest_rate", "loan_term", "installment",
        "num_of_open_accounts", "total_credit_limit", "current_balance",
        "num_of_delinquencies"
    ]

    categorical_guess = [
        "gender", "marital_status", "education_level", "employment_status",
        "loan_purpose", "grade_subgrade",
        "delinquency_history", "public_records"
    ]

    numeric_cols = [c for c in numeric_guess if c in df.columns]
    categorical_cols = [c for c in categorical_guess if c in df.columns]

    other = [c for c in df.columns if c not in numeric_cols and c not in categorical_cols]
    for c in other:
        if df[c].dtype == object or df[c].nunique() < 30:
            categorical_cols.append(c)
        else:
            numeric_cols.append(c)

    return numeric_cols, categorical_cols

numeric_cols, categorical_cols = detect_columns(X)
numeric_cols, categorical_cols


(['age',
  'annual_income',
  'monthly_income',
  'debt_to_income_ratio',
  'credit_score',
  'loan_amount',
  'interest_rate',
  'loan_term',
  'installment',
  'num_of_open_accounts',
  'total_credit_limit',
  'current_balance',
  'num_of_delinquencies'],
 ['gender',
  'marital_status',
  'education_level',
  'employment_status',
  'loan_purpose',
  'grade_subgrade',
  'delinquency_history',
  'public_records'])

In [8]:
def build_preprocessor(numeric_cols, categorical_cols):
    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])

    return ColumnTransformer([
        ("num", num_pipe, numeric_cols),
        ("cat", cat_pipe, categorical_cols)
    ])

preprocessor = build_preprocessor(numeric_cols, categorical_cols)

# Fit + transform
preprocessor.fit(X)
X_trans = preprocessor.transform(X)

# build feature names
ohe_names = []
if categorical_cols:
    ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
    ohe_names = list(ohe.get_feature_names_out(categorical_cols))

feature_names = numeric_cols + ohe_names

len(feature_names)


82

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X_trans, y, test_size=0.30, random_state=42, stratify=y
)

X_train.shape, X_test.shape


((14000, 82), (6000, 82))

In [10]:
def try_smote(X, y):
    try:
        from imblearn.over_sampling import SMOTE
        sm = SMOTE(random_state=42)
        return sm.fit_resample(X, y), True
    except:
        return None, False

def upsample(X, y):
    dfx = pd.DataFrame(X)
    dfx["target"] = y
    maj = dfx["target"].mode()[0]
    majority = dfx[dfx["target"] == maj]
    minority = dfx[dfx["target"] != maj]

    minority_up = minority.sample(len(majority), replace=True, random_state=42)
    final = pd.concat([majority, minority_up]).sample(frac=1, random_state=42)

    return final.drop(columns=["target"]).values, final["target"].values


In [11]:
def evaluate(m, X_t, y_t):
    y_pred = m.predict(X_t)
    try:
        y_prob = m.predict_proba(X_t)[:,1]
    except:
        from scipy.special import expit
        y_prob = expit(m.decision_function(X_t))
    return {
        "accuracy": accuracy_score(y_t, y_pred),
        "precision": precision_score(y_t, y_pred, zero_division=0),
        "recall": recall_score(y_t, y_pred, zero_division=0),
        "f1": f1_score(y_t, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_t, y_prob)
    }

def run_experiment(name, X_train_full, y_train, X_test_full, y_test,
                   feature_names, sfs_k=10, use_smote=True):

    X_train_proc = X_train_full.copy()
    X_test_proc = X_test_full.copy()
    fnames = feature_names.copy()

    if name == "corr":
        corrs = []
        for i in range(X_train_full.shape[1]):
            corrs.append(abs(pearsonr(X_train_full[:,i], y_train)[0]))

        corrs = np.array(corrs)
        sel = np.where(corrs >= 0.05)[0]
        if len(sel) == 0:
            sel = np.argsort(corrs)[-20:]

        X_train_proc = X_train_full[:,sel]
        X_test_proc = X_test_full[:,sel]
        fnames = [feature_names[i] for i in sel]

    if name == "sfs":
        base = MLPClassifier(hidden_layer_sizes=(64,32), max_iter=200, random_state=42)
        sfs = SequentialFeatureSelector(base, n_features_to_select=sfs_k,
                                        scoring="roc_auc", cv=5, n_jobs=-1)
        sfs.fit(X_train_full, y_train)
        sel = np.where(sfs.get_support())[0]

        X_train_proc = X_train_full[:,sel]
        X_test_proc = X_test_full[:,sel]
        fnames = [feature_names[i] for i in sel]

    res, ok = try_smote(X_train_proc, y_train)
    if ok:
        (X_train_bal, y_train_bal) = res
    else:
        X_train_bal, y_train_bal = upsample(X_train_proc, y_train)

    models = {
        "MLP": MLPClassifier(hidden_layer_sizes=(128,64), max_iter=300),
        "KNN": KNeighborsClassifier(n_neighbors=5),
        "SVM": SVC(probability=True),
        "Logistic": LogisticRegression(max_iter=1000),
        "RF": RandomForestClassifier(n_estimators=200)
    }

    rows = []
    best = {"roc_auc": -1, "model": None, "estimator": None, "features": fnames}

    for name_, model in models.items():
        t0 = time.time()
        model.fit(X_train_bal, y_train_bal)
        t1 = time.time()

        scores = evaluate(model, X_test_proc, y_test)
        scores["model"] = name_
        scores["train_time"] = round(t1-t0,3)
        scores["features_used"] = X_train_bal.shape[1]

        if scores["roc_auc"] > best["roc_auc"]:
            best.update({"roc_auc": scores["roc_auc"], "model": name_, "estimator": model, "features": fnames})

        rows.append(scores)

    return pd.DataFrame(rows), best


In [12]:
experiments = ["full", "corr", "sfs"]
overall_best = {"roc_auc": -1}

for exp in experiments:
    print(f"\n===== Running {exp.upper()} =====")
    dfres, best_local = run_experiment(exp, X_train, y_train.values,
                                       X_test, y_test.values,
                                       feature_names,
                                       sfs_k=10,
                                       use_smote=True)

    display(dfres)

    if best_local["roc_auc"] > overall_best["roc_auc"]:
        overall_best = {
            "exp": exp,
            "roc_auc": best_local["roc_auc"],
            "model": best_local["model"],
            "estimator": best_local["estimator"],
            "features": best_local["features"]
        }

overall_best



===== Running FULL =====


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,model,train_time,features_used
0,0.845167,0.90029,0.906856,0.903561,0.85086,MLP,39.102,82
1,0.699333,0.871496,0.732028,0.795696,0.693323,KNN,0.004,82
2,0.836667,0.91091,0.882059,0.896252,0.860318,SVM,145.013,82
3,0.814833,0.924689,0.836633,0.87846,0.881503,Logistic,0.306,82
4,0.896667,0.895065,0.986456,0.938541,0.874093,RF,8.727,82



===== Running CORR =====


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,model,train_time,features_used
0,0.836,0.903362,0.890185,0.896725,0.853571,MLP,88.643,14
1,0.772833,0.91199,0.792457,0.848032,0.806929,KNN,0.028,14
2,0.853167,0.91364,0.901646,0.907604,0.883367,SVM,61.909,14
3,0.814,0.92519,0.834966,0.877766,0.883368,Logistic,0.032,14
4,0.884667,0.89897,0.964159,0.930424,0.865634,RF,3.919,14



===== Running SFS =====


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,model,train_time,features_used
0,0.793833,0.926077,0.806626,0.862234,0.882369,MLP,34.203,10
1,0.774333,0.91516,0.791207,0.848681,0.813476,KNN,0.018,10
2,0.8465,0.920607,0.884351,0.902115,0.887019,SVM,34.3,10
3,0.812833,0.926846,0.831632,0.876661,0.883441,Logistic,0.064,10
4,0.883,0.902377,0.957283,0.929019,0.868142,RF,2.721,10


{'exp': 'sfs',
 'roc_auc': 0.887018510482773,
 'model': 'SVM',
 'estimator': SVC(probability=True),
 'features': ['debt_to_income_ratio',
  'credit_score',
  'interest_rate',
  'employment_status_Retired',
  'employment_status_Student',
  'employment_status_Unemployed',
  'grade_subgrade_A5',
  'grade_subgrade_C4',
  'grade_subgrade_F4',
  'public_records_0']}