Step 1: Install dependencies

In [None]:
!pip install pandas numpy scikit-learn xgboost --upgrade shap matplotlib joblib

Step 2: Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap
import joblib
import os
import re

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, brier_score_loss, precision_recall_fscore_support
import xgboost as xgb

Step 3: Load and clean dataset

In [None]:
def coerce_numeric_like(series, min_numeric_share=0.5):
    """Convert a text column that really holds numbers into a numeric column.

    Currency symbols ('$'), thousands separators (',') and whitespace are
    stripped before parsing. If any value contains '%', the column is first
    tried as a percentage and, on success, rescaled to a proportion
    (value / 100).

    Parameters
    ----------
    series : pandas.Series
        Column to inspect (expected to hold text/object values).
    min_numeric_share : float, default 0.5
        The conversion is accepted only when strictly more than this share
        of all rows (missing rows included) parses as a number.

    Returns
    -------
    pandas.Series
        The numeric column when the conversion is accepted; otherwise the
        ORIGINAL column, untouched. Returning the original matters: it keeps
        real missing values (instead of the literal string 'nan' produced by
        ``astype(str)``) and keeps whitespace inside genuine text categories.
    """
    # All cleaning happens on a scratch copy; `series` itself is never modified.
    text = series.astype(str).str.replace(r'[\$,\s]', '', regex=True)

    # A '%' anywhere marks the column as a candidate percentage feature.
    if text.str.contains('%', regex=False).any():
        as_percent = pd.to_numeric(
            text.str.replace('%', '', regex=False),
            errors='coerce'
        )
        if as_percent.notna().mean() > min_numeric_share:
            return as_percent / 100

    # General numeric conversion (e.g. '$1,234' -> 1234).
    as_number = pd.to_numeric(text, errors='coerce')
    if as_number.notna().mean() > min_numeric_share:
        return as_number

    # Not numeric-like: hand back the column exactly as it was loaded.
    return series


df = pd.read_csv("a_Dataset_CreditScoring.csv")
df = df.drop('ID', axis=1)

for col in df.columns:
    # Only text-like columns need cleaning; this covers both the classic
    # `object` dtype and pandas' dedicated string dtypes.
    if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
        df[col] = coerce_numeric_like(df[col])

pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,TARGET,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,TLCnt03,TLCnt12,TLCnt24,TLCnt,TLSum,TLMaxSum,TLSatCnt,TLDel60Cnt,TLBadCnt24,TL75UtilCnt,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
0,1,3,3,0,4,0.0,5,117,27,0,0,0,5.0,19410.0,21147.0,16.0,2,1,3.0,3.0,0.92,0.21,2,3,7,0.21,4,4,0.0
1,1,15,9,0,3,1.0,3,14,14,0,0,1,1.0,16776.0,20971.0,1.0,0,0,1.0,1.0,0.8,0.0,0,0,0,1.0,12,0,1.0
2,1,0,0,0,1,5.0,1,354,7,0,2,10,19.0,16903.0,47593.0,21.0,1,1,4.0,5.0,0.36,0.65,0,1,1,0.73,1,1,0.53
3,1,8,5,0,6,1.0,10,16,4,0,2,4,3.0,18339.0,20093.0,1.0,1,0,2.0,3.0,0.91,0.25,1,1,1,0.75,7,1,1.33
4,1,3,1,0,9,0.0,8,130,52,0,0,0,1.0,2327.0,1860.0,3.0,4,1,1.0,1.0,1.25,0.0,0,1,4,0.14,3,1,0.0


Step 4: Split data into train/validation/test

In [None]:
# Split configuration. TEST_SIZE is the share of rows held out from training;
# that hold-out is then cut in half below to form the validation and test sets.
TARGET_COL = "TARGET"
RANDOM_STATE = 42
TEST_SIZE = 0.2

# Separate the label from the feature matrix.
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# First cut: training rows vs. a temporary hold-out, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Second cut: halve the hold-out into validation and test, again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=RANDOM_STATE,
    stratify=y_temp,
)

# Display (rows, columns) for the train / validation / test feature matrices.
tuple(part.shape for part in (X_train, X_val, X_test))

((2400, 28), (300, 28), (300, 28))

Step 5: Build preprocessing pipeline

In [None]:
# Column groups are derived from the training split only.
# "number" selects every numeric width (int32, float32, int64, float64, ...),
# and "category" is accepted next to "object" for categorical data. Columns
# that land in neither group are dropped by the ColumnTransformer below
# (its default is remainder="drop"), so the selection is kept deliberately broad.
numeric_features = X_train.select_dtypes(include="number").columns
categorical_features = X_train.select_dtypes(include=["object", "category"]).columns

# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# To show how many columns are numeric and categorical features
len(numeric_features), len(categorical_features)

(28, 0)

Step 6: Logistic Regression (baseline)

In [None]:
# Baseline model: the shared preprocessing step followed by an
# L2-penalised logistic regression.
baseline_classifier = LogisticRegression(penalty='l2', max_iter=1000)
log_reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", baseline_classifier),
    ]
)
log_reg.fit(X_train, y_train)

# Keep only the second probability column (presumably the TARGET == 1 class)
# and measure ranking quality on the validation split.
val_probabilities = log_reg.predict_proba(X_val)
y_val_pred = val_probabilities[:, 1]
auc = roc_auc_score(y_val, y_val_pred)
print("Logistic Regression AUC:", auc)


Logistic Regression AUC: 0.7988000000000001


Step 7: XGBoost model (strong model)

In [None]:
# Gradient-boosted trees behind the same preprocessing step as the baseline.
xgb_classifier = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    random_state=RANDOM_STATE,
)
xgb_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("xgb", xgb_classifier),
    ]
)

# Simple parameter tuning: a 2 x 2 x 2 grid; the "xgb__" prefix routes each
# setting to the pipeline step named "xgb".
param_grid = dict(
    xgb__max_depth=[3, 5],
    xgb__n_estimators=[100, 200],
    xgb__learning_rate=[0.05, 0.1],
)

# Stratified, shuffled 3-fold cross-validation scored by ROC AUC.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=cv,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# The winning configuration, as exposed by the search object.
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

# Validation-set check of the tuned model.
val_probabilities = best_model.predict_proba(X_val)
y_val_pred = val_probabilities[:, 1]
auc = roc_auc_score(y_val, y_val_pred)
print("XGBoost AUC:", auc)


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best parameters: {'xgb__learning_rate': 0.05, 'xgb__max_depth': 3, 'xgb__n_estimators': 100}
XGBoost AUC: 0.79928


Step 8: Model evaluation (AUC, KS, Gini)

In [None]:
def gini(auc):
    """Return the Gini coefficient for a given AUC, i.e. ``2 * auc - 1``."""
    doubled_auc = 2 * auc
    return doubled_auc - 1

def ks_stat(y_true, y_score):
    """Return the KS statistic: the largest gap between TPR and FPR.

    The rates come from ``roc_curve(y_true, y_score)``; the result is the
    maximum of ``tpr - fpr`` over all thresholds on that curve.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_score)
    separation = true_pos_rate - false_pos_rate
    return max(separation)

# Score the validation split with the tuned XGBoost pipeline explicitly.
# `y_val_pred` is assigned by both the logistic-regression cell and the
# XGBoost cell, so reading it here would make these metrics depend on which
# of those cells happened to run last.
xgb_val_scores = best_model.predict_proba(X_val)[:, 1]

val_auc = roc_auc_score(y_val, xgb_val_scores)
val_gini = gini(val_auc)
val_ks = ks_stat(y_val, xgb_val_scores)

print(f"AUC: {val_auc:.4f}")
print(f"Gini: {val_gini:.4f}")
print(f"KS: {val_ks:.4f}")


AUC: 0.7993
Gini: 0.5986
KS: 0.5120


Step 9: Explainability with SHAP

In [None]:
# Take small sample for speed. The size is capped at the number of validation
# rows because DataFrame.sample raises when asked for more rows than exist
# (sampling is without replacement by default).
SHAP_SAMPLE_SIZE = 200
sample_X = X_val.sample(min(SHAP_SAMPLE_SIZE, len(X_val)), random_state=RANDOM_STATE)

fitted_preprocessor = best_model.named_steps["preprocessor"]
sample_X_trans = fitted_preprocessor.transform(sample_X)
model = best_model.named_steps["xgb"]
booster = model.get_booster()

explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(sample_X_trans)

# Feature names (after preprocessing).
# When there are no categorical columns, the "cat" pipeline is never fitted,
# and asking its encoder for feature names would raise NotFittedError, so the
# encoder is only consulted when it actually received columns.
if len(categorical_features) > 0:
    ohe = fitted_preprocessor.named_transformers_["cat"].named_steps["onehot"]
    cat_feature_names = list(ohe.get_feature_names_out(categorical_features))
else:
    cat_feature_names = []
feature_names = list(numeric_features) + cat_feature_names

shap.summary_plot(shap_values, sample_X_trans, feature_names=feature_names)


Step 10: Save and load model

In [None]:
# Single source of truth for where the fitted pipeline is stored; the target
# directory and the confirmation message are both derived from it.
MODEL_PATH = "models/credit_risk_xgb.pkl"

os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(best_model, MODEL_PATH)
print(f"✅ Model saved to {MODEL_PATH}")

# To load later (only load files you trust: joblib files are pickle-based and
# loading one can execute arbitrary code):
# model = joblib.load(MODEL_PATH)


✅ Model saved to models/credit_risk_xgb.pkl


Step 11: Predict for new applicants

In [None]:
# Stand-in for "new" applicants: a reproducible random draw from the test split.
N_NEW_APPLICANTS = 5
new_applicants = X_test.sample(N_NEW_APPLICANTS, random_state=RANDOM_STATE)

# Second probability column of the tuned pipeline is used as the risk score.
new_scores = best_model.predict_proba(new_applicants)[:, 1]

# IDs are generated from the actual number of sampled rows so the two columns
# always have matching lengths, whatever N_NEW_APPLICANTS is set to.
pd.DataFrame({
    "Applicant_ID": range(1, len(new_applicants) + 1),
    "Risk_Score": new_scores,
})


Unnamed: 0,Applicant_ID,Risk_Score
0,1,0.199965
1,2,0.169328
2,3,0.110342
3,4,0.291971
4,5,0.482185
