# Credit Risk Model Training

This notebook trains a machine learning model to predict credit default risk.

## Steps:
1. Load Data
2. Preprocessing (Imputation, One-Hot Encoding, Scaling)
3. Model Training (Logistic Regression, Random Forest, XGBoost, CatBoost)
4. Evaluation

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [15]:

# Load the dataset CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("processed_dataset.csv")

# Confirm the load and report (rows, columns) in one call.
print("Loaded processed_dataset.csv", f"Shape: {df.shape}", sep="\n")

# Rich preview of the first five rows as the cell output.
df.head(5)

Loaded processed_dataset.csv
Shape: (89999, 56)


Unnamed: 0,id,application_id,application_hour,application_day_of_week,account_open_year,preferred_contact,referral_code,account_status_code,random_noise_1,num_login_sessions,...,annual_income,employment_length,employment_type,education,marital_status,disposable_income,loan_to_income_ratio,utilization_ratio,oldest_credit_line_years,income_per_age
0,10000,620515,5,6,2013,Email,REF0000,ACT-2,1.137,13,...,61800.0,2.2,full-time,Graduate,Married,3819.23,0.286408,0.841,22.8,1507.317073
1,10001,624978,4,2,2015,Phone,REF0000,ACT-3,-0.165,6,...,28600.0,7.0,full-time,High School,Married,977.26,3.986014,0.971,3.5,752.631579
2,10002,564658,10,3,2020,Phone,REF0000,ACT-3,0.527,1,...,20700.0,0.8,full-time,Bachelor,Single,1203.12,0.449275,0.539,0.0,1150.0
3,10003,621493,7,5,2010,Email,REF0000,A01,-0.71,4,...,31400.0,4.8,full-time,Bachelor,Single,2093.44,0.27707,0.147,9.0,1162.962963
4,10004,637785,1,2,2020,Email,REF0000,ACT-3,-0.603,6,...,24600.0,5.2,full-time,High School,Single,1466.42,0.292683,0.488,8.0,946.153846


In [16]:
# Name of the label column; later cells read this constant.
TARGET = 'default'

# Report the class balance, or list the available columns if the label is absent.
if TARGET not in df.columns:
    print(f"Warning: Target '{TARGET}' not found in DataFrame columns: {df.columns.tolist()}")
else:
    class_share = df[TARGET].value_counts(normalize=True)
    print(class_share)

default
0    0.948955
1    0.051045
Name: proportion, dtype: float64


In [31]:
# Separate predictors from the label. The label and the identifier/noise
# columns (judging by their names) are excluded from the feature matrix;
# errors='ignore' tolerates any of them being absent from the file.
X = df.drop(columns=[TARGET, 'id', 'application_id', 'random_noise_1'], errors='ignore')
y = df[TARGET]

# 'number' matches every numeric width (int32, float32, uint8, ...), not only
# int64/float64, so a down-cast or re-typed column still reaches the scaler.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# A column in neither list (e.g. bool or datetime) is not passed to any
# transformer and would silently vanish from the model input, so surface it.
# print() is used rather than warnings.warn because this notebook silences
# warnings globally.
assigned_features = set(numeric_features) | set(categorical_features)
unassigned_features = [col for col in X.columns if col not in assigned_features]
if unassigned_features:
    print(f"Warning: columns not assigned to any feature group: {unassigned_features}")

print(f"Numeric Features: {len(numeric_features)}")
print(f"Categorical Features: {len(categorical_features)}")

Numeric Features: 44
Categorical Features: 8


In [34]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the literal 'missing', then
# one-hot encode into a dense array; categories unseen during fit are ignored.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column list to its branch. Columns in neither list are not
# forwarded (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the resulting (rows, columns) of each split.
print(f"Train Shape: {X_train.shape}", f"Test Shape: {X_test.shape}", sep="\n")

Train Shape: (71999, 52)
Test Shape: (18000, 52)


In [22]:
# Initialize Models
# Candidate classifiers keyed by display name; the key is used in the printed
# report and as the row label in `results`.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    # `use_label_encoder` is no longer passed: XGBoost deprecated it and later
    # removed it, so supplying it only triggers an "unused parameter" warning.
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42, n_jobs=-1),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42)
}

# Maps model name -> {"Accuracy", "ROC-AUC", "F1-Score"} measured on the test split.
results = {}

for name, model in models.items():
    print(f"Training {name}...")

    # Create Full Pipeline
    # Preprocessing is fitted inside the pipeline, i.e. on the training split only.
    # The same `preprocessor` object is handed to (and refit by) every pipeline.
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])

    # Train
    clf.fit(X_train, y_train)

    # Predict hard class labels
    y_pred = clf.predict(X_test)

    # Predict positive-class probability if the estimator offers it.
    # A capability check replaces the former bare `except:`, which also hid
    # genuine failures (bad input, interrupts) behind a fake score.
    if hasattr(clf, "predict_proba"):
        y_prob = clf.predict_proba(X_test)[:, 1]
    else:
        y_prob = None

    # Evaluate
    acc = accuracy_score(y_test, y_pred)
    # NaN (not 0) marks "ROC-AUC unavailable" so it cannot be mistaken for a real score.
    roc = roc_auc_score(y_test, y_prob) if y_prob is not None else float("nan")
    # Weighted F1 is averaged by class support, so the majority class dominates it;
    # read the per-class rows of the classification report for the minority class.
    f1 = f1_score(y_test, y_pred, average='weighted')

    results[name] = {"Accuracy": acc, "ROC-AUC": roc, "F1-Score": f1}

    print(f"  Accuracy: {acc:.4f}")
    print(f"  ROC-AUC:  {roc:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print(classification_report(y_test, y_pred))
    print("-"*30)


Training Logistic Regression...
  Accuracy: 0.9498
  ROC-AUC:  0.8059
  F1-Score: 0.9286
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     17081
           1       0.62      0.04      0.08       919

    accuracy                           0.95     18000
   macro avg       0.79      0.52      0.53     18000
weighted avg       0.93      0.95      0.93     18000

------------------------------
Training Random Forest...
  Accuracy: 0.9494
  ROC-AUC:  0.7752
  F1-Score: 0.9256
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     17081
           1       0.80      0.01      0.03       919

    accuracy                           0.95     18000
   macro avg       0.87      0.51      0.50     18000
weighted avg       0.94      0.95      0.93     18000

------------------------------
Training XGBoost...
  Accuracy: 0.9476
  ROC-AUC:  0.7742
  F1-Score: 0.9313
              precision    recall  

In [35]:
# Compare Models
# One row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()

# Display the table with the highest ROC-AUC first.
results_df.sort_values("ROC-AUC", ascending=False)

Unnamed: 0,Accuracy,ROC-AUC,F1-Score
CatBoost,0.949722,0.807102,0.929121
Logistic Regression,0.949833,0.805877,0.928629
Random Forest,0.949444,0.775219,0.92563
XGBoost,0.947556,0.77416,0.931285


In [36]:
# Side-by-side view of true labels and predictions on the test split.
# NOTE: `y_pred` is whatever the training loop assigned on its final iteration,
# i.e. the last entry of `models` (CatBoost with the current ordering), which is
# not necessarily the best-scoring model.
prediction_gap = y_test - y_pred  # 0 = correct, 1 = missed default, -1 = false alarm
pred_df = pd.DataFrame({
    'Actual Value': y_test,
    'Predicted Value': y_pred,
    'Difference': prediction_gap,
})
pred_df.head()


Unnamed: 0,Actual Value,Predicted Value,Difference
29219,0,0,0
14222,0,0,0
84117,0,0,0
74252,0,0,0
60072,0,0,0
