In [None]:
# ==========================================
# PHASE 1: LOAD DATA (Manual Upload Method)
# ==========================================
import pandas as pd
import os

filename = 'loan_dataset_20000.csv'

# Check if you dragged the file over correctly
if os.path.exists(filename):
    print(f"Success! Found {filename}...")
    df = pd.read_csv(filename)

    print("\n--- COLUMN NAMES ---")
    print(df.columns.tolist())

    print("\n--- DATA PREVIEW ---")
    print(df.head())
else:
    print(f"ERROR: Could not find '{filename}'.")
    print("Please open the Folder icon (left sidebar) and drag-and-drop your CSV file there first.")

Success! Found loan_dataset_20000.csv...

--- COLUMN NAMES ---
['age', 'gender', 'marital_status', 'education_level', 'annual_income', 'monthly_income', 'employment_status', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'loan_purpose', 'interest_rate', 'loan_term', 'installment', 'grade_subgrade', 'num_of_open_accounts', 'total_credit_limit', 'current_balance', 'delinquency_history', 'public_records', 'num_of_delinquencies', 'loan_paid_back']

--- DATA PREVIEW ---
   age  gender marital_status education_level  annual_income  monthly_income  \
0   59    Male        Married        Master's       24240.19         2020.02   
1   72  Female        Married      Bachelor's       20172.98         1681.08   
2   49  Female         Single     High School       26181.80         2181.82   
3   35  Female         Single     High School       11873.84          989.49   
4   63   Other         Single           Other       25326.44         2110.54   

  employment_status  debt_to_income_ratio

In [None]:
# ==========================================
# PHASE 2: DATA PREPROCESSING
# ==========================================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# 1. Loads the Data
# df = pd.read_csv('loan_dataset_20000.csv')

# Task A: Classification Target (e.g., 'loan_status', 'default')
target_classification = 'loan_paid_back'

# Task B: Regression Target (e.g., 'loan_amount', 'int_rate')
target_regression = 'loan_amount'

# Numeric Features (e.g., 'annual_inc', 'dti', 'open_acc')
# Do NOT include the target columns here
num_features = ['age', 'annual_income', 'monthly_income', 'debt_to_income_ratio', 'credit_score', 'interest_rate', 'loan_term', 'installment', 'num_of_open_accounts', 'total_credit_limit', 'current_balance', 'delinquency_history', 'public_records', 'num_of_delinquencies']

# Categorical Features (e.g., 'grade', 'home_ownership')
cat_features = ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade_subgrade']
# -------------------------------------------------------------------------


# 2. Define Preprocessing Pipelines
# Numeric Pipeline: Fills missing values with median, then scales the data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical Pipeline: Fills missing values with most frequent, then One-Hot Encodes
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combines the into a single Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ],
    remainder='drop'
)


# 3. Data Splitting for Classification Task
print("--- Splitting Data for Classification ---")
# Drop both targets from features to avoid data leakage
X_class = df.drop(columns=[target_classification, target_regression])
y_class = df[target_classification]

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42
)
print(f"Classification Train Shape: {X_train_c.shape}")
print(f"Classification Test Shape:  {X_test_c.shape}")


# 4. Data Splitting for Regression Task
print("\n--- Splitting Data for Regression ---")
# Drop both targets from features
X_reg = df.drop(columns=[target_classification, target_regression])
y_reg = df[target_regression]

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
print(f"Regression Train Shape: {X_train_r.shape}")
print(f"Regression Test Shape:  {X_test_r.shape}")

print("\nPhase 2 Complete. Data is cleaned, scaled, and split.")


--- Splitting Data for Classification ---
Classification Train Shape: (16000, 20)
Classification Test Shape:  (4000, 20)

--- Splitting Data for Regression ---
Regression Train Shape: (16000, 20)
Regression Test Shape:  (4000, 20)

Phase 2 Complete. Data is cleaned, scaled, and split.


In [None]:
# ==========================================
# PHASE 3: CLASSICAL ML MODELS
# ==========================================

from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# --- PART A: CLASSIFICATION MODELS (Predicting Risk) ---
print("=== PHASE 3A: CLASSIFICATION RESULTS ===")

# 1. Defines the pipeline with Logistic Regression
clf_pipeline_log = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42, max_iter=1000))
])

# Train and Predict
clf_pipeline_log.fit(X_train_c, y_train_c)
y_pred_log = clf_pipeline_log.predict(X_test_c)

# Evaluate Logistic Regression
print("\n--- Model 1: Logistic Regression ---")
print(f"Accuracy: {accuracy_score(y_test_c, y_pred_log):.4f}")
print(f"F1 Score: {f1_score(y_test_c, y_pred_log, average='weighted'):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test_c, y_pred_log))


# 2. Define Pipeline with Random Forest
clf_pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Trains and Predicts
clf_pipeline_rf.fit(X_train_c, y_train_c)
y_pred_rf = clf_pipeline_rf.predict(X_test_c)

# Evaluate Random Forest
print("\n--- Model 2: Random Forest Classifier ---")
print(f"Accuracy: {accuracy_score(y_test_c, y_pred_rf):.4f}")
print(f"F1 Score: {f1_score(y_test_c, y_pred_rf, average='weighted'):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test_c, y_pred_rf))


# --- PART B: REGRESSION MODELS (Predicting Financial Value) ---
print("\n=== PHASE 3B: REGRESSION RESULTS ===")

# 3. Defines the pipeline with Ridge Regression (Linear Model)
reg_pipeline_ridge = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge(random_state=42))
])

# Trains and Predicts
reg_pipeline_ridge.fit(X_train_r, y_train_r)
y_pred_ridge = reg_pipeline_ridge.predict(X_test_r)

# Evaluate Ridge Regression
print("\n--- Model 3: Ridge Regression ---")
print(f"MAE: {mean_absolute_error(y_test_r, y_pred_ridge):.2f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test_r, y_pred_ridge)):.2f}")
print(f"R² Score: {r2_score(y_test_r, y_pred_ridge):.4f}")


# 4. Define Pipeline with Gradient Boosting
reg_pipeline_gb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(n_estimators=100, random_state=42))
])

# Trains and Predicts
reg_pipeline_gb.fit(X_train_r, y_train_r)
y_pred_gb = reg_pipeline_gb.predict(X_test_r)

# Evaluate Gradient Boosting
print("\n--- Model 4: Gradient Boosting Regressor ---")
print(f"MAE: {mean_absolute_error(y_test_r, y_pred_gb):.2f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test_r, y_pred_gb)):.2f}")
print(f"R² Score: {r2_score(y_test_r, y_pred_gb):.4f}")

=== PHASE 3A: CLASSIFICATION RESULTS ===

--- Model 1: Logistic Regression ---
Accuracy: 0.8808
F1 Score: 0.8727
Confusion Matrix:
 [[ 460  358]
 [ 119 3063]]

--- Model 2: Random Forest Classifier ---
Accuracy: 0.8935
F1 Score: 0.8817
Confusion Matrix:
 [[ 429  389]
 [  37 3145]]

=== PHASE 3B: REGRESSION RESULTS ===

--- Model 3: Ridge Regression ---
MAE: 985.40
RMSE: 1477.24
R² Score: 0.9713

--- Model 4: Gradient Boosting Regressor ---
MAE: 186.57
RMSE: 260.07
R² Score: 0.9991


In [None]:
# =========================================================================
# PHASE 4: CLASSIFICATION - Model B: Tuned Random Forest Classifier
#
# =========================================================================

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import joblib

print("\n--- Model B: Random Forest Classifier with Hyperparameter Tuning ---")

# 1. Define the Classification Pipeline (using the existing 'preprocessor' object)
clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# 2. Defines the parameter search space for the classifier
param_grid_c = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__max_depth': [5, 10, None],
    'classifier__min_samples_split': [2, 5]
}

# 3. Performs the Grid Search
grid_search_c = GridSearchCV(clf_pipeline, param_grid_c, cv=3, scoring='f1_weighted', n_jobs=-1)
grid_search_c.fit(X_train_c, y_train_c)

print(f"Best Parameters: {grid_search_c.best_params_}")

# 4. Evaluates which is the Best Model and Calculate the needed metrics
best_rf_model = grid_search_c.best_estimator_

# Get predictions and probabilities
y_pred_tuned = best_rf_model.predict(X_test_c)

y_proba_tuned = best_rf_model.predict_proba(X_test_c)[:, 1]

# Calculate all required Classical ML metrics
accuracy_tuned = accuracy_score(y_test_c, y_pred_tuned)
f1_tuned = f1_score(y_test_c, y_pred_tuned, average='weighted', zero_division=0)
precision_tuned = precision_score(y_test_c, y_pred_tuned, average='weighted', zero_division=0)
recall_tuned = recall_score(y_test_c, y_pred_tuned, average='weighted', zero_division=0)
# AUC-ROC for binary classification
auc_roc_tuned = roc_auc_score(y_test_c, y_proba_tuned)

print("\n--- Tuned Random Forest Metrics (Full Set) ---")
print(f"Tuned Accuracy: {accuracy_tuned:.4f}")
print(f"Tuned F1-Score: {f1_tuned:.4f}")
print(f"Tuned Precision: {precision_tuned:.4f}")
print(f"Tuned Recall: {recall_tuned:.4f}")
print(f"Tuned AUC-ROC: {auc_roc_tuned:.4f}")


# =========================================================================
# Preparation for Phase 5
# =========================================================================
joblib.dump(best_rf_model, 'best_loan_classifier_pipeline.joblib')
print("\n[SUCCESS] Saved best model pipeline as 'best_loan_classifier_pipeline.joblib' for Phase 6 Deployment.")



--- Model B: Random Forest Classifier with Hyperparameter Tuning ---
Best Parameters: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}

--- Tuned Random Forest Metrics (Full Set) ---
Tuned Accuracy: 0.8935
Tuned F1-Score: 0.8817
Tuned Precision: 0.8962
Tuned Recall: 0.8935
Tuned AUC-ROC: 0.8759

[SUCCESS] Saved best model pipeline as 'best_loan_classifier_pipeline.joblib' for Phase 6 Deployment.


In [None]:
# =========================================================================
# PHASE 5: DEEP LEARNING MODEL (Classification Task)
# =========================================================================

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import pandas as pd
import numpy as np

print("\n--- Phase 5: Deep Learning Model (Multi-Layer Perceptron) ---")

# --- 0. Setup: Re-calculate Classical Metrics for Comparison ---
# If you didn't run Phase 4, replace 'grid_search_c' with 'clf_pipeline_rf' (from Phase 3).
try:
    best_rf_model = grid_search_c.best_estimator_
    print("Using Tuned Random Forest from Phase 4 for comparison.")
except NameError:
    # Fallback if Phase 4 wasn't run
    best_rf_model = clf_pipeline_rf
    best_rf_model.fit(X_train_c, y_train_c)
    print("Using Standard Random Forest from Phase 3 for comparison.")

# Generate predictions for the Classical Model
y_pred_tuned = best_rf_model.predict(X_test_c)
y_prob_tuned = best_rf_model.predict_proba(X_test_c)[:, 1]

# Save metrics to variables
accuracy_tuned = accuracy_score(y_test_c, y_pred_tuned)
f1_tuned = f1_score(y_test_c, y_pred_tuned, average='weighted')
precision_tuned = precision_score(y_test_c, y_pred_tuned, average='weighted', zero_division=0)
recall_tuned = recall_score(y_test_c, y_pred_tuned, average='weighted')
auc_roc_tuned = roc_auc_score(y_test_c, y_prob_tuned)


# --- 1. Preprocess and Transform Data for Keras ---
# We fit the preprocessor explicitly to ensure it's ready
preprocessor.fit(X_train_c, y_train_c)

X_train_processed = preprocessor.transform(X_train_c)
X_test_processed = preprocessor.transform(X_test_c)

# Keras requires integer labels (0/1).
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_c)
y_test_encoded = label_encoder.transform(y_test_c)

input_dim = X_train_processed.shape[1] # Number of features

# --- 2. Define the Keras Model (MLP Architecture) ---
dl_model = Sequential([
    Dense(64, activation='relu', input_shape=(input_dim,)),
    Dropout(0.3), # Regularization to prevent overfitting
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# --- 3. Compile the Model ---
dl_model.compile(optimizer=Adam(learning_rate=0.001),
                 loss='binary_crossentropy',
                 metrics=['accuracy'])

# --- 4. Train the Model ---
print("\nTraining Deep Learning Model...")
history = dl_model.fit(
    X_train_processed,
    y_train_encoded,
    epochs=20,
    batch_size=32,
    validation_split=0.1,
    verbose=1
)
print("Deep Learning Model Training Complete.")

# --- 5. Evaluate the Model ---
y_pred_probs_dl = dl_model.predict(X_test_processed, verbose=0)
y_pred_dl = (y_pred_probs_dl > 0.5).astype("int32")

# Calculate DL Metrics
accuracy_dl = accuracy_score(y_test_encoded, y_pred_dl)
f1_dl = f1_score(y_test_encoded, y_pred_dl, average='weighted', zero_division=0)
precision_dl = precision_score(y_test_encoded, y_pred_dl, average='weighted', zero_division=0)
recall_dl = recall_score(y_test_encoded, y_pred_dl, average='weighted', zero_division=0)
auc_roc_dl = roc_auc_score(y_test_encoded, y_pred_probs_dl)

# --- 6. Final Comparison ---
print("\n--- Performance Comparison (Deep Learning vs. Classical ML) ---")
comparison_data = {
    'Metric': ['Accuracy', 'F1-Score', 'Precision', 'Recall', 'AUC-ROC'],
    'Random Forest': [f"{accuracy_tuned:.4f}", f"{f1_tuned:.4f}", f"{precision_tuned:.4f}", f"{recall_tuned:.4f}", f"{auc_roc_tuned:.4f}"],
    'Deep Learning': [f"{accuracy_dl:.4f}", f"{f1_dl:.4f}", f"{precision_dl:.4f}", f"{recall_dl:.4f}", f"{auc_roc_dl:.4f}"]
}
comparison_df = pd.DataFrame(comparison_data)
print(comparison_df.to_markdown(index=False))

print("\n--- Interpretability Discussion ---")
print("Classical models (Random Forest) offer clear feature importance ('white box').")
print("Deep Learning models (MLP) are 'black box' but may capture complex non-linear patterns.")


--- Phase 5: Deep Learning Model (Multi-Layer Perceptron) ---
Using Tuned Random Forest from Phase 4 for comparison.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Training Deep Learning Model...
Epoch 1/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7533 - loss: 0.5044 - val_accuracy: 0.8925 - val_loss: 0.2871
Epoch 2/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.8762 - loss: 0.3275 - val_accuracy: 0.9000 - val_loss: 0.2686
Epoch 3/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.8919 - loss: 0.2971 - val_accuracy: 0.9056 - val_loss: 0.2612
Epoch 4/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.8972 - loss: 0.2842 - val_accuracy: 0.9050 - val_loss: 0.2569
Epoch 5/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.8990 - loss: 0.2675 - val_accuracy: 0.9050 - val_loss: 0.2611
Epoch 6/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.8994 - loss: 0.2735 - val_accuracy: 0.9038 - val_loss:

In [12]:
# =========================================================================
# 7. MODEL SAVING (Deployment Prep)
# =========================================================================
import joblib
from google.colab import files

print("\n=== SAVING AND DOWNLOADING MODELS ===")

# 1. Saves the Classification Model (Random Forest)
# Using the best_estimator_ from the grid search in Phase 4
joblib.dump(grid_search_c.best_estimator_, 'classification_model.pkl')
print("Saved: classification_model.pkl")

# 2. Saves the Regression Model (Gradient Boosting)
# Using the best performing regression model from Phase 3
joblib.dump(reg_pipeline_gb, 'regression_model.pkl')
print("Saved: regression_model.pkl")

# 3. Saves the Deep Learning Model (Keras)
dl_model.save('deep_learning_model.h5')
print("Saved: deep_learning_model.h5")

# 4. Initiate the model downloads
try:
    files.download('classification_model.pkl')
    files.download('regression_model.pkl')
    files.download('deep_learning_model.h5')
    print("Downloads started! Check your browser's download bar.")
except Exception as e:
    print(f"Could not automatically download. Please check the 'Files' sidebar on the left. Error: {e}")




=== SAVING AND DOWNLOADING MODELS ===
Saved: classification_model.pkl
Saved: regression_model.pkl
Saved: deep_learning_model.h5


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloads started! Check your browser's download bar.
