In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- Preprocessing & Modeling ---
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
#   from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# --- Models ---
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from xgboost import XGBClassifier

# --- Evaluation ---
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [4]:
import pandas as pd
df=pd.read_csv('../Loan_default.csv')

In [5]:
print("Dataset shape:", df.shape)
print("\nDataset info:")
df.info()
print("\nFirst 5 rows:")
df.head()

Dataset shape: (148670, 34)

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148670 entries, 0 to 148669
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   ID                         148670 non-null  int64  
 1   year                       148670 non-null  int64  
 2   loan_limit                 145326 non-null  object 
 3   Gender                     148670 non-null  object 
 4   approv_in_adv              147762 non-null  object 
 5   loan_type                  148670 non-null  object 
 6   loan_purpose               148536 non-null  object 
 7   Credit_Worthiness          148670 non-null  object 
 8   open_credit                148670 non-null  object 
 9   business_or_commercial     148670 non-null  object 
 10  loan_amount                148670 non-null  int64  
 11  rate_of_interest           112231 non-null  float64
 12  Interest_rate_spread       112031 non-null 

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


In [6]:
# List of columns to drop
cols_to_drop = [
    "ID",
    "year",
    "loan_limit",
    "open_credit",
    "business_or_commercial",
    "co-applicant_credit_type",
    "construction_type",
    "total_units",
    "Region",
    "Security_Type"
]

# Drop columns safely (ignore if some are missing)
df = df.drop(columns=cols_to_drop, errors='ignore')

print("Remaining columns:", df.columns.tolist())


Remaining columns: ['Gender', 'approv_in_adv', 'loan_type', 'loan_purpose', 'Credit_Worthiness', 'loan_amount', 'rate_of_interest', 'Interest_rate_spread', 'Upfront_charges', 'term', 'Neg_ammortization', 'interest_only', 'lump_sum_payment', 'property_value', 'occupancy_type', 'Secured_by', 'income', 'credit_type', 'Credit_Score', 'age', 'submission_of_application', 'LTV', 'Status', 'dtir1']


In [7]:
print(f"Original unique values in 'Status' column: {df['Status'].unique()}")

Original unique values in 'Status' column: [1 0]


In [8]:
df['Status'] = pd.to_numeric(df['Status'], errors='coerce')

In [9]:
valid_statuses = [0, 1]
df = df[df['Status'].isin(valid_statuses)]
print(f"Shape after keeping only rows with Status of 0 or 1: {df.shape}")

Shape after keeping only rows with Status of 0 or 1: (148670, 24)


In [10]:
df['Status'] = df['Status'].astype(int)

In [11]:
#daatset splitting
X = df.drop('Status', axis=1)
y = df['Status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

print("\nData successfully split.")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print("-" * 50)



Data successfully split.
X_train shape: (111502, 23)
X_test shape: (37168, 23)
--------------------------------------------------


In [12]:
numerical_cols = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
)

print("\nPreprocessing pipeline built successfully.")
print("-" * 50)



Preprocessing pipeline built successfully.
--------------------------------------------------


In [11]:
scale_pos_weight_value = y_train.value_counts()[0] / y_train.value_counts()[1]

models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=10),
    "Gaussian Naive Bayes": GaussianNB(),
    "Bernoulli Naive Bayes": BernoulliNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

for name, model in models.items():
    print(f"\n--- Training and Evaluating: {name} ---")

    # CORRECT: This line is now aligned with the print statement above it.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])

    # All subsequent lines in the loop must also be aligned
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f} (Note: Can be misleading!)")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report (for test data):")
    print(classification_report(y_test, y_pred))
    print("-" * 50)


--- Training and Evaluating: Logistic Regression ---
Accuracy: 0.8411 (Note: Can be misleading!)

Confusion Matrix:
[[24849  3159]
 [ 2748  6412]]

Classification Report (for test data):
              precision    recall  f1-score   support

           0       0.90      0.89      0.89     28008
           1       0.67      0.70      0.68      9160

    accuracy                           0.84     37168
   macro avg       0.79      0.79      0.79     37168
weighted avg       0.84      0.84      0.84     37168

--------------------------------------------------

--- Training and Evaluating: Random Forest ---
Accuracy: 1.0000 (Note: Can be misleading!)

Confusion Matrix:
[[28008     0]
 [    0  9160]]

Classification Report (for test data):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28008
           1       1.00      1.00      1.00      9160

    accuracy                           1.00     37168
   macro avg       1.00      1.00 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.9999 (Note: Can be misleading!)

Confusion Matrix:
[[28003     5]
 [    0  9160]]

Classification Report (for test data):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28008
           1       1.00      1.00      1.00      9160

    accuracy                           1.00     37168
   macro avg       1.00      1.00      1.00     37168
weighted avg       1.00      1.00      1.00     37168

--------------------------------------------------

--- Training and Evaluating: K-Nearest Neighbors ---
Accuracy: 0.9054 (Note: Can be misleading!)

Confusion Matrix:
[[24966  3042]
 [  475  8685]]

Classification Report (for test data):
              precision    recall  f1-score   support

           0       0.98      0.89      0.93     28008
           1       0.74      0.95      0.83      9160

    accuracy                           0.91     37168
   macro avg       0.86      0.92      0.88     37168
weighted avg       0.92     

In [12]:
# This list will store the results for our final summary table.
results_list = []

# --- Step 2: Train Each Model and Evaluate on the Test Set ---
print("--- Starting Final Evaluation on the Unseen Test Data ---")
for name, model in models.items():
    print(f"\n--- Training and Evaluating: {name} ---")

    # Create a pipeline with the preprocessor and the current model
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

    # Train the model on the full training data
    model_pipeline.fit(X_train, y_train)

    # Make predictions on the unseen test data
    y_pred = model_pipeline.predict(X_test)

    # --- Generate and Print Full Reports ---
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"\nAccuracy: {accuracy:.4f} ")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report (for test data):")
    print(classification_report(y_test, y_pred))
    print("-" * 50)

    # Store results in the list
    results_list.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1
    })

print("\n--- Final Evaluation Complete ---")

# You can now convert results_list to a DataFrame for easy viewing
results_df = pd.DataFrame(results_list)
display(results_df.sort_values(by='F1-score', ascending=False))

--- Starting Final Evaluation on the Unseen Test Data ---

--- Training and Evaluating: Logistic Regression ---

Accuracy: 0.8719 
Precision: 0.9386
Recall: 0.5139
F1-score: 0.6641

Confusion Matrix:
[[27700   308]
 [ 4453  4707]]

Classification Report (for test data):
              precision    recall  f1-score   support

           0       0.86      0.99      0.92     28008
           1       0.94      0.51      0.66      9160

    accuracy                           0.87     37168
   macro avg       0.90      0.75      0.79     37168
weighted avg       0.88      0.87      0.86     37168

--------------------------------------------------

--- Training and Evaluating: Random Forest ---

Accuracy: 1.0000 
Precision: 0.9999
Recall: 1.0000
F1-score: 0.9999

Confusion Matrix:
[[28007     1]
 [    0  9160]]

Classification Report (for test data):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28008
           1       1.00      1.00  

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Accuracy: 0.9999 
Precision: 0.9998
Recall: 1.0000
F1-score: 0.9999

Confusion Matrix:
[[28006     2]
 [    0  9160]]

Classification Report (for test data):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28008
           1       1.00      1.00      1.00      9160

    accuracy                           1.00     37168
   macro avg       1.00      1.00      1.00     37168
weighted avg       1.00      1.00      1.00     37168

--------------------------------------------------

--- Training and Evaluating: K-Nearest Neighbors ---

Accuracy: 0.9346 
Precision: 0.9283
Recall: 0.7963
F1-score: 0.8573

Confusion Matrix:
[[27445   563]
 [ 1866  7294]]

Classification Report (for test data):
              precision    recall  f1-score   support

           0       0.94      0.98      0.96     28008
           1       0.93      0.80      0.86      9160

    accuracy                           0.93     37168
   macro avg       0.93      0.8

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score
6,Decision Tree,1.0,1.0,1.0,1.0
1,Random Forest,0.999973,0.999891,1.0,0.999945
2,XGBoost,0.999946,0.999782,1.0,0.999891
3,K-Nearest Neighbors,0.934648,0.928344,0.796288,0.85726
5,Bernoulli Naive Bayes,0.910999,0.864838,0.757205,0.807451
4,Gaussian Naive Bayes,0.894963,1.0,0.573799,0.72919
0,Logistic Regression,0.871906,0.938584,0.513865,0.664127


In [13]:
# --- ONE-TIME SCRIPT TO TRAIN AND SAVE PIPELINES ---
import joblib
import os

# The directory where we'll save the models (relative to your notebook location)
save_path = '../streamlit_app/models/'

# Ensure the directory exists
os.makedirs(save_path, exist_ok=True)

print("--- Starting Training and Serialization ---")

# Use the same models dictionary from above
for name, model in models.items():
    print(f"Training pipeline for: {name}")

    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])

    # Train the pipeline on the ENTIRE training dataset
    model_pipeline.fit(X_train, y_train)

    # Sanitize the filename (replace spaces, convert to lowercase)
    filename = f"{name.lower().replace(' ', '_')}_pipeline.joblib"

    # Save the pipeline to the specified directory
    joblib.dump(model_pipeline, save_path + filename)
    print(f"  -> Saved pipeline to {save_path}{filename}\n")

print("--- All model pipelines have been trained and saved successfully! ---")
print(f"Models saved in: {os.path.abspath(save_path)}")

# List the saved files to verify
saved_files = [f for f in os.listdir(save_path) if f.endswith('.joblib')]
print(f"\nSaved model files ({len(saved_files)}):")
for file in saved_files:
    print(f"  - {file}")

--- Starting Training and Serialization ---
Training pipeline for: Logistic Regression
  -> Saved pipeline to ../streamlit_app/models/logistic_regression_pipeline.joblib

Training pipeline for: Random Forest
  -> Saved pipeline to ../streamlit_app/models/random_forest_pipeline.joblib

Training pipeline for: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  -> Saved pipeline to ../streamlit_app/models/xgboost_pipeline.joblib

Training pipeline for: K-Nearest Neighbors
  -> Saved pipeline to ../streamlit_app/models/k-nearest_neighbors_pipeline.joblib

Training pipeline for: Gaussian Naive Bayes
  -> Saved pipeline to ../streamlit_app/models/gaussian_naive_bayes_pipeline.joblib

Training pipeline for: Bernoulli Naive Bayes
  -> Saved pipeline to ../streamlit_app/models/bernoulli_naive_bayes_pipeline.joblib

Training pipeline for: Decision Tree
  -> Saved pipeline to ../streamlit_app/models/decision_tree_pipeline.joblib

--- All model pipelines have been trained and saved successfully! ---
Models saved in: c:\Users\pavan\CreditPathAI\CreditPathAI\streamlit_app\models

Saved model files (7):
  - bernoulli_naive_bayes_pipeline.joblib
  - decision_tree_pipeline.joblib
  - gaussian_naive_bayes_pipeline.joblib
  - k-nearest_neighbors_pipeline.joblib
  - logistic_regression_pipeline.joblib
  - random_forest_pipeline.joblib
  - xgboost_pipeline.j