In [2]:
# Data Manipulation Libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Additional Libraries
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('../data/processed/preprocessed_data.csv')

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV


X = df.drop('Default', axis=1)  # Features
y = df['Default']                # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
import os
import pandas as pd

# Define the folder path
interim_folder = r"C:\Users\DELL\Desktop\FDM Mini Project\Automobile Loan Default Prediction\weighted method\data\interim"

# Make sure the folder exists
os.makedirs(interim_folder, exist_ok=True)

# Combine X_train and y_train into a single DataFrame
train_df = pd.concat([X_train, y_train], axis=1)

# Combine X_test and y_test into a single DataFrame
test_df = pd.concat([X_test, y_test], axis=1)

# Save as CSV
train_df.to_csv(os.path.join(interim_folder, "train.csv"), index=False)
test_df.to_csv(os.path.join(interim_folder, "test.csv"), index=False)

Random Forest Classifier

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# Define the model with class weighting
rf_model = RandomForestClassifier(class_weight={0: 1, 1: 10})

# Train the model
rf_model.fit(X_train, y_train)

# Evaluate the model
train_accuracy = accuracy_score(y_train, rf_model.predict(X_train))
test_accuracy = accuracy_score(y_test, rf_model.predict(X_test))

print("Random Forest Classifier:")
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

#  6. Save the trained model
joblib.dump(rf_model, 'C:/Users/DELL/Desktop/FDM Mini Project/Automobile Loan Default Prediction/weighted method/models/rf_model_weighted.pkl')
print("\n  Model saved as 'rf_model_weighted.pkl'")

Random Forest Classifier:
Training Accuracy: 0.9999100948499333
Testing Accuracy: 0.9115972430326641
Confusion Matrix:
 [[15194     6]
 [ 1469    16]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95     15200
           1       0.73      0.01      0.02      1485

    accuracy                           0.91     16685
   macro avg       0.82      0.51      0.49     16685
weighted avg       0.90      0.91      0.87     16685


  Model saved as 'rf_model_weighted.pkl'


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Define the model
rf_model_hyper = RandomForestClassifier()

# Define hyperparameters for tuning
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 15, 20],
    'min_samples_split': [2, 5, 10]
}

# Perform GridSearchCV
grid_search_rf = GridSearchCV(rf_model_hyper, param_grid, cv=5)
grid_search_rf.fit(X_train, y_train)

# Print the best hyperparameters
best_hyperparameters = grid_search_rf.best_params_
print("Best Hyperparameters (Random Forest):", best_hyperparameters)

# Evaluate the model
train_accuracy = accuracy_score(y_train, grid_search_rf.predict(X_train))
test_accuracy = accuracy_score(y_test, grid_search_rf.predict(X_test))

print("Random Forest Classifier:")
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Make predictions on the test set using the best model found by GridSearchCV
y_pred = grid_search_rf.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Best Hyperparameters (Random Forest): {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest Classifier:
Training Accuracy: 0.9998801264665778
Testing Accuracy: 0.9115972430326641
Confusion Matrix:
 [[15194     6]
 [ 1469    16]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95     15200
           1       0.73      0.01      0.02      1485

    accuracy                           0.91     16685
   macro avg       0.82      0.51      0.49     16685
weighted avg       0.90      0.91      0.87     16685



Support Vector Classification (SVC)

In [8]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Define the model with class weighting
svm_model = SVC(class_weight={0: 1, 1: 10})

# Train the model
svm_model.fit(X_train, y_train)

# Evaluate the model
train_accuracy = accuracy_score(y_train, svm_model.predict(X_train))
test_accuracy = accuracy_score(y_test, svm_model.predict(X_test))

print("SVM Classifier:")
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

SVM Classifier:
Training Accuracy: 0.600776181128909
Testing Accuracy: 0.5922685046448907
Confusion Matrix:
 [[8983 6217]
 [ 586  899]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.59      0.73     15200
           1       0.13      0.61      0.21      1485

    accuracy                           0.59     16685
   macro avg       0.53      0.60      0.47     16685
weighted avg       0.87      0.59      0.68     16685



Naive Bayes

In [9]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Define the model (note: class_weight has no effect in GaussianNB)
nb_model = GaussianNB()

# Fit the model
nb_model.fit(X_train, y_train)

# Evaluate the model
train_accuracy = accuracy_score(y_train, nb_model.predict(X_train))
test_accuracy = accuracy_score(y_test, nb_model.predict(X_test))

print("Naive Bayes Classifier:")
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Make predictions on the test set
y_pred = nb_model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Naive Bayes Classifier:
Training Accuracy: 0.10815589553021562
Testing Accuracy: 0.10740185795624813
Confusion Matrix:
 [[  327 14873]
 [   20  1465]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.02      0.04     15200
           1       0.09      0.99      0.16      1485

    accuracy                           0.11     16685
   macro avg       0.52      0.50      0.10     16685
weighted avg       0.87      0.11      0.05     16685



In [10]:
from sklearn.naive_bayes import GaussianNB

# Define the model
nb_model_hyper = GaussianNB()

# No hyperparameters to tune for Gaussian Naive Bayes

# Fit the model
nb_model_hyper.fit(X_train, y_train)

# Evaluate the model
train_accuracy = accuracy_score(y_train, nb_model_hyper.predict(X_train))
test_accuracy = accuracy_score(y_test, nb_model_hyper.predict(X_test))

print("Naive Bayes Classifier:")
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Make predictions on the test set
y_pred = nb_model_hyper.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Naive Bayes Classifier:
Training Accuracy: 0.10815589553021562
Testing Accuracy: 0.10740185795624813
Confusion Matrix:
 [[  327 14873]
 [   20  1465]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.02      0.04     15200
           1       0.09      0.99      0.16      1485

    accuracy                           0.11     16685
   macro avg       0.52      0.50      0.10     16685
weighted avg       0.87      0.11      0.05     16685



Gradient Boosting Classifier

In [11]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Compute weights for each sample
weights = compute_sample_weight(class_weight={0: 1, 1: 10}, y=y_train)

# Create model
gb_model = GradientBoostingClassifier()

# Fit with weights
gb_model.fit(X_train, y_train, sample_weight=weights)

# Evaluate
y_pred = gb_model.predict(X_test)

train_accuracy = accuracy_score(y_train, gb_model.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Training Accuracy: 0.6361988102551808
Testing Accuracy: 0.6255918489661373
Confusion Matrix:
 [[9552 5648]
 [ 599  886]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.63      0.75     15200
           1       0.14      0.60      0.22      1485

    accuracy                           0.63     16685
   macro avg       0.54      0.61      0.49     16685
weighted avg       0.87      0.63      0.71     16685



In [12]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Define the model
gb_model_hyper = GradientBoostingClassifier()

# Define hyperparameters for tuning
param_grid = {
    'n_estimators': [10, 50, 100],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Perform GridSearchCV
grid_search_gb = GridSearchCV(gb_model_hyper, param_grid, cv=5)
grid_search_gb.fit(X_train, y_train)

# Print the best hyperparameters
best_hyperparameters = grid_search_gb.best_params_
print("Best Hyperparameters (GradientBoostingClassifier):", best_hyperparameters)

# Evaluate the model using GridSearchCV results
train_accuracy = accuracy_score(y_train, grid_search_gb.predict(X_train))
test_accuracy = accuracy_score(y_test, grid_search_gb.predict(X_test))

print("Gradient Boosting Classifier:")
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Make predictions on the test set using the best model found by GridSearchCV
y_pred = grid_search_gb.predict(X_test)

# Compute the confusion matrix and classification report
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Best Hyperparameters (GradientBoostingClassifier): {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
Gradient Boosting Classifier:
Training Accuracy: 0.9110238698173427
Testing Accuracy: 0.9109979023074618
Confusion Matrix:
 [[15200     0]
 [ 1485     0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95     15200
           1       0.00      0.00      0.00      1485

    accuracy                           0.91     16685
   macro avg       0.46      0.50      0.48     16685
weighted avg       0.83      0.91      0.87     16685



Decision Tree Classifier

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Define the model with class weighting
dt_model = DecisionTreeClassifier(class_weight={0: 1, 1: 10})

# Fit the model
dt_model.fit(X_train, y_train)

# Evaluate the model
train_accuracy = accuracy_score(y_train, dt_model.predict(X_train))
test_accuracy = accuracy_score(y_test, dt_model.predict(X_test))

print("Decision Tree Classifier:")
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Make predictions on the test set
y_pred = dt_model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Decision Tree Classifier:
Training Accuracy: 0.9999700316166444
Testing Accuracy: 0.8445909499550495
Confusion Matrix:
 [[13882  1318]
 [ 1275   210]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.91      0.91     15200
           1       0.14      0.14      0.14      1485

    accuracy                           0.84     16685
   macro avg       0.53      0.53      0.53     16685
weighted avg       0.85      0.84      0.85     16685



In [14]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Define the model
dt_model_hyper = DecisionTreeClassifier()

# Define hyperparameters for tuning
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform GridSearchCV
grid_search_dc = GridSearchCV(dt_model_hyper, param_grid, cv=5)
grid_search_dc.fit(X_train, y_train)

# Print the best hyperparameters
best_hyperparameters = grid_search_dc.best_params_
print("Best Hyperparameters (Decision Tree):", best_hyperparameters)

# Get the best model
best_dt_model = grid_search_dc.best_estimator_

# Evaluate the model
train_accuracy = accuracy_score(y_train, best_dt_model.predict(X_train))
test_accuracy = accuracy_score(y_test, best_dt_model.predict(X_test))

print("Decision Tree Classifier:")
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Make predictions on the test set
y_pred = best_dt_model.predict(X_test)

# Compute the confusion matrix and classification report
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Best Hyperparameters (Decision Tree): {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Decision Tree Classifier:
Training Accuracy: 0.9118779687429761
Testing Accuracy: 0.9095594845669763
Confusion Matrix:
 [[15170    30]
 [ 1479     6]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95     15200
           1       0.17      0.00      0.01      1485

    accuracy                           0.91     16685
   macro avg       0.54      0.50      0.48     16685
weighted avg       0.84      0.91      0.87     16685

