In [1]:
#Import Libraries
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve, auc, confusion_matrix,precision_recall_fscore_support
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the encoded loan export, named once so it is easy to repoint.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is changed.
DATA_PATH = r"D:\Work Station\mlproject\notebbok\dataset\Encoded_LoanExport.csv"

# Read the CSV into the working DataFrame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,CreditScore,FirstPaymentDate,FirstTimeHomebuyer,MaturityDate,MSA,MIP,Units,Occupancy,OCLTV,DTI,...,PostalCode,LoanSeqNum,LoanPurpose,OrigLoanTerm,NumBorrowers,SellerName,ServicerName,EverDelinquent,MonthsDelinquent,MonthsInRepayment
0,0,199902,0,202901,357,25,1,1,89,27,...,470,86314,2,360,1,17,9,0,0,52
1,0,199902,0,202901,387,0,1,1,73,17,...,688,259731,1,360,0,18,13,0,0,144
2,0,199902,0,202901,110,0,1,1,75,16,...,531,85110,1,360,1,17,9,0,0,67
3,0,199902,0,202901,125,0,1,1,76,14,...,787,18846,1,360,1,2,2,0,0,35
4,0,199902,0,202901,169,0,1,1,78,18,...,637,19227,1,360,1,2,2,0,0,54


Target and Feature Columns

In [3]:
# Define target and features.
# NOTE(review): MonthsInRepayment looks like a month count, and the classifiers
# used later treat every distinct value as its own class — confirm intended.
target_column = 'MonthsInRepayment'

# 'Unnamed: 0' appears to be a row index written into the CSV by an earlier
# export (it reads 0, 1, 2, ... in the df.head() preview). It is not a loan
# attribute, so it is excluded from the features. errors='ignore' keeps the
# cell working when that column is absent.
X = df.drop(columns=[target_column, 'Unnamed: 0'], errors='ignore')
y = df[target_column]

# Mean-impute missing values by reassignment rather than in place, so the cell
# is idempotent on re-run. numeric_only=True avoids a failure if a non-numeric
# column is ever present; such columns are simply left un-imputed.
# NOTE(review): the means come from the full dataset, i.e. before the
# train/validation/test split, so held-out statistics leak into training.
X = X.fillna(X.mean(numeric_only=True))

Split the data into train, validation, and test sets

In [4]:
def split_data(X, y, test_size=0.2, val_size=0.2, random_state=42, stratify=True):
    """Split ``X``/``y`` into train, validation and test partitions.

    The test partition is carved out first; the validation partition is then
    taken from the remainder, with its fraction rescaled so that ``val_size``
    is expressed relative to the full dataset.

    Parameters
    ----------
    X, y : features and target, passed straight to ``train_test_split``.
    test_size : float
        Fraction of the full dataset reserved for testing (0 < value < 1).
    val_size : float
        Fraction of the full dataset reserved for validation (0 < value < 1).
    random_state : int or None
        Seed forwarded to both ``train_test_split`` calls (default 42).
    stratify : bool
        When true (default), both splits are stratified on the target.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises
    ------
    ValueError
        If either size is outside (0, 1) or the two sizes leave no data for
        training.
    """
    if not (0 < test_size < 1) or not (0 < val_size < 1):
        raise ValueError(
            f"test_size and val_size must be fractions in (0, 1); "
            f"got test_size={test_size}, val_size={val_size}"
        )
    if test_size + val_size >= 1:
        raise ValueError(
            f"test_size + val_size must be < 1 to leave training data; "
            f"got {test_size} + {val_size}"
        )

    # First cut: hold out the test partition.
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    # val_size is relative to the full dataset, but the second split only sees
    # the (1 - test_size) remainder, so rescale the fraction accordingly.
    val_proportion = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val,
        test_size=val_proportion,
        random_state=random_state,
        stratify=y_train_val if stratify else None,
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

# Build the train/validation/test partitions using split_data's default sizes
# (test_size=0.2, val_size=0.2).
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)

Feature Scaling using StandardScaler

In [5]:
# Learn the scaling statistics from the training partition only, then apply
# that same fitted scaler to every partition.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(partition) for partition in (X_train, X_val, X_test)
)

Model Selection

In [6]:
# Candidate classifiers, keyed by the display name used in the printed results.
models = {}
models["Logistic Regression"] = LogisticRegression(solver='liblinear', random_state=42)
# The SVM candidate is left disabled.
# NOTE(review): presumably too slow with probability=True on this data — confirm before re-enabling.
# models["Support Vector Machine (SVM)"] = SVC(probability=True, random_state=42)
models["Gaussian Discriminant Analysis (GDA)"] = QDA()
models["Feed Forward Neural Network"] = MLPClassifier(random_state=42)


Evaluate the models and their classification reports

Evaluating Models with time stamp and memory

In [7]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, classification_report, confusion_matrix

def evaluate_model(name, model, X_val_scaled, y_val, X_fit=None, y_fit=None):
    """Fit ``model`` and score it on the validation data.

    Parameters
    ----------
    name : str
        Display name stored under the ``'Model'`` key of the result.
    model : estimator
        Object exposing ``fit`` and ``predict``; ``predict_proba`` is used for
        the AUC ROC when it exists. The estimator is fitted in place.
    X_val_scaled, y_val :
        Validation features and labels used for every metric.
    X_fit, y_fit : optional
        Training features and labels. When omitted, the notebook-level
        ``X_train_scaled`` and ``y_train`` are used, so existing
        four-argument calls behave as before.

    Returns
    -------
    dict
        Keys ``'Model'``, ``'Precision'``, ``'Recall'``, ``'F1 Score'``
        (weighted averages), ``'AUC ROC'`` (one-vs-rest, or ``None`` when it
        cannot be computed), ``'Classification Report'`` (dict form) and
        ``'Confusion Matrix'``.
    """
    # Prefer explicitly supplied training data; fall back to the notebook
    # globals only for backward compatibility with the original call sites.
    if X_fit is None:
        X_fit = X_train_scaled
    if y_fit is None:
        y_fit = y_train

    model.fit(X_fit, y_fit)
    y_val_pred = model.predict(X_val_scaled)

    # Per-class report (as a dict) plus support-weighted summary metrics.
    report = classification_report(y_val, y_val_pred, output_dict=True)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_val, y_val_pred, average='weighted'
    )

    # AUC ROC needs class probabilities, so it is skipped for models without
    # predict_proba.
    auc_roc = None
    if hasattr(model, "predict_proba"):
        y_val_proba = model.predict_proba(X_val_scaled)
        try:
            auc_roc = roc_auc_score(y_val, y_val_proba, multi_class='ovr')
        except ValueError as exc:
            # roc_auc_score rejected these inputs; keep None as before, but
            # report why instead of failing silently.
            print(f"AUC ROC could not be calculated for {name}: {exc}")

    conf_matrix = confusion_matrix(y_val, y_val_pred)

    return {
        'Model': name,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'AUC ROC': auc_roc,
        'Classification Report': report,
        'Confusion Matrix': conf_matrix
    }

#

In [8]:
# Fit and score every candidate model, keeping one result dict per model.
results = []
for name, model in models.items():
    print(f"Evaluating {name}...")
    result = evaluate_model(name, model, X_val_scaled, y_val)
    results.append(result)

    # Scalar metrics first, in a fixed order, then the larger structures.
    print(f"Results for {name}:")
    for metric in ("Precision", "Recall", "F1 Score", "AUC ROC"):
        print(f"{metric}: {result[metric]}")
    print(f"Confusion Matrix:\n{result['Confusion Matrix']}")
    print(f"Classification Report:\n{result['Classification Report']}\n")


Evaluating Logistic Regression...
Results for Logistic Regression:
Precision: 0.020055255969820936
Recall: 0.04040144107050952
F1 Score: 0.02039007979714241
AUC ROC: 0.6771710891518091
Confusion Matrix:
[[  0   2   0 ...   0   1   4]
 [  1   1   2 ...   3   1   1]
 [  0   2   0 ...   6   2   1]
 ...
 [  2   0   1 ... 315  24   5]
 [  0   0   0 ... 137 111  18]
 [  0   0   0 ...  14  91  39]]
Classification Report:
{'1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 95.0}, '2': {'precision': 0.02631578947368421, 'recall': 0.005917159763313609, 'f1-score': 0.00966183574879227, 'support': 169.0}, '3': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 180.0}, '4': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 204.0}, '5': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213.0}, '6': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 221.0}, '7': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 206.0}, '8': {'p

In [10]:
# Pull each model's name and validation AUC ROC out of the result dicts.
model_names = []
auc_roc_scores = []
for entry in results:
    model_names.append(entry['Model'])
    auc_roc_scores.append(entry['AUC ROC'])

# One row per model, ordered from highest to lowest AUC ROC.
rows = list(zip(model_names, auc_roc_scores))
results_df = pd.DataFrame(rows, columns=['Model Name', 'AUC ROC'])
results_df = results_df.sort_values(by=["AUC ROC"], ascending=False)

# Show the ranked table.
print(results_df)


                             Model Name   AUC ROC
2           Feed Forward Neural Network  0.769239
0                   Logistic Regression  0.677171
1  Gaussian Discriminant Analysis (GDA)  0.529191


In [11]:
# Train a fresh feed-forward network — the configuration with the highest
# validation AUC ROC in the table above — on the training partition.
best_model = MLPClassifier(random_state=42).fit(X_train_scaled, y_train)

# Hard labels feed the classification report and confusion matrix;
# class probabilities feed the AUC ROC.
y_test_pred = best_model.predict(X_test_scaled)
y_test_proba = best_model.predict_proba(X_test_scaled)

# Per-class precision/recall/F1 on the held-out test set.
test_report = classification_report(y_test, y_test_pred)
print("Classification Report for Test Set:\n")
print(test_report)

# One-vs-rest AUC ROC on the test set; roc_auc_score raises ValueError when it
# cannot score the given labels/probabilities, in which case None is recorded.
try:
    auc_roc_test = roc_auc_score(y_test, y_test_proba, multi_class='ovr')
except ValueError:
    auc_roc_test = None
    print("AUC ROC could not be calculated for the test set.")
else:
    print(f"AUC ROC Score for Test Set: {auc_roc_test:.4f}")

Classification Report for Test Set:

              precision    recall  f1-score   support

           1       0.10      0.07      0.08        95
           2       0.07      0.06      0.06       170
           3       0.05      0.06      0.05       180
           4       0.05      0.17      0.08       204
           5       0.04      0.19      0.06       213
           6       0.04      0.02      0.03       220
           7       0.02      0.01      0.01       206
           8       0.00      0.00      0.00       217
           9       0.07      0.00      0.01       224
          10       0.00      0.00      0.00       205
          11       0.00      0.00      0.00       235
          12       0.03      0.00      0.01       265
          13       0.03      0.00      0.01       306
          14       0.03      0.00      0.01       323
          15       0.00      0.00      0.00       354
          16       0.08      0.01      0.01       400
          17       0.07      0.01      0.02 

In [12]:
# Confusion matrix of the test-set predictions from the cell above
# (in scikit-learn's convention rows are true labels, columns are predictions).
conf_matrix_test = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix for Test Set:\n", conf_matrix_test)


Confusion Matrix for Test Set:
 [[  7   8   7 ...   1   1   0]
 [  6  10  10 ...   4   4   0]
 [  5   4  10 ...   4   3   4]
 ...
 [  0   1   1 ... 377  65   8]
 [  0   0   0 ... 116 216  39]
 [  0   0   1 ...   6 106  92]]
