In [19]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV

In [8]:
# Load the pre-processed (ordinal-encoded) train and test splits from disk.
train_csv = "C:/ML_AI/loan/loan_acceptance_using_multiple-algorithm/data/ordinal_encode/train_processed_data_2025-11-08_07-46-24_PM.csv"
test_csv = "C:/ML_AI/loan/loan_acceptance_using_multiple-algorithm/data/ordinal_encode/test_processed_data_2025-11-08_07-46-24_PM.csv"

df1 = pd.read_csv(train_csv)  # training split
df2 = pd.read_csv(test_csv)   # held-out test split

In [9]:
# Training split: every column except the target is a feature; the target is
# kept as a single-column DataFrame.
xtrain = df1.drop(columns=["loan_status"])
ytrain = df1.loc[:, ["loan_status"]]

In [10]:
# Test split: same feature/target separation as the training split, with the
# target kept as a single-column DataFrame.
xtest = df2.drop(columns=["loan_status"])
ytest = df2.loc[:, ["loan_status"]]

In [11]:
# Undersample the training split with a 1.0 sampling strategy; the seed is
# fixed so the resampled rows are the same on every run.
rus = RandomUnderSampler(
    random_state=42,
    sampling_strategy=1.0,
)
X_res, y_res = rus.fit_resample(xtrain, ytrain)

In [18]:
# Display the resampled target labels (the original cell held a truncated
# attribute access that raises AttributeError on a fresh run).
y_res

Unnamed: 0,loan_status
4023,0
25187,0
15235,0
21760,0
3321,0
...,...
46876,1
46898,1
46902,1
46905,1


In [12]:
# Manual hyper-parameter sweep for LogisticRegression: one MLflow run per
# (C, penalty, solver) combination, each logging its parameters, train/test
# accuracy and ROC AUC, both confusion matrices, and the fitted model.
mlflow.set_experiment("loan_approval_experiment_Logistic_Regression_OrdinalEncoding_sampling1")

# Every (penalty, solver) pair below is a valid scikit-learn combination
# (both liblinear and saga support l1 and l2), so no filtering is needed.
C_values = [0.01, 0.1, 1, 10]
penalties = ["l1", "l2"]
solvers = ["liblinear", "saga"]
MAX_ITER = 500  # single source of truth for the logged and the used value

# The labels are stored as single-column DataFrames; flatten them once so
# that fit() receives a 1-D target and does not emit DataConversionWarning.
y_res_1d = np.ravel(y_res)
ytest_1d = np.ravel(ytest)

for C in C_values:
    for penalty in penalties:
        for solver in solvers:

            with mlflow.start_run(run_name=f"logreg_C={C}_{penalty}_{solver}"):

                # Log params
                mlflow.log_params(
                    {
                        "C": C,
                        "penalty": penalty,
                        "solver": solver,
                        "max_iter": MAX_ITER,
                    }
                )

                # Train model
                model = LogisticRegression(
                    C=C,
                    penalty=penalty,
                    solver=solver,
                    max_iter=MAX_ITER,
                    random_state=42
                )

                model.fit(X_res, y_res_1d)

                # ---------------- Train Predictions ----------------
                y_pred_train = model.predict(X_res)
                y_prob_train = model.predict_proba(X_res)[:, 1]

                acc_train = accuracy_score(y_res_1d, y_pred_train)
                roc_train = roc_auc_score(y_res_1d, y_prob_train)

                # ---------------- Test Predictions ----------------
                y_pred_test = model.predict(xtest)
                y_prob_test = model.predict_proba(xtest)[:, 1]

                acc_test = accuracy_score(ytest_1d, y_pred_test)
                roc_test = roc_auc_score(ytest_1d, y_prob_test)

                # ---------------- Log metrics ----------------
                mlflow.log_metrics(
                    {
                        "train_accuracy": acc_train,
                        "train_roc_auc": roc_train,
                        "test_accuracy": acc_test,
                        "test_roc_auc": roc_test,
                    }
                )

                # ---------------- Confusion Matrices ----------------
                cm_train = confusion_matrix(y_res_1d, y_pred_train)
                cm_test = confusion_matrix(ytest_1d, y_pred_test)

                # The .npy files are overwritten on each iteration; each run
                # uploads its own copy before the next iteration starts.
                np.save("confusion_matrix_train.npy", cm_train)
                np.save("confusion_matrix_test.npy", cm_test)

                mlflow.log_artifact("confusion_matrix_train.npy", artifact_path="metrics")
                mlflow.log_artifact("confusion_matrix_test.npy", artifact_path="metrics")

                # ---------------- Log model ----------------
                mlflow.sklearn.log_model(model, artifact_path="model")

                print(f"✅ Run logged: C={C}, penalty={penalty}, solver={solver}")
                print(f"   Train Acc={acc_train:.3f}, ROC_AUC={roc_train:.3f} | Test Acc={acc_test:.3f}, ROC_AUC={roc_test:.3f}")


2025/11/08 20:47:27 INFO mlflow.tracking.fluent: Experiment with name 'loan_approval_experiment_Logistic_Regression_OrdinalEncoding_sampling1' does not exist. Creating a new experiment.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


✅ Run logged: C=0.01, penalty=l1, solver=liblinear
   Train Acc=0.806, ROC_AUC=0.879 | Test Acc=0.784, ROC_AUC=0.880


  y = column_or_1d(y, warn=True)


✅ Run logged: C=0.01, penalty=l1, solver=saga
   Train Acc=0.807, ROC_AUC=0.880 | Test Acc=0.795, ROC_AUC=0.881


  y = column_or_1d(y, warn=True)


✅ Run logged: C=0.01, penalty=l2, solver=liblinear
   Train Acc=0.804, ROC_AUC=0.878 | Test Acc=0.782, ROC_AUC=0.879


  y = column_or_1d(y, warn=True)


✅ Run logged: C=0.01, penalty=l2, solver=saga
   Train Acc=0.806, ROC_AUC=0.880 | Test Acc=0.795, ROC_AUC=0.881


  y = column_or_1d(y, warn=True)


✅ Run logged: C=0.1, penalty=l1, solver=liblinear
   Train Acc=0.807, ROC_AUC=0.880 | Test Acc=0.797, ROC_AUC=0.881


  y = column_or_1d(y, warn=True)


✅ Run logged: C=0.1, penalty=l1, solver=saga
   Train Acc=0.807, ROC_AUC=0.880 | Test Acc=0.798, ROC_AUC=0.882


  y = column_or_1d(y, warn=True)


✅ Run logged: C=0.1, penalty=l2, solver=liblinear
   Train Acc=0.807, ROC_AUC=0.880 | Test Acc=0.796, ROC_AUC=0.881


  y = column_or_1d(y, warn=True)


✅ Run logged: C=0.1, penalty=l2, solver=saga
   Train Acc=0.807, ROC_AUC=0.880 | Test Acc=0.798, ROC_AUC=0.882


  y = column_or_1d(y, warn=True)


✅ Run logged: C=1, penalty=l1, solver=liblinear
   Train Acc=0.807, ROC_AUC=0.880 | Test Acc=0.799, ROC_AUC=0.881


  y = column_or_1d(y, warn=True)


✅ Run logged: C=1, penalty=l1, solver=saga
   Train Acc=0.807, ROC_AUC=0.880 | Test Acc=0.799, ROC_AUC=0.881


  y = column_or_1d(y, warn=True)


✅ Run logged: C=1, penalty=l2, solver=liblinear
   Train Acc=0.807, ROC_AUC=0.880 | Test Acc=0.799, ROC_AUC=0.881


  y = column_or_1d(y, warn=True)


✅ Run logged: C=1, penalty=l2, solver=saga
   Train Acc=0.807, ROC_AUC=0.880 | Test Acc=0.799, ROC_AUC=0.881


  y = column_or_1d(y, warn=True)


✅ Run logged: C=10, penalty=l1, solver=liblinear
   Train Acc=0.807, ROC_AUC=0.880 | Test Acc=0.799, ROC_AUC=0.881


  y = column_or_1d(y, warn=True)


✅ Run logged: C=10, penalty=l1, solver=saga
   Train Acc=0.807, ROC_AUC=0.880 | Test Acc=0.799, ROC_AUC=0.881


  y = column_or_1d(y, warn=True)


✅ Run logged: C=10, penalty=l2, solver=liblinear
   Train Acc=0.807, ROC_AUC=0.880 | Test Acc=0.799, ROC_AUC=0.881




✅ Run logged: C=10, penalty=l2, solver=saga
   Train Acc=0.807, ROC_AUC=0.880 | Test Acc=0.799, ROC_AUC=0.881


In [16]:
# Cross-validated grid search over LogisticRegression hyper-parameters on the
# undersampled training data, followed by an evaluation of the best estimator
# on the untouched test split.
lg_reg = LogisticRegression(max_iter=2000, random_state=42)

param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear", "saga"],
}
grid = GridSearchCV(
    estimator=lg_reg,
    param_grid=param_grid,
    scoring="roc_auc",   # candidates are ranked by mean CV ROC AUC
    cv=5,
    n_jobs=-1,           # use all available cores
    verbose=2
)

# y_res is a single-column DataFrame; flatten it so every internal fit()
# receives a 1-D target and no DataConversionWarning is raised.
grid.fit(X_res, np.ravel(y_res))

print("Best Parameters:", grid.best_params_)
print("Best CV ROC AUC:", grid.best_score_)

# best_estimator_ is the model GridSearchCV refits with the best parameters
# (refit=True is the default).
best_model = grid.best_estimator_

y_pred = best_model.predict(xtest)
y_prob = best_model.predict_proba(xtest)[:, 1]  # probability of class 1


print("Accuracy:", accuracy_score(ytest, y_pred))
print("ROC AUC:", roc_auc_score(ytest, y_prob))
print("\nClassification Report:\n", classification_report(ytest, y_pred))
print("Confusion Matrix:\n", confusion_matrix(ytest, y_pred))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
Best CV ROC AUC: 0.8799095468601147
Accuracy: 0.7978514792394918
ROC AUC: 0.8815552863051259

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.80      0.87     10087
           1       0.39      0.81      0.53      1642

    accuracy                           0.80     11729
   macro avg       0.68      0.80      0.70     11729
weighted avg       0.88      0.80      0.82     11729

Confusion Matrix:
 [[8022 2065]
 [ 306 1336]]


  y = column_or_1d(y, warn=True)
