In [3]:
import joblib
import sklearn
from sklearn.metrics import classification_report, roc_curve, confusion_matrix
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import f1_score, recall_score, precision_score
# from imblearn.pipeline import Pipeline as imPipeline
# from imblearn.over_sampling import ADASYN
import warnings
warnings.filterwarnings("ignore")

In [2]:
# ! pip install imblearn 
! pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Downloading ipywidgets-8.1.7-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
Downloading widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
   --------- ------------------------------ 0.5/2.2 MB 1.5 MB/s eta 0:00:02
   ----------------------- ---------------- 1.3/2.2 MB 2.8 MB/s eta 0:00:01
   --------------------------------- ------ 1.8/2.2 MB 3.1 MB/s eta 0:00:01
   ---------------------------------------- 2.2/2.2 MB 2.9 MB/s eta 0:00:00
Installing collected packages: widgetsnbextension, jupyterlab_widgets, ipywidgets

   ---------

In [4]:
# load cleaned dataset
df = joblib.load("../src/cleaned_loan_df.pkl")
df.head(1)

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0


In [5]:
X = df.drop("Default", axis=1)
y = df["Default"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.3, random_state=42)

In [7]:
joblib.dump(X_train, "../src/X_train.pkl")
joblib.dump(X_test, "../src/X_test.pkl")
joblib.dump(y_train, "../src/y_train.pkl")
joblib.dump(y_test, "../src/y_test.pkl")

['../src/y_test.pkl']

In [39]:
# X_test

In [8]:
# load column transformer
encoding = joblib.load("../src/encoder.pkl")
scaling = joblib.load("../src/scaler.pkl"
)

In [21]:
# 3 load features dictinoary
features_dict = joblib.load("../src/features_config.pkl")

In [8]:
encoding

In [44]:
features_dict["numeric_features"]

['Age',
 'Income',
 'LoanAmount',
 'CreditScore',
 'MonthsEmployed',
 'NumCreditLines',
 'InterestRate',
 'LoanTerm',
 'DTIRatio']

In [16]:
# ! pip install -r ../requirements.txt
# ! pip install scikit-learn==1.1.3


In [9]:
from xgboost import XGBClassifier
from sklearn.base import BaseEstimator, ClassifierMixin

class SklearnXGBWrapper(XGBClassifier, BaseEstimator, ClassifierMixin):
    def __sklearn_tags__(self):
        return {"binary_only": False, "requires_y": True}


In [12]:
# Optuna objective
def objective(trial):
    model_type = trial.suggest_categorical("model_type", ["LogReg", "RF"])

    if model_type == "LogReg":
        C = trial.suggest_loguniform("C", 1e-4, 1e2)
        penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
        solver = "liblinear" if penalty == "l1" else "lbfgs"
        clf = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=1000)
    elif model_type == "RF":
        n_estimators = trial.suggest_int("n_estimators", 100, 300)
        max_depth = trial.suggest_int("max_depth", 3, 10)
        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    else:  # XGB
        n_estimators = trial.suggest_int("n_estimators", 100, 300)
        max_depth = trial.suggest_int("max_depth", 3, 10)
        learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.3)
        # clf = xgb.XGBClassifier(n_estimators=n_estimators, max_depth=max_depth,
        #                         learning_rate=learning_rate, eval_metric='logloss', use_label_encoder=False)
        clf = SklearnXGBWrapper(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        eval_metric='logloss',
        use_label_encoder=False
)


    pipeline = imPipeline([
        ("encoding", encoding),
        ("adasyn", ADASYN(random_state=42)),
        ("scaling", scaling),
        ("classifier", clf)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    trial.set_user_attr("precision", prec)
    trial.set_user_attr("recall", rec)

    return f1

# 6. Run study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# 7. Show best params
print("Best params:", study.best_params)

# 8. Train best pipeline again with predict_proba
best_model_type = study.best_params["model_type"]

if best_model_type == "LogReg":
    C = study.best_params["C"]
    penalty = study.best_params["penalty"]
    solver = "liblinear" if penalty == "l1" else "lbfgs"
    best_clf = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=1000)
elif best_model_type == "RF":
    best_clf = RandomForestClassifier(
        n_estimators=study.best_params["n_estimators"],
        max_depth=study.best_params["max_depth"],
        random_state=42
    )
else:
    best_clf = xgb.XGBClassifier(
        n_estimators=study.best_params["n_estimators"],
        max_depth=study.best_params["max_depth"],
        learning_rate=study.best_params["learning_rate"],
        eval_metric='logloss',
        use_label_encoder=False
    )

final_pipeline = imPipeline([
    ("encoding", encoding),
    ("adasyn", ADASYN(random_state=42)),
    ("scaling", scaling),
    ("classifier", best_clf)
])

final_pipeline = final_pipeline.fit(X_train, y_train)
metadata = {
    "model": final_pipeline,
    "sklearn_version": sklearn.__version__
}
joblib.dump(metadata, "final_pipeline_with_meta.pkl")

joblib.dump(study, "study.pkl")


[I 2025-07-17 10:53:27,043] A new study created in memory with name: no-name-4d8021ab-9a67-4e74-a24a-337131f574f6


[I 2025-07-17 10:53:40,089] Trial 0 finished with value: 0.3305001199008819 and parameters: {'model_type': 'LogReg', 'C': 66.48180886510872, 'penalty': 'l1'}. Best is trial 0 with value: 0.3305001199008819.
[I 2025-07-17 10:53:46,778] Trial 1 finished with value: 0.3304737038418501 and parameters: {'model_type': 'LogReg', 'C': 18.774481092661983, 'penalty': 'l1'}. Best is trial 0 with value: 0.3305001199008819.
[I 2025-07-17 10:53:51,903] Trial 2 finished with value: 0.3305001199008819 and parameters: {'model_type': 'LogReg', 'C': 1.5327150389974555, 'penalty': 'l2'}. Best is trial 0 with value: 0.3305001199008819.
[I 2025-07-17 10:53:56,993] Trial 3 finished with value: 0.33046489942720125 and parameters: {'model_type': 'LogReg', 'C': 0.039589883651375564, 'penalty': 'l2'}. Best is trial 0 with value: 0.3305001199008819.
[I 2025-07-17 10:54:58,435] Trial 4 finished with value: 0.1446898959993927 and parameters: {'model_type': 'RF', 'n_estimators': 102, 'max_depth': 5}. Best is trial 0

Best params: {'model_type': 'LogReg', 'C': 0.22980268377099983, 'penalty': 'l1'}


['study.pkl']

In [17]:
# joblib.dump(metadata, "../src/final_pipeline_with_meta.pkl")
joblib.dump(study, "../src/study.pkl")


['../src/study.pkl']

In [None]:
best_model = LogisticRegression(C = 0.10728990079492859, penalty='l1', solver="liblinear", max_iter=1000)

In [26]:
joblib.dump(final_pipeline, "../src/final_pipeline.pkl")

['../src/final_pipeline.pkl']

In [None]:
# Note:
# 1. we could not use feature names in scaling bcz after column transformer we get numpy arrays,
# 2. our logic of scaling by slice is also wrong bcz our feature columns will append at last of our data.
# 3. we will putt numeric features at start of column transformer, which have put
import sklearn
print(sklearn.__version__)


1.1.3


In [None]:

study_df[(study_df.params_model_type == "LogReg") & (study_df.user_attrs_recall>0.67)]

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_learning_rate,params_max_depth,params_model_type,params_n_estimators,params_penalty,user_attrs_precision,user_attrs_recall,state
3,3,0.330722,2025-07-10 00:30:25.543520,2025-07-10 00:31:10.131958,0 days 00:00:44.588438,0.019259,,,LogReg,,l1,0.216958,0.695316,COMPLETE
5,5,0.330491,2025-07-10 00:37:22.351721,2025-07-10 00:38:26.661882,0 days 00:01:04.310161,16.624916,,,LogReg,,l2,0.216793,0.69498,COMPLETE
9,9,0.330483,2025-07-10 00:42:46.941187,2025-07-10 00:43:54.770512,0 days 00:01:07.829325,1.591114,,,LogReg,,l1,0.216785,0.69498,COMPLETE
10,10,0.325089,2025-07-10 00:43:54.774526,2025-07-10 00:44:32.672091,0 days 00:00:37.897565,0.000383,,,LogReg,,l1,0.213774,0.678283,COMPLETE
11,11,0.330491,2025-07-10 00:44:32.673089,2025-07-10 00:45:15.166831,0 days 00:00:42.493742,82.815265,,,LogReg,,l2,0.216793,0.69498,COMPLETE
12,12,0.33042,2025-07-10 00:45:15.167830,2025-07-10 00:46:02.442618,0 days 00:00:47.274788,0.012451,,,LogReg,,l2,0.216742,0.694868,COMPLETE
13,13,0.33042,2025-07-10 00:46:02.443617,2025-07-10 00:46:45.108861,0 days 00:00:42.665244,0.103146,,,LogReg,,l1,0.216742,0.694868,COMPLETE
14,14,0.330491,2025-07-10 00:46:45.110860,2025-07-10 00:47:41.153484,0 days 00:00:56.042624,64.453093,,,LogReg,,l2,0.216793,0.69498,COMPLETE
15,15,0.330474,2025-07-10 00:47:41.157482,2025-07-10 00:48:49.771023,0 days 00:01:08.613541,0.055605,,,LogReg,,l2,0.216777,0.69498,COMPLETE
16,16,0.3305,2025-07-10 00:48:49.773020,2025-07-10 00:49:50.478838,0 days 00:01:00.705818,1.226054,,,LogReg,,l1,0.2168,0.69498,COMPLETE


In [11]:
study_df = study.trials_dataframe()