In [17]:
import joblib
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import f1_score, recall_score, precision_score
from imblearn.pipeline import Pipeline as imPipeline
from imblearn.over_sampling import ADASYN
import warnings
warnings.filterwarnings("ignore")

In [28]:
# ! pip install optuna
# ! pip install xgboost

In [7]:
# load cleaned dataset
df = joblib.load("../src/cleaned_loan_df.pkl")
df.head(1)

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0


In [8]:
X = df.drop("Default", axis=1)
y = df["Default"]

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.3, random_state=42)

In [34]:
joblib.dump(X_train, "../src/X_train.pkl")
joblib.dump(X_test, "../src/X_test.pkl")
joblib.dump(y_train, "../src/y_train.pkl")
joblib.dump(y_test, "../src/y_test.pkl")

['../src/y_test.pkl']

In [39]:
# X_test

In [30]:
# load column transformer
encoding = joblib.load("../src/encoder.pkl")
# scaling = joblib.load("../src/scaler.pkl")

In [45]:
scaling = ColumnTransformer([
    ("scale", StandardScaler(), slice(0, len(features_dict["numeric_features"])))
], remainder="passthrough")

In [42]:
# 3 load features dictinoary
features_dict = joblib.load("../src/features_config.pkl")

In [31]:
encoding

In [44]:
features_dict["numeric_features"]

['Age',
 'Income',
 'LoanAmount',
 'CreditScore',
 'MonthsEmployed',
 'NumCreditLines',
 'InterestRate',
 'LoanTerm',
 'DTIRatio']

In [None]:
# Optuna objective
def objective(trial):
    model_type = trial.suggest_categorical("model_type", ["LogReg", "RF", "XGB"])

    if model_type == "LogReg":
        C = trial.suggest_loguniform("C", 1e-4, 1e2)
        penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
        solver = "liblinear" if penalty == "l1" else "lbfgs"
        clf = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=1000)
    elif model_type == "RF":
        n_estimators = trial.suggest_int("n_estimators", 100, 300)
        max_depth = trial.suggest_int("max_depth", 3, 10)
        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    else:  # XGB
        n_estimators = trial.suggest_int("n_estimators", 100, 300)
        max_depth = trial.suggest_int("max_depth", 3, 10)
        learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.3)
        clf = xgb.XGBClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                learning_rate=learning_rate, eval_metric='logloss', use_label_encoder=False)

    pipeline = imPipeline([
        ("encoding", encoding),
        ("adasyn", ADASYN(random_state=42)),
        ("scaling", scaling),
        ("classifier", clf)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    trial.set_user_attr("precision", prec)
    trial.set_user_attr("recall", rec)

    return f1

# 6. Run study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# 7. Show best params
print("Best params:", study.best_params)

# 8. Train best pipeline again with predict_proba
best_model_type = study.best_params["model_type"]

if best_model_type == "LogReg":
    C = study.best_params["C"]
    penalty = study.best_params["penalty"]
    solver = "liblinear" if penalty == "l1" else "lbfgs"
    best_clf = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=1000)
elif best_model_type == "RF":
    best_clf = RandomForestClassifier(
        n_estimators=study.best_params["n_estimators"],
        max_depth=study.best_params["max_depth"],
        random_state=42
    )
else:
    best_clf = xgb.XGBClassifier(
        n_estimators=study.best_params["n_estimators"],
        max_depth=study.best_params["max_depth"],
        learning_rate=study.best_params["learning_rate"],
        eval_metric='logloss',
        use_label_encoder=False
    )

final_pipeline = imPipeline([
    ("encoding", encoding),
    ("adasyn", ADASYN(random_state=42)),
    ("scaling", scaling),
    ("classifier", best_clf)
])

final_pipeline.fit(X_train, y_train)

[I 2025-07-09 22:50:26,825] A new study created in memory with name: no-name-5758b429-5fc6-463a-9295-9c8d0ea0bd8b
[I 2025-07-09 22:51:17,320] Trial 0 finished with value: 0.3305265401833298 and parameters: {'model_type': 'LogReg', 'C': 0.11905237167991027, 'penalty': 'l1'}. Best is trial 0 with value: 0.3305265401833298.
[I 2025-07-09 22:52:19,457] Trial 1 finished with value: 0.10176488801733925 and parameters: {'model_type': 'XGB', 'n_estimators': 210, 'max_depth': 8, 'learning_rate': 0.030635703187125704}. Best is trial 0 with value: 0.3305265401833298.
[I 2025-07-09 22:59:31,560] Trial 2 finished with value: 0.02675443252999257 and parameters: {'model_type': 'RF', 'n_estimators': 228, 'max_depth': 9}. Best is trial 0 with value: 0.3305265401833298.
[I 2025-07-09 23:00:33,402] Trial 3 finished with value: 0.33050847457627125 and parameters: {'model_type': 'LogReg', 'C': 8.842766039160397, 'penalty': 'l1'}. Best is trial 0 with value: 0.3305265401833298.
[I 2025-07-09 23:01:37,683] T

In [2]:
# Note:
# 1. we could not use feature names in scaling bcz after column transformer we get numpy arrays,
# 2. our logic of scaling by slice is also wrong bcz our feature columns will append at last of our data.
# 3. we will putt numeric features at start of column transformer, which have put

In [None]:
# study.trials_dataframe()