## Load the dataset and convert it into a pandas DataFrame:



In [0]:
!pip install optuna lightgbm xgboost

In [0]:

import pandas
import optuna
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier




### Load the data

In [0]:
# Fully qualified names of the gold-layer tables used for modelling.
_train_table = "credit_catalog.gold.ml_train"
_test_table = "credit_catalog.gold.ml_test"

# Read each Spark table and collect it into a pandas DataFrame.
train_df = spark.table(_train_table).toPandas()
test_df = spark.table(_test_table).toPandas()


In [0]:
# Number of rows in the training frame that exactly repeat an earlier row.
train_df.duplicated().sum()

In [0]:
# List the numeric-dtype columns of the training frame.
train_df.select_dtypes('number').columns

In [0]:
# Separate the label column from the feature columns.
target_column = 'Credit_Score'
y = train_df[target_column]
X = train_df.drop(columns=[target_column])

In [0]:
# Learn the string-label -> integer mapping first, then apply it; the fitted
# encoder is kept so predictions can be mapped back with inverse_transform.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)



In [0]:
# train / validation split
# Stratify on the encoded target so both partitions keep the same class
# proportions; the classes are imbalanced enough that class weights are
# computed further down, so an unstratified split can skew the hold-out set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [0]:
# Report (rows, columns) for each partition produced by the split.
for message, frame in (
    ("The size of train data is", X_train),
    ("The shape of test data is", X_test),
):
    print(message, frame.shape)

In [0]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Distinct encoded labels present in the training partition.
classes = np.unique(y_train)

# "balanced" weights are inversely proportional to class frequency.
class_weights = compute_class_weight("balanced", classes=classes, y=y_train)

# Lookup table: encoded label -> weight.
class_weight_dict = {
    class_label: weight for class_label, weight in zip(classes, class_weights)
}


In [0]:
# Per-row weight vector: each training row gets the weight of its class.
sample_weight = np.array(list(map(class_weight_dict.__getitem__, y_train)))

In [0]:
# Nominal categoricals -> one-hot encoded.
ohe_encode = [
    'Occupation',
    'Income_Group',
    'Payment_of_Min_Amount',
]

# Ordered categorical -> ordinal encoded.
ordinal_encode = [
    'Credit_Mix',
]

# Numeric features (including the has_* loan flags and engineered ratios)
# -> standard scaled.
standard_scale = [
    'Age',
    'Monthly_Inhand_Salary',
    'Num_Bank_Accounts',
    'Num_Credit_Card',
    'Interest_Rate',
    'Num_of_Loan',
    'Delay_from_due_date',
    'Num_of_Delayed_Payment',
    'Changed_Credit_Limit',
    'Num_Credit_Inquiries',
    'Outstanding_Debt',
    'Credit_Utilization_Ratio',
    'Credit_History_Age',
    'Total_EMI_per_month',
    'Amount_invested_monthly',
    'Monthly_Balance',
    'has_auto_loan',
    'has_credit_builder_loan',
    'has_debt_consolidation_loan',
    'has_home_equity_loan',
    'has_mortgage_loan',
    'has_payday_loan',
    'has_personal_loan',
    'has_student_loan',
    'spend_level',
    'txn_value_level',
    'Debt_to_Income_Ratio',
    'EMI_to_Salary_Ratio',
    'Saving_Capacity',
]

In [0]:
# Preprocessing pipeline
# Routes each feature group to its own transformer; any column not listed in
# the three groups is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        # Nominal categoricals: one indicator column per level, first level
        # dropped. With handle_unknown='ignore', an unseen level encodes as
        # all zeros, i.e. the same as the dropped first level.
        ('ohe',
         OneHotEncoder(drop='first',
                       sparse_output=False,
                       handle_unknown='ignore'),
         ohe_encode),

        # Ordered categorical: list the levels worst -> best so the integer
        # codes are monotone in credit-mix quality (bad=0, standard=1, good=2).
        # The previous alphabetical order ranked 'standard' above 'good'.
        ('ordinal',
         OrdinalEncoder(categories=[['bad', 'standard', 'good']]),
         ordinal_encode),

        # Numeric features: zero mean / unit variance.
        ('standard',
         StandardScaler(),
         standard_scale)
    ],
    remainder="passthrough",
    n_jobs=-1,
    force_int_remainder_cols=False,
    verbose_feature_names_out=False
)

# Return DataFrames (with feature names) instead of NumPy arrays.
preprocessor.set_output(transform="pandas")

In [0]:
# Fit the preprocessor on the training partition only, then reuse the fitted
# statistics for the other frames.
X_train_trans = preprocessor.fit_transform(X_train)
# X_test is the 20% hold-out carved out of train_df by train_test_split, so it
# serves as the validation set here.
X_val_trans = preprocessor.transform(X_test)
# test_df is the separately loaded ml_test table.
X_test_trans = preprocessor.transform(test_df)



In [0]:
# Select the MLflow experiment that all subsequent runs are recorded under.
experiment_path = "/Users/souravraj664@gmail.com/Exp 1 - Model Selection"
mlflow.set_experiment(experiment_path)

In [0]:
def _build_trial_model(trial, model_name):
    """Sample hyperparameters for ``model_name`` from ``trial`` and return the unfitted estimator."""
    if model_name == "RF":
        return RandomForestClassifier(
            n_estimators=trial.suggest_int("n_estimators_rf", 10, 200),
            max_depth=trial.suggest_int("max_depth_rf", 2, 20),
            random_state=42,
            n_jobs=-1,
        )

    if model_name == "GB":
        return GradientBoostingClassifier(
            n_estimators=trial.suggest_int("n_estimators_gb", 10, 200),
            # Keep the lower bound strictly positive: a learning rate of ~0
            # yields a model that never moves off its initial prediction.
            learning_rate=trial.suggest_float("learning_rate_gb", 0.01, 1.0),
            max_depth=trial.suggest_int("max_depth_gb", 2, 20),
            random_state=42,
        )

    if model_name == "XGB":
        return XGBClassifier(
            n_estimators=trial.suggest_int("n_estimators_xgb", 10, 200),
            learning_rate=trial.suggest_float("learning_rate_xgb", 0.1, 0.5),
            max_depth=trial.suggest_int("max_depth_xgb", 2, 20),
            random_state=42,
            n_jobs=-1,
        )

    if model_name == "LGBM":
        return LGBMClassifier(
            n_estimators=trial.suggest_int("n_estimators_lgbm", 10, 200),
            learning_rate=trial.suggest_float("learning_rate_lgbm", 0.1, 0.5),
            max_depth=trial.suggest_int("max_depth_lgbm", 2, 20),
            random_state=42,
        )

    raise ValueError(f"Unsupported model name: {model_name!r}")


def objective(trial):
    """Optuna objective: train one candidate model and return its hold-out accuracy.

    The trial picks a model family (RF / GB / XGB / LGBM) and its
    hyperparameters, fits it inside a preprocessing pipeline on
    ``X_train`` / ``y_train`` using the class-balancing ``sample_weight``, and
    records parameters and metrics in a nested MLflow run.

    Args:
        trial: The ``optuna`` trial used to sample the search space.

    Returns:
        Accuracy on the hold-out split (``X_test`` / ``y_test``). Training
        accuracy is logged for reference but is not the optimisation target,
        since it rewards overfitting.
    """
    # Local import: ``clone`` is not among the notebook's top-level imports.
    from sklearn.base import clone

    with mlflow.start_run(nested=True):
        model_name = trial.suggest_categorical("model", ["RF", "GB", "XGB", "LGBM"])
        model = _build_trial_model(trial, model_name)

        # Pipeline.fit refits its steps in place, so give every trial its own
        # unfitted copy of the preprocessor instead of mutating the shared
        # notebook-level instance (which other trials may be using).
        full_pipe = Pipeline([
            ('preprocessor', clone(preprocessor)),
            ('model', model),
        ])

        # train the model
        full_pipe.fit(X_train, y_train, model__sample_weight=sample_weight)

        # log model params and the chosen model family
        mlflow.log_params(model.get_params())
        mlflow.log_param("model", model_name)

        # get the predictions
        y_pred_train = full_pipe.predict(X_train)
        y_pred_test = full_pipe.predict(X_test)

        # calculate the accuracy on both partitions
        train_accuracy = accuracy_score(y_train, y_pred_train)
        accuracy = accuracy_score(y_test, y_pred_test)

        # log accuracy: "accuracy_score" is the value being optimised
        # (hold-out); the training figure is kept to expose overfitting.
        mlflow.log_metric("train_accuracy_score", train_accuracy)
        mlflow.log_metric("accuracy_score", accuracy)

        return accuracy

In [0]:
# create optuna study
# Seed the sampler so the search is repeatable, matching the fixed
# random_state used for the split and the estimators.
study = optuna.create_study(
    direction="maximize",
    study_name="model_selection",
    sampler=optuna.samplers.TPESampler(seed=42),
)

with mlflow.start_run(run_name="Best Model") as parent:
    # optimize the objective function
    # Run trials sequentially. Optuna's n_jobs > 1 executes trials in worker
    # threads, but MLflow's fluent "active run" is tracked per thread (in
    # recent MLflow versions), so nested runs started from those threads would
    # not be attached to this parent run. The estimators already use all cores
    # via their own n_jobs=-1, so threading trials on top only oversubscribes
    # the CPU.
    study.optimize(objective, n_trials=10, n_jobs=1)

    # log the best parameters
    mlflow.log_params(study.best_params)

    # log the best score
    mlflow.log_metric("best_score", study.best_value)

In [0]:
# dataframe of results: one row per trial with its value, sampled parameters
# and state

study.trials_dataframe()

In [0]:
# How many trials sampled each model family (the "model" categorical param).
study.trials_dataframe()['params_model'].value_counts()