In [None]:
import pickle
import yaml

import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier


In [None]:
import pandas as pd
import pickle
from category_encoders import CatBoostEncoder

class DataProcessor:
    """Static helpers that fit, persist and apply the categorical encoder.

    All methods are static; the class only serves as a namespace.
    """

    @staticmethod
    def load_category_encoder(category_encoder_path: str):
        """Unpickle and return the encoder stored at ``category_encoder_path``.

        SECURITY: ``pickle.load`` can execute arbitrary code contained in the
        file, so only load encoder files written by this project.
        """
        # Context manager guarantees the handle is closed even if unpickling fails.
        with open(category_encoder_path, "rb") as encoder_file:
            return pickle.load(encoder_file)

    @staticmethod
    def save_category_encoder(category_encoder, category_encoder_path):
        """Pickle ``category_encoder`` to ``category_encoder_path`` (overwriting it)."""
        # Closing the handle explicitly flushes the pickle to disk before returning.
        with open(category_encoder_path, "wb") as encoder_file:
            pickle.dump(category_encoder, encoder_file)

    @staticmethod
    def apply_category_features(df, category_columns=None, category_encoder=None):
        """Replace ``category_columns`` of ``df`` with the encoder's output.

        The assignment happens on ``df`` itself, so the caller's frame is
        modified in place; the same frame is returned for convenience.
        Both ``category_columns`` and ``category_encoder`` must be supplied
        despite their ``None`` defaults.
        """
        df[category_columns] = category_encoder.transform(df[category_columns])
        return df

    @staticmethod
    def process_data(df, cfg):
        """Fit a ``CatBoostEncoder`` on ``df``, persist it, and return an encoded copy.

        Args:
            df: Frame holding the category columns and the target column
                named in ``cfg.feature_config``.
            cfg: Config object exposing ``feature_config`` (with the keys
                ``"category_columns"`` and ``"target_column"``) and
                ``category_index_path`` (where the fitted encoder is pickled).

        Returns:
            A copy of ``df`` whose category columns are replaced by the
            fitted encoder's ``transform`` output; ``df`` is left untouched.
        """
        category_columns = cfg.feature_config["category_columns"]
        target_column = cfg.feature_config["target_column"]

        encoder = CatBoostEncoder(cols=category_columns)
        # fit() is sufficient: the output of the former fit_transform() call
        # was discarded and the columns are encoded with transform() below.
        encoder.fit(df[category_columns], df[target_column])
        DataProcessor.save_category_encoder(encoder, cfg.category_index_path)

        encoded_df = df.copy()
        encoded_df[category_columns] = encoder.transform(df[category_columns])
        return encoded_df

    @staticmethod
    def apply_process_data(df, cfg, Enc):
        """Encode the configured category columns of ``df`` with the fitted ``Enc``.

        Delegates to ``apply_category_features``, so ``df`` is modified in
        place and also returned.
        """
        category_columns = cfg.feature_config["category_columns"]
        encoded_df = DataProcessor.apply_category_features(df, category_columns, Enc)
        return encoded_df

In [None]:
class cfg1:
    """Paths and feature schema for problem 1."""

    # CSV (or parquet) file read by the data-loading cell.
    data_path = "./prob1/data/cleaned_combined_data.csv"
    # Output locations; presumably for the trained model and its ONNX export
    # (not used in the cells visible here).
    model_path = "./prob_1_model.pkl"
    onnx_path = "./prob_1_onnx_model.onnx"
    # Where DataProcessor.process_data pickles the fitted category encoder.
    category_index_path = "./prob_1_category_index_path.pkl"

    feature_config = dict(
        # "feature1" followed by "feature5" ... "feature41"; features 2-4 are
        # the categorical ones listed separately below.
        numeric_columns=["feature1", *(f"feature{n}" for n in range(5, 42))],
        category_columns=["feature2", "feature3", "feature4"],
        target_column="label",
        ml_type="classification",
    )

In [None]:
# Load the dataset configured in cfg1: parquet is tried first and any read
# failure falls back to CSV.
try:
    df = pd.read_parquet(cfg1.data_path)
except Exception:
    # `Exception` rather than a bare `except` so that KeyboardInterrupt and
    # SystemExit still propagate instead of silently triggering the fallback.
    df = pd.read_csv(cfg1.data_path)
df.head()

In [None]:
# Class balance of the target column.
df.loc[:, "label"].value_counts()

In [None]:
# Fit the category encoder on the loaded frame, persist it to
# cfg1.category_index_path, and keep the encoded copy for modelling.
processed_df = DataProcessor.process_data(df=df, cfg=cfg1)
processed_df.head()

In [None]:
# Only the numeric columns are used as model inputs; the variant that also
# prepends cfg1.feature_config["category_columns"] is currently disabled.
feature_columns = cfg1.feature_config["numeric_columns"]
target_column = "label"

X, y = processed_df[feature_columns], processed_df[target_column]

In [None]:
import optuna
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from optuna.integration import LightGBMPruningCallback

def objective(trial, X, y, n_splits=5, random_state=304):
    """Optuna objective: out-of-fold ROC AUC of an ``LGBMClassifier``.

    Hyper-parameters are sampled from ``trial``, one model is trained per
    stratified fold, and the positive-class probabilities predicted on each
    held-out fold are pooled before a single ROC AUC is computed.

    Args:
        trial: Optuna trial used to sample hyper-parameters and to report
            intermediate AUC values for pruning.
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Binary target aligned with ``X``; also indexed with ``.iloc``.
        n_splits: Number of stratified folds (default 5, the previously
            hard-coded value).
        random_state: Seed for the fold shuffling (default 304, the
            previously hard-coded value).

    Returns:
        ROC AUC over the pooled out-of-fold predictions (higher is better).
    """
    param_grid = {
        "n_estimators": trial.suggest_int("n_estimators", 5, 300, step=5),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "num_leaves": trial.suggest_int("num_leaves", 10, 200, step=5),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 200, step=5),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 20, step=5),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 20, step=5),
        "min_split_gain": trial.suggest_float("min_split_gain", 0, 5),
        # NOTE(review): LightGBM documents that bagging (`subsample`) only
        # takes effect when `subsample_freq` > 0, which is not set here --
        # confirm whether this parameter influences training at all.
        "subsample": trial.suggest_float(
            "subsample", 0.2, 0.95, step=0.1
        ),
        "colsample_bytree": trial.suggest_float(
            "colsample_bytree", 0.2, 0.95, step=0.1
        ),
    }

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_targets = []
    oof_preds = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[val_idx]

        model = LGBMClassifier(objective="binary", **param_grid, verbosity=-1)
        # The held-out fold doubles as the eval set whose per-iteration AUC
        # feeds the pruning callback.
        # NOTE(review): the same trial is reused for every fold, so the
        # reported steps repeat across folds -- confirm this is the intended
        # pruning behaviour.
        # NOTE(review): `verbose=` is reportedly no longer accepted by fit()
        # in LightGBM >= 4.0; `verbosity=-1` above should already keep
        # training quiet -- confirm against the installed version.
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)],
                  eval_metric="auc",
                  callbacks=[
                      LightGBMPruningCallback(trial, "auc")
                  ],
                  verbose=False)

        # Keep only the positive-class probability of each held-out row.
        oof_preds.extend(model.predict_proba(X_test)[:, 1])
        oof_targets.extend(y_test)

    return roc_auc_score(np.array(oof_targets), np.array(oof_preds))

In [None]:
def logging_callback(study, frozen_trial):
    """Optuna callback that prints only noteworthy trials.

    A line is printed whenever the study's best value changes (the new best
    is remembered in the study's user attribute ``"previous_best_value"``)
    and, otherwise, for every 50th trial as a heartbeat.

    Args:
        study: The running study; ``best_value``, ``user_attrs`` and
            ``set_user_attr`` are used.
        frozen_trial: The trial that just finished; ``number``, ``value``
            and ``params`` are printed.
    """
    try:
        best_value = study.best_value
    except ValueError:
        # Optuna raises ValueError while no trial has completed yet (e.g. the
        # only finished trials were pruned); there is nothing to report then.
        return

    previous_best_value = study.user_attrs.get("previous_best_value", None)
    if previous_best_value != best_value:
        study.set_user_attr("previous_best_value", best_value)
        print(
            "Trial {} finished with best value: {} and parameters: {}. ".format(
                frozen_trial.number,
                frozen_trial.value,
                frozen_trial.params,
            )
        )
    elif frozen_trial.number % 50 == 0:
        # Heartbeat: this trial did not improve the best value, so show its
        # own value next to the best one instead of mislabelling it as best.
        print(
            "Trial {} finished with value: {} (best so far: {}).".format(
                frozen_trial.number,
                frozen_trial.value,
                best_value,
            )
        )

In [None]:
import warnings
from warnings import simplefilter

# Keep the cell output readable: silence Python warnings and let Optuna log
# warnings only (progress is printed by `logging_callback` instead).
warnings.filterwarnings("ignore")
simplefilter("ignore", category=RuntimeWarning)
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Seed the sampler so repeated runs draw the same sequence of hyper-parameter
# suggestions; TPESampler is the sampler create_study() uses by default.
STUDY_SEED = 304

# The objective returns an out-of-fold ROC AUC, hence direction="maximize".
study = optuna.create_study(
    direction="maximize",
    study_name="LGBMClassifier",
    sampler=optuna.samplers.TPESampler(seed=STUDY_SEED),
)


def func(trial):
    """Bind the prepared feature matrix and target to the objective."""
    return objective(trial, X, y)


study.optimize(func, n_trials=200, callbacks=[logging_callback])

In [None]:
# The study maximizes the out-of-fold ROC AUC returned by `objective`, so the
# best value is an AUC (the previous "rmse" label was wrong).
print(f"\tBest value (ROC AUC): {study.best_value:.5f}")
print("\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

In [None]:
from optuna.visualization import plot_optimization_history

# Plotly display options shared by the study plots: render as static figures.
plotly_config = dict(staticPlot=True)

# Objective value of every trial over the course of the study.
fig = plot_optimization_history(study=study)
fig.show(config=plotly_config)

In [None]:
from optuna.visualization import plot_param_importances

# Importance of each tuned hyper-parameter for the objective value; reuses the
# `plotly_config` display options defined with the optimisation-history plot.
fig = plot_param_importances(study=study)
fig.show(config=plotly_config)