In [2]:
from pymongo import MongoClient
import optuna
import os

os.environ["NEPTUNE_PROJECT"] = "mlop3n/SDP"
os.environ[
    "NEPTUNE_NOTEBOOK_PATH"
] = "PycharmProjects/sdpiit/notebooks/Pipeline_components_builder.ipynb"
import warnings
from sklearnex import patch_sklearn

patch_sklearn()
import numpy as np
import pandas as pd
from category_encoders import (
    BackwardDifferenceEncoder,
    BaseNEncoder,
    BinaryEncoder,
    CatBoostEncoder,
    CountEncoder,
    GLMMEncoder,
    HelmertEncoder,
    JamesSteinEncoder,
    LeaveOneOutEncoder,
    MEstimateEncoder,
    QuantileEncoder,
    SummaryEncoder,
    TargetEncoder,
    WOEEncoder,
)
from sklearn import set_config
from sklearn.base import clone as model_clone
from sklearn.cluster import *
from sklearn.compose import *
from sklearn.cross_decomposition import *
from sklearn.decomposition import *
from sklearn.ensemble import *
from sklearn.feature_selection import *
from sklearn.gaussian_process import *
from sklearn.linear_model import *
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.multioutput import *
from sklearn.multiclass import *
from sklearn.naive_bayes import *
from sklearn.neighbors import *
from sklearn.neural_network import *
from sklearn.pipeline import *
from sklearn.preprocessing import *
from sklearn.svm import *
from sklearn.tree import *
from sklearn.utils import *
from sklearn.dummy import *
from sklearn.semi_supervised import *
from sklearn.discriminant_analysis import *
import sklearnex, daal4py

from tqdm import tqdm, trange
from xgboost import XGBClassifier, XGBRFClassifier
from BorutaShap import BorutaShap

from sklearn.calibration import *

pd.options.plotting.backend = "plotly"
pd.options.display.max_columns = 50
set_config(display="diagram")
warnings.filterwarnings("ignore")
import pickle
from collections import defaultdict

import matplotlib.pyplot as plt
import seaborn as sns
from joblib import parallel_backend
from joblib.memory import Memory

sns.set()
from pprint import pprint
from helpers import PolynomialWrapper as PWrapper
from helpers import NestedCVWrapper as NCVWrapper
from helpers import ColumnSelectors
import sklearn

from helpers import DFCollection
from helpers import plot_mean_std_max
from helpers import CustomMetrics
import gc

%matplotlib inline
# joblib cache handle rooted in the shared data directory. Despite the name,
# this is a Memory object, not a path string.
CACHE_DIR = Memory(location="../data/joblib_memory/")
# Former PostgreSQL storage URL, kept for reference. The password has been
# redacted: credentials must not be committed, even inside comments.
# OPTUNA_DB = "postgresql+psycopg2://postgres:<password>@localhost:5433/optuna"
from REDIS_CONFIG import REDIS_URL

# Storage URL passed as `storage=` to the Optuna calls in later cells.
OPTUNA_DB = REDIS_URL


def allow_stopping(func):
    """Wrap ``func`` so that a ``KeyboardInterrupt`` stops it quietly.

    The returned wrapper forwards all positional and keyword arguments to
    ``func`` and returns its result. If the call is interrupted with
    ``KeyboardInterrupt``, "Program Stopped" is printed and ``None`` is
    returned. ``gc.collect()`` runs after every call, whether it finished,
    was interrupted, or raised another exception.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except KeyboardInterrupt:
            print("Program Stopped")
            return None
        finally:
            # In a `finally` so the collection also happens on the success
            # path, which the early `return` used to skip.
            gc.collect()

    return wrapper

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
# Project-local helpers (imported from `helpers`): one exposes the data frames,
# the other the column groupings.
db = DFCollection()
column_selector = ColumnSelectors()
# classifiers = [f() for f in cls_names]
dtype_info = column_selector.dtype_info
# Column groups by measurement level; later cells concatenate them with `+`
# and pass them as column selections.
ordinal = column_selector.ordinal_cols
nominal = column_selector.nominal_cols
binary = column_selector.binary_cols
ratio = column_selector.ratio_cols


# Data sets exposed by the collection. `final_data` (with a `target` column)
# and `final_pred_data` are the ones used by the cells below.
final_data = db.final_data
final_pred_data = db.final_pred_data
baseline_prediction_data = db.baseline_prediction_data
data_logit = db.data_logits
prediction_data = db.prediction_data
master_data = db.master
given_data = db.data

ordinal_data, nominal_data, binary_data, ratio_data = db.categorise_data()
nominal_categories = db.nominal_categories
ordinal_categories = db.ordinal_categories
# `class_distribution` comes from the `sklearn.utils` star import. The target
# is reshaped to a single-column 2-D array before being passed in.
class_labels, n_classes, class_priors = class_distribution(
    final_data.target.to_numpy().reshape(-1, 1)
)

# One-hot encode the nominal columns (first level dropped), then remove dummy
# columns whose variance is at or below 0.001.
encoder = OneHotEncoder(sparse=False, drop="first")
variance_thr = VarianceThreshold(0.001)
nominal_ohe_pipe = Pipeline(
    steps=[("ohe", encoder), ("var_th", variance_thr)],
    # CACHE_DIR is already a joblib.Memory. Wrapping it in a second
    # Memory(location=...) hands joblib an unsupported location type, which
    # it discards, silently disabling the transformer cache.
    memory=CACHE_DIR,
)
# Two nominal columns are excluded from this encoding.
ohe_nominal_data = nominal_ohe_pipe.fit_transform(
    nominal_data.drop(["nominal__v_12", "nominal__v_21"], axis=1)
)
# Back to a labelled frame aligned with the original rows.
n1df = pd.DataFrame(
    ohe_nominal_data,
    columns=nominal_ohe_pipe.get_feature_names_out(),
    index=nominal_data.index,
)


def train_test(X, y, test_size):
    """Stratified hold-out split with a fixed seed (``random_state=10``).

    Returns the tuple ``(X_train, X_test, y_train, y_test)``; ``test_size``
    is forwarded unchanged to ``train_test_split`` and ``y`` is used as the
    stratification labels.
    """
    split_options = {"random_state": 10, "test_size": test_size, "stratify": y}
    return tuple(train_test_split(X, y, **split_options))

In [15]:
target = final_data.target
# Same one-hot pipeline as before, but levels rarer than 0.01% of the rows
# are grouped as infrequent (`min_frequency`).
encoder = OneHotEncoder(sparse=False, drop="first", min_frequency=0.0001)
# nominal_data_test = final_pred_data.loc[:,nominal]
variance_thr = VarianceThreshold(0.001)
nominal_ohe_pipe = Pipeline(
    steps=[("ohe", encoder), ("var_th", variance_thr)],
    # CACHE_DIR is already a joblib.Memory. Wrapping it in a second
    # Memory(location=...) hands joblib an unsupported location type, which
    # it discards, silently disabling the transformer cache.
    memory=CACHE_DIR,
)


# Score functions for the p-value based selectors (SelectFpr/Fdr/Fwe).
scf = {"F": f_classif, "CHI": chi2}

# Score functions for the rank based selectors (SelectKBest/SelectPercentile).
scf2 = {"MIC": mutual_info_classif, "F": f_classif, "CHI": chi2}


# Default selector settings, and the registry that `update_selectors` fills.
K_BEST = 40
ALPHA = 0.05
all_selectors = {}
PERCENTILE = 5


def update_selectors(alpha_=ALPHA, k_best=K_BEST, percentile=PERCENTILE):
    """(Re)populate the module-level ``all_selectors`` registry.

    For every score function in ``scf`` a ``SelectFpr``, ``SelectFdr`` and
    ``SelectFwe`` selector is created with ``alpha=alpha_``; for every score
    function in ``scf2`` a ``SelectKBest(k=k_best)`` and a
    ``SelectPercentile(percentile=percentile)`` is created. Each selector is
    stored under the key ``"<criterion>-<SelectorClassName>"``, replacing any
    existing entry with that key.

    Parameters
    ----------
    alpha_ : float
        Significance level for the p-value based selectors.
    k_best : int
        Number of features kept by ``SelectKBest``.
    percentile : int
        Percentage of features kept by ``SelectPercentile``.
    """
    global all_selectors, scf, scf2
    for criterion, score_func in scf.items():
        for selector in (
            SelectFpr(score_func, alpha=alpha_),
            SelectFdr(score_func, alpha=alpha_),
            SelectFwe(score_func, alpha=alpha_),
        ):
            all_selectors[criterion + "-" + selector.__class__.__name__] = selector
    for criterion, score_func in scf2.items():
        # Honour the arguments: the K_BEST / PERCENTILE globals used to be
        # read here, so passing k_best or percentile had no effect.
        for selector in (
            SelectKBest(score_func, k=k_best),
            SelectPercentile(score_func, percentile=percentile),
        ):
            all_selectors[criterion + "-" + selector.__class__.__name__] = selector


# Initialize the selectors
update_selectors(alpha_=ALPHA, k_best=K_BEST)
# One-hot encode only the nominal columns; all other columns are dropped by
# the ColumnTransformer (its default `remainder`).
ct = ColumnTransformer(
    transformers=[("one_hot_enc", nominal_ohe_pipe, nominal)],
    sparse_threshold=0,
    n_jobs=-1,
)
# Every registered selector runs on the encoded matrix and the outputs are
# concatenated, so a feature picked by several selectors appears several times.
ohe_nominal_feature_selection = make_pipeline(
    ct, FeatureUnion(transformer_list=list(all_selectors.items()), n_jobs=-1)
)
# with open('../data/pipelines/ohe_nominal_features.pkl', 'wb') as fp:
#     pickle.dump(ohe_nominal_feature_selection, fp, protocol=-1)
# ohe_nominal_feature_selection
# n1df_test = pd.DataFrame

with parallel_backend("loky"):
    elite_ohe_nominal_features_train = ohe_nominal_feature_selection.fit_transform(
        final_data, target
    )
    elite_ohe_nominal_features_test = ohe_nominal_feature_selection.transform(
        final_pred_data
    )
    f_names = ohe_nominal_feature_selection.get_feature_names_out()
    # Keep the source index: a later cell selects rows with
    # `.loc[X_train.index]`, which only lines up if the labels are preserved.
    elite_onf_df = pd.DataFrame(
        elite_ohe_nominal_features_train, columns=f_names, index=final_data.index
    )
    # Drop columns that are exact duplicates of an earlier column.
    elite_onf_df = (
        elite_onf_df.transpose()
        .drop_duplicates(
            ignore_index=False,
        )
        .transpose()
    )
    # Columns that survived de-duplication on the training data (pre-rename).
    elite_onf_columns = list(elite_onf_df.columns)

    # Strip the "<selector>__<transformer>__" prefix from the generated names.
    f_names_t = {
        x: x.split("__")[2] + "__" + x.split("__")[3] for x in elite_onf_df.columns
    }

    # elite_ohe_nominal_features_train = elite_ohe_nominal_features_train.transpose().drop_duplicates(ignore_index=False,).transpose()
    elite_onf_df.rename(columns=f_names_t, inplace=True)
    elite_onf_df_test = pd.DataFrame(
        elite_ohe_nominal_features_test, columns=f_names, index=final_pred_data.index
    )
    # Reuse the training column selection. De-duplicating the prediction
    # frame on its own values can keep a different set of columns, leaving
    # train and test with mismatched features.
    elite_onf_df_test = elite_onf_df_test.loc[:, elite_onf_columns].rename(
        columns=f_names_t
    )

# # model = LogisticRegression(random_state=10,max_iter=10000, n_jobs=-1) # scoring='f1_macro', cv=RepeatedStratifiedKFold(n_repeats=5))
# # model = SVC(random_state=10,break_ties=True,)
# model = CategoricalNB(min_categories=2)
# # model= RandomForestClassifier(random_state=20, n_jobs=-1)
# # model = BernoulliNB(binarize=None)
# # model= GaussianProcessClassifier(n_jobs=-1,random_state=10,n_restarts_optimizer=10)
# ovo = OneVsOneClassifier(clone(model), n_jobs=-1)
# ovr = OneVsRestClassifier(clone(model), n_jobs=-1)

# tmp = ovo

# with parallel_backend("loky"):
#     X_train, X_test, y_train, y_test = train_test(elite_onf_df, target, test_size=0.3)
#     y_pred2 = tmp.fit(X_train, y_train).predict(X_test)
#     print(classification_report(y_test, y_pred2))

In [16]:
target = final_data.target

# First bank of supervised encoders (target, weight-of-evidence, James-Stein),
# each applied to the nominal + ordinal columns and wrapped in the project's
# PolynomialWrapper (imported as PWrapper).
# NOTE(review): presumably the wrapper adapts these encoders to the
# multi-class target; confirm in `helpers`.
categorical_target_encoders_1 = FeatureUnion(
    transformer_list=[
        (
            "target_enc",
            PWrapper(
                TargetEncoder(cols=nominal + ordinal, drop_invariant=True, smoothing=0)
            ),
        ),
        ("woe_enc", PWrapper(WOEEncoder(cols=nominal + ordinal, drop_invariant=True))),
        (
            "jame_enc",
            PWrapper(JamesSteinEncoder(cols=nominal + ordinal, drop_invariant=True)),
        ),
    ],
    n_jobs=-1,
)

# Second bank: summary encoder (used unwrapped) and GLMM encoder (wrapped).
categorical_target_encoders_2 = FeatureUnion(
    transformer_list=[
        ("summary_enc", SummaryEncoder(cols=nominal + ordinal, drop_invariant=True)),
        # ('woe_enc', PWrapper(WOEEncoder(cols=nominal+ordinal,drop_invariant=True))),
        # ('backward_diff', BackwardDifferenceEncoder(cols=nominal+ordinal,drop_invariant=True)),
        (
            "glmm_enc",
            PWrapper(GLMMEncoder(cols=nominal + ordinal, drop_invariant=True)),
        ),
    ],
    n_jobs=-1,
)
# feature_selector = RFECV(estimator=LogisticRegression(max_iter=1000000,random_state=10), scoring='f1_macro', cv=RepeatedStratifiedKFold(n_repeats=2), step=6,n_jobs=-1)
# Recursive feature elimination, one feature per step, scored by macro F1
# under a repeated stratified k-fold (2 repeats) with a decision tree.
feature_selector = RFECV(
    estimator=DecisionTreeClassifier(random_state=10),
    scoring="f1_macro",
    cv=RepeatedStratifiedKFold(n_repeats=2),
    step=1,
    n_jobs=-1,
)


# Each encoder bank only sees the nominal + ordinal columns; dense output is
# forced with sparse_threshold=0.
cat_feature_embedding_1 = ColumnTransformer(
    transformers=[
        ("cat_enc_1", categorical_target_encoders_1, nominal + ordinal),
    ],
    sparse_threshold=0,
    n_jobs=-1,
)
cat_feature_embedding_2 = ColumnTransformer(
    transformers=[
        ("cat_enc_2", categorical_target_encoders_2, nominal + ordinal),
    ],
    sparse_threshold=0,
    n_jobs=-1,
)

# feature_selection = FeatureUnion(transformer_list=list(all_selectors.items()),n_jobs=-1)

# f_gen_workflow_1 = make_pipeline(cat_feature_embedding_1,clone(feature_selector))
# f_gen_workflow_2 = make_pipeline(cat_feature_embedding_2,clone(feature_selector))

# Concatenate the outputs of both encoder banks.
cat_features_best = FeatureUnion(
    transformer_list=[
        ("set1_cat_encs", cat_feature_embedding_1),
        ("set2_cat_encs", cat_feature_embedding_2),
    ],
    n_jobs=-1,
)

# Final transformer: encode with both banks, then keep the RFECV-selected columns.
cat_features_la_creme = make_pipeline(cat_features_best, feature_selector)

In [17]:
# Bare expression: renders the pipeline (as a diagram, since the setup cell
# calls set_config(display="diagram")).
cat_features_la_creme

In [18]:
# workflow
"""
TEST
"""
# Hold out 20% of `final_data` (stratified, fixed seed via `train_test`).
target = final_data.target
X_train, X_test, y_train, y_test = train_test(final_data, target, test_size=0.2)

# Pick the matching rows of the pre-computed one-hot features by index label.
# NOTE(review): `elite_onf_df` was produced by fitting the selectors on all of
# `final_data` with its target, so the held-out rows took part in that
# feature selection. Treat scores on this split as optimistic.
ohe_ref_train = X_train.index
ohe_ref_test = X_test.index
ohe_data_train = elite_onf_df.loc[ohe_ref_train,:].to_numpy()
ohe_data_test = elite_onf_df.loc[ohe_ref_test,:].to_numpy()


# The target-encoder pipeline is fitted on the training split only.
with parallel_backend("loky"):
    X_train_enc = cat_features_la_creme.fit_transform(X_train, y_train)
    X_test_enc = cat_features_la_creme.transform(X_test)
# cat_features_la_creme
# Column-wise concatenation: one-hot features first, encoded features second.
X_train_complete = np.c_[ohe_data_train,X_train_enc]
X_test_complete = np.c_[ohe_data_test,X_test_enc]

In [19]:
# Bundle the split in the order (train X, test X, train y, test y) and persist
# it so the tuning cells can reload it without re-running the encoders.
XGBOOST_OPT_TRIAL_DATA = (X_train_complete, X_test_complete, y_train, y_test)
import joblib

joblib.dump(XGBOOST_OPT_TRIAL_DATA, "../data/xgboost_optuna_trial_data/data.pkl")

['../data/xgboost_optuna_trial_data/data.pkl']

In [7]:
import joblib
# Reload the persisted split. joblib.load unpickles the file, so only load
# artifacts produced by this project.
XGBOOST_OPT_TRIAL_DATA = joblib.load("../data/xgboost_optuna_trial_data/data.pkl")


# Stored order: (train X, validation X, train y, validation y).
train_x, valid_x, train_y, valid_y = XGBOOST_OPT_TRIAL_DATA
# NOTE(review): this displays the whole training matrix; `.head()` would be
# enough for a sanity check.
pd.DataFrame(train_x)

In [15]:
import optuna
# List every study stored in the Optuna backend, then load one by name.
summaries = optuna.get_all_study_summaries(storage=OPTUNA_DB)
for s in summaries:
    print(s.study_name)
# `stu` is read by later cells (best_params / best_trial).
stu = optuna.load_study(study_name='XGB.Beta.1', storage=OPTUNA_DB)

Test
XGBRF
XGB.1
XGB.2
XGB.3
XGB.4
XGB.5
XGB.6
XGB.7
XGB.8
XGB.9
XGB.Beta
XGB.Beta.1


In [17]:
# Show the best hyper-parameters recorded in the loaded study `stu`.
stu.best_params

{'tree_method': 'exact',
 'booster': 'gbtree',
 'lambda': 7.201445751243543e-08,
 'alpha': 5.7774970764535644e-05,
 'subsample': 0.9872743878417242,
 'colsample_bytree': 0.8814057813803404,
 'colsample_bylevel': 0.5228488338536805,
 'colsample_bynode': 0.9442981584633299,
 'max_depth': 7,
 'min_child_weight': 7,
 'gamma': 0.005589755633642736,
 'grow_policy': 'depthwise',
 'learning_rate': 0.19721247377460083}

In [15]:
# Base learner for the one-vs-one ensemble. This cell used to build a
# DecisionTree, a RandomForest and an ExtraTrees estimator first, but each was
# overwritten on the next line, so only the XGBoost model was ever fitted.
# estim = MLPClassifier(solver='adam', max_iter=1000,random_state=21)
estim = XGBClassifier(random_state=10)
model = OneVsOneClassifier(estim, n_jobs=-1)

# Fit on the target-encoded features only (without the one-hot block) and
# report per-class metrics on the held-out split.
with parallel_backend("loky"):
    y_pred = model.fit(X_train_enc, y_train).predict(X_test_enc)
    print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.65      0.69       249
           1       0.75      0.85      0.80       373
           2       0.43      0.37      0.40       138

    accuracy                           0.70       760
   macro avg       0.64      0.62      0.63       760
weighted avg       0.69      0.70      0.69       760



In [None]:
# Inspect the booster parameters of a default-constructed XGBRFClassifier.
XGBRFClassifier().get_xgb_params()

In [None]:
"""
Optuna example that optimizes a classifier configuration for cancer dataset
using XGBoost.
In this example, we optimize the validation accuracy of cancer detection
using XGBoost. We optimize both the choice of booster model and its
hyperparameters.
"""

import numpy as np
import optuna
import neptune
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb
# Start a Neptune run for the example study. No api_token is passed here.
# NOTE(review): presumably the token is picked up from the environment — confirm.
run = neptune.init(
    project="mlop3n/SDP",
    custom_run_id="XGBRF",
    mode="async",
)  # your credentials
# NOTE(review): stray expression. The instance is built and discarded, and
# because it is not the last statement of the cell it is not displayed either.
XGBClassifier()

def objective(trial):
    """Optuna objective for the breast-cancer example.

    Loads the scikit-learn breast-cancer data, makes a 75/25 train/validation
    split (unseeded, so it differs between trials), samples an XGBoost
    configuration from ``trial``, trains a booster with ``xgb.train`` and
    returns the validation accuracy.
    """
    (data, target) = sklearn.datasets.load_breast_cancer(return_X_y=True)
    train_x, valid_x, train_y, valid_y = train_test_split(data, target, test_size=0.25)
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dvalid = xgb.DMatrix(valid_x, label=valid_y)

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        # use exact for small dataset.
        # (A stray second colon after this key used to make the whole cell a
        # SyntaxError.)
        "tree_method": trial.suggest_categorical("tree_method", ["exact", "approx", "hist"]),
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    # Tree-specific parameters are only sampled for the tree boosters.
    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    # Dropout-specific parameters.
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    # binary:logistic yields probabilities; round them to 0/1 labels.
    pred_labels = np.rint(preds)
    accuracy = sklearn.metrics.accuracy_score(valid_y, pred_labels)
    return accuracy


# Runs when the cell is executed in a notebook kernel, where __name__ is
# "__main__".
if __name__ == "__main__":
    # Reuse the stored "Test" study if it already exists in the backend.
    study = optuna.create_study(study_name="Test",storage=OPTUNA_DB,direction="maximize",load_if_exists=True)
    # Stop after 5 trials or 600 seconds, whichever comes first.
    study.optimize(objective, n_trials=5, timeout=600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

In [None]:
from pymongo import MongoClient
import optuna
import os

os.environ["NEPTUNE_PROJECT"] = "mlop3n/SDP"
os.environ[
    "NEPTUNE_NOTEBOOK_PATH"
] = "PycharmProjects/sdpiit/notebooks/Pipeline_components_builder.ipynb"
import warnings
from sklearnex import patch_sklearn

patch_sklearn()
import numpy as np
import pandas as pd
from category_encoders import (
    BackwardDifferenceEncoder,
    BaseNEncoder,
    BinaryEncoder,
    CatBoostEncoder,
    CountEncoder,
    GLMMEncoder,
    HelmertEncoder,
    JamesSteinEncoder,
    LeaveOneOutEncoder,
    MEstimateEncoder,
    QuantileEncoder,
    SummaryEncoder,
    TargetEncoder,
    WOEEncoder,
)
from sklearn import set_config
from sklearn.base import clone as model_clone
from sklearn.cluster import *
from sklearn.compose import *
from sklearn.cross_decomposition import *
from sklearn.decomposition import *
from sklearn.ensemble import *
from sklearn.feature_selection import *
from sklearn.gaussian_process import *
from sklearn.linear_model import *
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.multioutput import *
from sklearn.multiclass import *
from sklearn.naive_bayes import *
from sklearn.neighbors import *
from sklearn.neural_network import *
from sklearn.pipeline import *
from sklearn.preprocessing import *
from sklearn.svm import *
from sklearn.tree import *
from sklearn.utils import *
from sklearn.dummy import *
from sklearn.semi_supervised import *
from sklearn.discriminant_analysis import *
import sklearnex, daal4py
import neptune.new.integrations.optuna as optuna_utils
from tqdm import tqdm, trange
from xgboost import XGBClassifier, XGBRFClassifier
from BorutaShap import BorutaShap
import xgboost as xgb
import xgboost
from sklearn.calibration import *
from neptune.new.integrations.xgboost import NeptuneCallback as neptxgb

pd.options.plotting.backend = "plotly"
pd.options.display.max_columns = 50
set_config(display="diagram")
warnings.filterwarnings("ignore")
import pickle
from collections import defaultdict
import neptune.new as neptune
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import parallel_backend
from joblib.memory import Memory

sns.set()
from pprint import pprint
from helpers import PolynomialWrapper as PWrapper
from helpers import NestedCVWrapper as NCVWrapper
from helpers import ColumnSelectors
import sklearn

from helpers import DFCollection
from helpers import plot_mean_std_max
from helpers import CustomMetrics
import gc
import joblib

# %matplotlib inline
# joblib cache handle (a Memory object, not a path string).
CACHE_DIR = Memory(location="../data/joblib_memory/")
# Former PostgreSQL storage URL, kept for reference. The password has been
# redacted: credentials must not be committed, even inside comments.
# OPTUNA_DB = "postgresql+psycopg2://postgres:<password>@localhost:5433/optuna"
from REDIS_CONFIG import REDIS_URL

# NOTE(review): the next two assignments repeat values already set earlier in
# this cell (NEPTUNE_PROJECT near the top, CACHE_DIR just above).
os.environ["NEPTUNE_PROJECT"] = "mlop3n/SDP"
CACHE_DIR = Memory(location="../data/joblib_memory/")
# Storage URL passed to optuna.create_study in `main`.
OPTUNA_DB = REDIS_URL
# Defaults read by `main`: optimisation direction and number of trials.
run_params = {"directions": "maximize", "n_trials": 5}
# The Neptune API token is a secret, so it is read from the NEPTUNE_API_TOKEN
# environment variable instead of being embedded in the notebook.
# NOTE(review): the token that used to be hardcoded here (and in the
# commented-out block below) has been exposed and should be revoked.
run = neptune.init(
    project="mlop3n/SDP",
    api_token=os.environ.get("NEPTUNE_API_TOKEN"),
    custom_run_id="XGB.5G",
    mode="offline",
)
# run2 = neptune.init(
#     project="mlop3n/SDP",
#     api_token=os.environ.get("NEPTUNE_API_TOKEN"),
#     custom_run_id="XGB.5M",
#     mode="offline",
# )


# XGBoost training callback bound to the run above. It is currently commented
# out of the callback list in `objective`.
neptune_xgb = neptxgb(run=run, log_tree=[0, 1, 2, 3])


def allow_stopping(func):
    """Wrap ``func`` so that a ``KeyboardInterrupt`` stops it quietly.

    The returned wrapper forwards all positional and keyword arguments to
    ``func`` and returns its result. If the call is interrupted with
    ``KeyboardInterrupt``, "Program Stopped" is printed and ``None`` is
    returned. ``gc.collect()`` runs after every call, whether it finished,
    was interrupted, or raised another exception.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except KeyboardInterrupt:
            print("Program Stopped")
            return None
        finally:
            # In a `finally` so the collection also happens on the success
            # path, which the early `return` used to skip.
            gc.collect()

    return wrapper


# Project-local helpers (imported from `helpers`): data frames and column groupings.
db = DFCollection()
column_selector = ColumnSelectors()
# classifiers = [f() for f in cls_names]
dtype_info = column_selector.dtype_info
# Column groups by measurement level.
ordinal = column_selector.ordinal_cols
nominal = column_selector.nominal_cols
binary = column_selector.binary_cols
ratio = column_selector.ratio_cols


# Data sets exposed by the collection.
final_data = db.final_data
final_pred_data = db.final_pred_data
baseline_prediction_data = db.baseline_prediction_data
data_logit = db.data_logits
prediction_data = db.prediction_data
master_data = db.master
given_data = db.data

ordinal_data, nominal_data, binary_data, ratio_data = db.categorise_data()
nominal_categories = db.nominal_categories
ordinal_categories = db.ordinal_categories
# The target is reshaped to a single-column 2-D array for `class_distribution`.
class_labels, n_classes, class_priors = class_distribution(
    final_data.target.to_numpy().reshape(-1, 1)
)
# Pre-computed (train X, validation X, train y, validation y) tuple.
# joblib.load unpickles the file, so only load artifacts produced by this project.
XGBOOST_OPT_TRIAL_DATA = joblib.load("../data/xgboost_optuna_trial_data/data.pkl")


def objective(trial: optuna.trial.Trial, data=XGBOOST_OPT_TRIAL_DATA):
    """Optuna objective: tune a 3-class XGBoost booster, return validation macro F1.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    data : tuple
        ``(train_x, valid_x, train_y, valid_y)``. Defaults to the tuple
        loaded from disk when the cell was run.

    Returns
    -------
    float
        Macro-averaged F1 on the validation split, predicted with the trees
        up to and including the best early-stopping iteration.

    Side effects: overwrites the ``f1_score_test`` and ``overfitting`` fields
    of the module-level Neptune ``run``.
    """
    # Use the argument: the module-level tuple used to be unpacked here, so a
    # caller-supplied `data` was silently ignored.
    train_x, valid_x, train_y, valid_y = data
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dvalid = xgb.DMatrix(valid_x, label=valid_y)

    def gen_learning_rate(epoch):
        # Called by the LearningRateScheduler callback every boosting round;
        # `epoch` is not used, the rate comes from the trial.
        return trial.suggest_float("learning_rate", 0, 1)

    param = {
        "verbosity": 0,
        "objective": "multi:softmax",
        "num_class": 3,
        # use exact for small dataset.
        "tree_method": trial.suggest_categorical(
            "tree_method", ["exact", "approx", "hist"]
        ),
        # "updater": trial.suggest_categorical("updater",['grow_colmaker', 'grow_histmaker', 'grow_local_histmaker', 'grow_quantile_histmaker']),
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "sampling_method": "uniform",
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.2, 1.0),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.2, 1.0),
        "num_parallel_tree": trial.suggest_int("num_parallel_tree", 1, 10),
    }
    # max_bin is only sampled for the non-exact tree methods.
    if param["tree_method"] != "exact":
        param["max_bin"] = trial.suggest_int("max_bin", 256, 4096)

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        # param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical(
            "grow_policy", ["depthwise", "lossguide"]
        )

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical(
            "sample_type", ["uniform", "weighted"]
        )
        param["normalize_type"] = trial.suggest_categorical(
            "normalize_type", ["tree", "forest"]
        )
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # Lets Optuna's pruner stop unpromising trials based on validation log-loss.
    pruning_callback = optuna.integration.XGBoostPruningCallback(
        trial, "validation-mlogloss"
    )
    bst = xgb.train(
        param,
        dtrain,
        num_boost_round=999,
        evals=[(dvalid, "validation")],
        callbacks=[
            # neptune_xgb,
            pruning_callback,
            xgboost.callback.LearningRateScheduler(gen_learning_rate),
            xgboost.callback.EarlyStopping(
                rounds=5,
                min_delta=1e-5,
                save_best=True,
                maximize=False,
                data_name="validation",
                metric_name="mlogloss",
            ),
        ],
    )
    # preds = bst.predict(dvalid)
    # pred_labels = np.rint(preds)
    # Predict with the trees up to and including the best iteration only.
    ypred = bst.predict(dvalid, iteration_range=(0, bst.best_iteration + 1))
    ypred2 = bst.predict(dtrain, iteration_range=(0, bst.best_iteration + 1))
    f1_score_test = sklearn.metrics.f1_score(valid_y, ypred, average="macro")
    f1_score_train = sklearn.metrics.f1_score(train_y, ypred2, average="macro")
    # return f1_score_test, f1_score_train-f1_score_test
    # Train-minus-validation gap is logged as a rough overfitting indicator.
    run["f1_score_test"] = f1_score_test
    run["overfitting"] = f1_score_train - f1_score_test
    return f1_score_test


def main(
    params=run_params,
):
    """Create or reload the "XGB.9" Optuna study and run the optimisation.

    ``params`` must provide ``"directions"`` (passed as the study direction)
    and ``"n_trials"``. Trials are reported to the module-level Neptune
    ``run`` through the Optuna integration callback.
    """
    # `run` is only read here, so no `global` declaration is required.
    trial_logger = optuna_utils.NeptuneCallback(run)
    tpe_sampler = optuna.samplers.TPESampler(warn_independent_sampling=False)
    median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
    xgb_study = optuna.create_study(
        study_name="XGB.9",
        sampler=tpe_sampler,
        pruner=median_pruner,
        storage=OPTUNA_DB,
        direction=params["directions"],
        load_if_exists=True,
    )
    with parallel_backend("loky"):
        xgb_study.optimize(
            objective,
            n_trials=params["n_trials"],
            n_jobs=2,
            callbacks=[trial_logger],
            gc_after_trial=True,
            show_progress_bar=True,
        )

# updater_types = ['grow_colmaker', 'grow_histmaker', 'grow_local_histmaker', 'grow_quantile_histmaker','grow_gpu_hist', 'sync', 'refresh', 'prune']
# Runs the optimisation when the cell (or an exported script) is executed directly.
if __name__ == "__main__":
    main()
    # pass

In [61]:
# Best hyper-parameters reported by `stu.best_params`, plus the fixed training
# settings. The literal used to repeat "verbosity", "objective", "num_class"
# and 'nthreads' (the last value silently won), and 'nthreads' is not an
# XGBoost parameter. The scikit-learn wrapper's thread setting is `n_jobs`.
params = {
    "tree_method": "exact",
    "booster": "gbtree",
    "lambda": 7.201445751243543e-08,
    "alpha": 5.7774970764535644e-05,
    "subsample": 0.9872743878417242,
    "colsample_bytree": 0.8814057813803404,
    "colsample_bylevel": 0.5228488338536805,
    "colsample_bynode": 0.9442981584633299,
    "max_depth": 7,
    "min_child_weight": 7,
    "gamma": 0.005589755633642736,
    "grow_policy": "depthwise",
    "learning_rate": 0.19721247377460083,
    "verbosity": 0,
    "objective": "multi:softmax",
    "num_class": 3,
    "n_jobs": -1,
}

clf = XGBClassifier(n_estimators=15, **params)
clf

In [62]:
import joblib
import xgboost
# Reload the persisted split. joblib.load unpickles the file, so only load
# artifacts produced by this project.
XGBOOST_OPT_TRIAL_DATA = joblib.load("../data/xgboost_optuna_trial_data/data.pkl")


# Stored order: (train X, validation X, train y, validation y).
train_x, valid_x, train_y, valid_y = XGBOOST_OPT_TRIAL_DATA
# Fit with early stopping on the validation log-loss: stop after 5 rounds
# without an improvement of at least 1e-9.
clf.fit(
    train_x,
    train_y,
    eval_set=[(valid_x,valid_y)],
    callbacks=[xgboost.callback.EarlyStopping(
        rounds=5,
        min_delta=1e-9,
        save_best=True,
        maximize=False,
        # data_name="validation",
        metric_name="mlogloss",
    )]
)

[0]	validation_0-mlogloss:0.98742
[1]	validation_0-mlogloss:0.91364
[2]	validation_0-mlogloss:0.85912
[3]	validation_0-mlogloss:0.81945
[4]	validation_0-mlogloss:0.78792
[5]	validation_0-mlogloss:0.76414
[6]	validation_0-mlogloss:0.74485
[7]	validation_0-mlogloss:0.73175
[8]	validation_0-mlogloss:0.72074
[9]	validation_0-mlogloss:0.71504
[10]	validation_0-mlogloss:0.70742
[11]	validation_0-mlogloss:0.70399
[12]	validation_0-mlogloss:0.69932
[13]	validation_0-mlogloss:0.69663
[14]	validation_0-mlogloss:0.69788


In [65]:
# Persist the fitted classifier under ../models as a .json file.
clf.save_model('../models/xgb_clf_ohe_cat.json')

In [64]:
# Macro F1 on the validation split. scikit-learn's signature is
# f1_score(y_true, y_pred), so the ground truth goes first.
f1_score(valid_y, clf.predict(valid_x), average="macro")

0.6440068057200946

In [59]:
# NOTE(review): bare attribute access. This shows the bound method and does
# not call `apply`; the recorded output below looks stale.
clf.apply

80

In [28]:
# NOTE(review): the recorded output (100) does not match the n_estimators=15
# classifier built above; it looks like it came from an earlier `clf`.
clf.get_num_boosting_rounds()

100

In [31]:
# Show the full record of the best trial in the loaded study `stu`.
stu.best_trial

FrozenTrial(number=98, values=[0.6692439509938954], datetime_start=datetime.datetime(2022, 7, 23, 13, 26, 58, 3812), datetime_complete=datetime.datetime(2022, 7, 23, 13, 27, 12, 284443), params={'tree_method': 'exact', 'booster': 'gbtree', 'lambda': 7.201445751243543e-08, 'alpha': 5.7774970764535644e-05, 'subsample': 0.9872743878417242, 'colsample_bytree': 0.8814057813803404, 'colsample_bylevel': 0.5228488338536805, 'colsample_bynode': 0.9442981584633299, 'max_depth': 7, 'min_child_weight': 7, 'gamma': 0.005589755633642736, 'grow_policy': 'depthwise', 'learning_rate': 0.19721247377460083}, distributions={'tree_method': CategoricalDistribution(choices=('exact', 'approx', 'hist')), 'booster': CategoricalDistribution(choices=('gbtree', 'dart')), 'lambda': FloatDistribution(high=1.0, log=True, low=1e-08, step=None), 'alpha': FloatDistribution(high=1.0, log=True, low=1e-08, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribu

In [6]:
# Close the Neptune run created in the tuning cell.
run.stop()
# run2.stop()

#### ROC AUC curve for multiclass
Adapted from the scikit-learn example [Multiclass Receiver Operating Characteristic (ROC)](https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html).

In [None]:
# NOTE(review): `workflow` is not defined in any cell above this one. The only
# visible definition is in the ROC-plot cell further down, so this cell fails
# on a fresh top-to-bottom run.
X_train, X_test, y_train, y_test = train_test(final_data, target, test_size=0.3)
with parallel_backend("loky"):
    y_pred = workflow.fit(X_train, y_train).predict(X_test)
    print(classification_report(y_test, y_pred))
# Class probabilities are needed for the multiclass ROC AUC below.
with parallel_backend("loky"):
    y_prob = workflow.predict_proba(X_test)

# ROC AUC under both multiclass schemes (one-vs-one and one-vs-rest), each
# with macro and prevalence-weighted averaging.
macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo", average="macro")
weighted_roc_auc_ovo = roc_auc_score(
    y_test, y_prob, multi_class="ovo", average="weighted"
)
macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr", average="macro")
weighted_roc_auc_ovr = roc_auc_score(
    y_test, y_prob, multi_class="ovr", average="weighted"
)
print(
    "One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
    "(weighted by prevalence)".format(macro_roc_auc_ovo, weighted_roc_auc_ovo)
)
print(
    "One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
    "(weighted by prevalence)".format(macro_roc_auc_ovr, weighted_roc_auc_ovr)
)

#### End

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score

# Binarize the output
# Overrides the `n_classes` value computed from the data in the setup cell.
n_classes = 3
model = AdaBoostClassifier(
    base_estimator=RandomForestClassifier(n_jobs=-1), n_estimators=30, random_state=10
)
# workflow = make_pipeline(cat_feature_embedding,StandardScaler(), model)

# shuffle and split training and test sets
## Do all Transformations Ahead then Apply ML Model
X_train, X_test, y_train, y_test = train_test(final_data, target, test_size=0.3)
# NOTE(review): `cat_feature_embedding` is not defined in any visible cell
# (only `cat_feature_embedding_1` / `_2` and `cat_features_la_creme` are).
# Confirm which transformer is meant.
X_train_enc = cat_feature_embedding.fit_transform(X_train, y_train)
X_test_enc = cat_feature_embedding.transform(X_test)
# One indicator column per class, as needed for the per-class ROC curves.
y_train = label_binarize(y_train, classes=[0, 1, 2])
y_test = label_binarize(y_test, classes=[0, 1, 2])
workflow = make_pipeline(StandardScaler(), OneVsRestClassifier(model, n_jobs=-1))

# Learn to predict each class against the other
# classifier = OneVsRestClassifier(
#     svm.SVC(kernel="linear", probability=True, random_state=random_state)
# )
with parallel_backend("loky"):
    y_score = workflow.fit(X_train_enc, y_train).decision_function(X_test_enc)

# Compute ROC curve and ROC area for each class
# The dicts are keyed by class index, plus "micro" and "macro" for the averages.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
# (all class indicator columns are flattened into one binary problem)
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])


# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate all ROC curves at these points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
# Line width for the per-class curves and the chance diagonal. `lw` was used
# below but never defined, so the cell raised NameError. The scikit-learn
# example this cell is adapted from uses 2.
lw = 2

plt.figure()
plt.plot(
    fpr["micro"],
    tpr["micro"],
    label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
    color="deeppink",
    linestyle=":",
    linewidth=4,
)

plt.plot(
    fpr["macro"],
    tpr["macro"],
    label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
    color="navy",
    linestyle=":",
    linewidth=4,
)

# One curve per class, cycling through the three colours.
colors = cycle(["aqua", "darkorange", "cornflowerblue"])
for i, color in zip(range(n_classes), colors):
    plt.plot(
        fpr[i],
        tpr[i],
        color=color,
        lw=lw,
        label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc[i]),
    )

# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], "k--", lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Some extension of Receiver operating characteristic to multiclass")
plt.legend(loc="lower right")
plt.show()

In [None]:
# ct = ColumnTransformer(transformers=[('target',
#                                   PWrapper(JamesSteinEncoder(cols=nominal+ordinal,drop_invariant=True)),
#                                   # JamesSteinEncoder(cols=nominal+ordinal,drop_invariant=True),
#                                       nominal+ordinal
#                                      )],
#                        sparse_threshold=0,
#                        n_jobs=-1)


# # model = LogisticRegression(max_iter=100000,n_jobs=-1,random_state=0)
# model = AdaBoostClassifier(base_estimator=RandomForestClassifier(random_state=10, n_jobs=-1), random_state=10)
# # model = XGBRFClassifier(random_state=0)
# # model = RandomForestClassifier(random_state=0)
# # model = MLPClassifier(solver='adam',activation='tanh')
# # model.pr
# ovo = OneVsOneClassifier(clone(model), n_jobs=-1)
# ovr = OneVsRestClassifier(clone(model), n_jobs=-1)
# categorical_target_encoders
# # workflow = make_pipeline(cat_feature_embedding,StandardScaler(), model)

# workflow = make_pipeline(cat_feature_embedding,StandardScaler(), model)
# # X_train_enc = cat_feature_embedding.fit_transform(X_train,y_train)
# # X_test_enc = cat_feature_embedding.transform(X_test)
# with parallel_backend('loky'):
#     y_pred = workflow.fit(X_train,y_train).predict(X_test)
#     print(classification_report(y_test,y_pred))