In [None]:
from pymongo import MongoClient
import os
os.environ['NEPTUNE_PROJECT']="mlop3n/SDP"
os.environ['NEPTUNE_NOTEBOOK_PATH']="PycharmProjects/sdpiit/notebooks/Pipeline_components_builder.ipynb"
import warnings
from sklearnex import patch_sklearn

patch_sklearn()
import numpy as np
import pandas as pd
from category_encoders import (
    BackwardDifferenceEncoder,
    BaseNEncoder,
    BinaryEncoder,
    CatBoostEncoder,
    CountEncoder,
    GLMMEncoder,
    HelmertEncoder,
    JamesSteinEncoder,
    LeaveOneOutEncoder,
    MEstimateEncoder,
    QuantileEncoder,
    SummaryEncoder,
    TargetEncoder,
    WOEEncoder,
)
from sklearn import set_config
from sklearn.base import clone as model_clone
from sklearn.cluster import *
from sklearn.compose import *
from sklearn.cross_decomposition import *
from sklearn.decomposition import *
from sklearn.ensemble import *
from sklearn.feature_selection import *
from sklearn.gaussian_process import *
from sklearn.linear_model import *
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.multioutput import *
from sklearn.multiclass import *
from sklearn.naive_bayes import *
from sklearn.neighbors import *
from sklearn.neural_network import *
from sklearn.pipeline import *
from sklearn.preprocessing import *
from sklearn.svm import *
from sklearn.tree import *
from sklearn.utils import *
from sklearn.dummy import *
from sklearn.semi_supervised import *
from sklearn.discriminant_analysis import *
import sklearnex, daal4py

from tqdm import tqdm, trange
from xgboost import XGBClassifier, XGBRFClassifier
from BorutaShap import BorutaShap

from sklearn.calibration import *
pd.options.plotting.backend = "plotly"
pd.options.display.max_columns = 50
set_config(display="diagram")
warnings.filterwarnings("ignore")
import pickle
from collections import defaultdict

import matplotlib.pyplot as plt
import seaborn as sns
from joblib import parallel_backend
from joblib.memory import Memory

sns.set()
from pprint import pprint
from helpers import PolynomialWrapper as PWrapper
from helpers import NestedCVWrapper as NCVWrapper
from helpers import ColumnSelectors
import sklearn
from helpers import DFCollection
from helpers import plot_mean_std_max
from helpers import CustomMetrics

CACHE_DIR = Memory(location='../data/joblib_memory/')
# Optuna storage URL.  It can be overridden through the OPTUNA_DB environment
# variable so that credentials do not have to live in the notebook; the
# literal is kept only as a backward-compatible local default.
OPTUNA_DB = os.environ.get(
    "OPTUNA_DB",
    "postgresql+psycopg2://postgres:302492@localhost:5433/optuna",
)

In [None]:
# Project data accessors from the local `helpers` module.
db = DFCollection()
column_selector = ColumnSelectors()
# classifiers = [f() for f in cls_names]
# Column-name groups exposed by ColumnSelectors (later cells concatenate
# them with `+`, so they are presumably lists — verify in helpers).
dtype_info = column_selector.dtype_info
ordinal = column_selector.ordinal_cols
nominal = column_selector.nominal_cols
binary = column_selector.binary_cols
ratio = column_selector.ratio_cols


# Dataset views exposed by DFCollection, bound to notebook-level names.
final_data = db.final_data
final_pred_data = db.final_pred_data
baseline_prediction_data = db.baseline_prediction_data
data_logit = db.data_logits
prediction_data = db.prediction_data
master_data = db.master
given_data = db.data

# Per-group data and category listings as returned by DFCollection.
ordinal_data, nominal_data, binary_data, ratio_data = db.categorise_data()
nominal_categories = db.nominal_categories
ordinal_categories = db.ordinal_categories
# Class labels, class count and class priors of the target, passed as a
# single-column 2-D array.
# NOTE(review): if this is scikit-learn's `class_distribution`, each returned
# value is a list with one entry per output column — confirm before using
# `class_priors` directly as an estimator's `priors`.
class_labels, n_classes, class_priors = class_distribution(final_data.target.to_numpy().reshape(-1,1))

def gen_balanced_trained_test(data, p):
    """Draw class-balanced train/test index samples for labels 0, 1 and 2.

    Each class contributes ``int(p * n2)`` training indices and
    ``int((1 - p) * n2)`` test indices, where ``n2`` is the number of rows
    labelled 2.  Indices are drawn with ``np.random.choice`` using its
    default settings (sampling with replacement), so an index may appear
    more than once.  Test indices are drawn only from rows that were not
    selected for training.

    :param data: object exposing a ``target`` Series holding labels 0, 1, 2
    :param p: fraction of the class-2 row count used as per-class train size
    :return: ``(train_idx, test_idx)`` arrays, each ordered class 0, 1, 2
    """
    labels = data.target
    # Pools are visited in the order 2, 1, 0 so that the global NumPy random
    # state is consumed in a fixed sequence.
    pools = {cls: labels[labels == cls].index for cls in (2, 1, 0)}
    reference_count = pools[2].shape[0]
    train_size = int(p * reference_count)
    test_size = int((1 - p) * reference_count)

    train_pick = {
        cls: np.random.choice(pool, (train_size,)) for cls, pool in pools.items()
    }
    test_pick = {
        cls: np.random.choice(np.setdiff1d(pool, train_pick[cls]), (test_size,))
        for cls, pool in pools.items()
    }

    output_order = (0, 1, 2)
    train_idx = np.concatenate([train_pick[cls] for cls in output_order])
    test_idx = np.concatenate([test_pick[cls] for cls in output_order])
    return train_idx, test_idx


def gen_nominal_maps(bs: pd.DataFrame = master_data) -> tuple[defaultdict, defaultdict]:
    """Build integer code maps for every column listed in ``nominal``.

    Two maps are produced per column from its sorted unique values:

    * an individual map, where codes restart at 0 for each column;
    * a continuous map, where codes keep counting across columns, so every
      (column, value) pair gets a distinct integer.

    :param bs: frame containing the nominal columns
    :return: ``(nominal_indvl_map, nominal_cont_map)``
    """
    nominal_frame = bs.loc[:, nominal]
    per_column_codes = defaultdict(dict)
    running_codes = defaultdict(dict)
    offset = 0
    for col in nominal:
        levels = sorted(nominal_frame[col].unique().tolist())
        per_column_codes[col] = {level: code for code, level in enumerate(levels)}
        running_codes[col] = {
            level: code for code, level in enumerate(levels, start=offset)
        }
        # The next column's codes continue where this one stopped.
        offset += len(levels)
    return per_column_codes, running_codes


# nominal_indvl_map, nominal_cont_map = gen_nominal_maps()
# nominal_master_db = bs.loc[:, nominal]

# nominal_master_db_indvl = nominal_master_db.copy()
# nominal_master_db_cont = nominal_master_db.copy()


# nominal_indvl_map
def nm_indvl_data_trnsform(row):
    """Replace each nominal value in ``row`` by its per-column integer code.

    Reads the notebook-level ``nominal`` column list and ``nominal_indvl_map``
    lookup; ``row`` is modified in place and also returned.
    """
    for col in nominal:
        row[col] = nominal_indvl_map[col][row[col]]
    return row


# test1_nominal = nominal_master_db_indvl.apply(nm_indvl_data_trnsform, axis=1)


def nm_cont_data_trnsform(row):
    """Replace each nominal value in ``row`` by its cross-column integer code.

    Reads the notebook-level ``nominal`` column list and ``nominal_cont_map``
    lookup; ``row`` is modified in place and also returned.
    """
    for col in nominal:
        row[col] = nominal_cont_map[col][row[col]]
    return row


# test2_nominal = nominal_master_db_cont.apply(nm_cont_data_trnsform, axis=1)
# prediction_data = pd.read_pickle("../data/pred_data.pkl")
# est_ = [("cnb",CategoricalNB()),]


def wf_create(cat_encoder=TargetEncoder, model=None):
    """
    Build a pipeline made of a polynomial-wrapped categorical encoder,
    optionally followed by a model.

    :param cat_encoder: category_encoders encoder, given either as a class
        (instantiated with its default arguments) or as a ready-made instance
    :param model: scikit-learn Model appended as the last step; when None the
        pipeline contains only the encoder step
    :return pipe: sklearn.pipeline.Pipeline
    Examples of model param:

    model = ComplementNB(norm=True,fit_prior=True,)
    model = MultinomialNB()
    model = LogisticRegression(n_jobs=-1, max_iter=10000,random_state=19)
    """
    # The default argument is a class; instantiate it so the wrapper receives
    # an encoder object and the step is named after the encoder, not "type".
    encoder = cat_encoder() if isinstance(cat_encoder, type) else cat_encoder
    encoder__name = type(encoder).__name__
    # The wrapper class is imported in this notebook under the alias PWrapper.
    _steps = [("PW" + encoder__name, PWrapper(feature_encoder=encoder))]
    if model is not None:
        _steps.append((type(model).__name__, model))
    pipe = Pipeline(steps=_steps)
    return pipe

In [None]:
# One default-constructed instance of every candidate classifier
# (scikit-learn estimators plus the two XGBoost classifiers).
classifiers = [
    AdaBoostClassifier(),
    BaggingClassifier(),
    BernoulliNB(),
    CalibratedClassifierCV(),
    CategoricalNB(),
    ComplementNB(),
    DecisionTreeClassifier(),
    DummyClassifier(),
    ExtraTreeClassifier(),
    ExtraTreesClassifier(),
    GaussianNB(),
    GaussianProcessClassifier(),
    GradientBoostingClassifier(),
    HistGradientBoostingClassifier(),
    KNeighborsClassifier(),
    LabelPropagation(),
    LabelSpreading(),
    LinearDiscriminantAnalysis(),
    LinearSVC(),
    LogisticRegression(),
    LogisticRegressionCV(),
    MLPClassifier(),
    MultinomialNB(),
    NearestCentroid(),
    NuSVC(),
    XGBClassifier(),
    XGBRFClassifier(),
    PassiveAggressiveClassifier(),
    Perceptron(),
    QuadraticDiscriminantAnalysis(),
    RadiusNeighborsClassifier(),
    RandomForestClassifier(),
    RidgeClassifier(),
    RidgeClassifierCV(),
    SGDClassifier(),
    SVC(),
]


# combiners = [sklearn.multioutput.ClassifierChain,
#  sklearn.multioutput.MultiOutputClassifier,
#  sklearn.multiclass.OneVsOneClassifier,
#  sklearn.multiclass.OneVsRestClassifier,
#  sklearn.multiclass.OutputCodeClassifier,
#  sklearn.ensemble._stacking.StackingClassifier,
#  sklearn.ensemble._voting.VotingClassifier
# ]

sns.set()


```python
Template
X = final_data.loc[:,categories]
y = final_data.target.to_numpy().reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=10, test_size=0.3)
```

In [None]:
# ohe = OneHotEncoder(min_frequency=0.0001, handle_unknown='infrequent_if_exist', sparse=False,dtype=np.int32)
# X_train_t = ohe.fit_transform(nominal_data)
# Classification reports (as dicts) are appended here, one per feature group.
reports = []
# Three stacked axes: one per feature group plus a shared one (index 2).
fig, ax = plt.subplots(3,1,figsize=(10,10))

def analyze_model(ax=ax, i=0, X=None, y=None, pipe=None, feature_names=None, cv=None):
    """Cross-validate ``pipe`` and plot its test scores.

    The scores are drawn on ``ax[i]`` and again on the shared ``ax[2]``.  The
    curve label is the upper-cased prefix of the first feature name (the text
    before the first ``"__"``).

    :param ax: indexable collection of matplotlib axes (at least 3 entries)
    :param i: index of the axis dedicated to this feature group
    :param X: feature matrix passed to ``cross_validate``
    :param y: target passed to ``cross_validate``
    :param pipe: estimator/pipeline to evaluate
    :param feature_names: non-empty sequence of feature names
    :param cv: cross-validation splitter; when None the notebook-level
        ``cv_`` is used
    :raises ValueError: if ``X``, ``y``, ``pipe`` or ``feature_names`` is None
    """
    # These used to default to notebook globals that do not exist yet when
    # the function is defined; they must now be passed explicitly.
    required = (("X", X), ("y", y), ("pipe", pipe), ("feature_names", feature_names))
    missing = [name for name, value in required if value is None]
    if missing:
        raise ValueError(f"analyze_model is missing arguments: {', '.join(missing)}")
    if cv is None:
        cv = cv_

    with parallel_backend('multiprocessing'):
        cv_model = cross_validate(pipe, X, y, cv=cv, return_train_score=True, n_jobs=-1)

    test_scores = cv_model['test_score']
    # Size the x axis from the scores so any splits/repeats combination works.
    fold_ids = np.arange(len(test_scores))
    label = f'{feature_names[0].split("__")[0].upper()}  Data'
    for axis in (ax[i], ax[2]):
        axis.plot(fold_ids, test_scores, label=label)
        axis.legend()

# Evaluate a WOE-encoded QDA pipeline separately on the nominal and on the
# ordinal columns; `i` selects the subplot used for each group.
i = 0
for categories in [nominal, ordinal]:
    X = final_data.loc[:,categories]
    y = final_data.target.to_numpy().reshape(-1,1)
    X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=10, test_size=0.3)
    feature_names = categories
    model = QuadraticDiscriminantAnalysis(priors=class_priors,store_covariance=True,reg_param=0.0001 )
    # 3 splits x 5 repeats = 15 cross-validation scores per group.
    cv_= RepeatedStratifiedKFold(n_splits=3,n_repeats=5, random_state=10)
    pipe =  Pipeline(steps=[('polynomialwrapper',
                 PWrapper(feature_encoder=WOEEncoder())),
                (model.__class__.__name__,
                 QuadraticDiscriminantAnalysis(priors=class_priors,
                                               reg_param=0.0001,
                                               store_covariance=True))], memory=CACHE_DIR)
    
    analyze_model(ax=ax,i = i,X=X,y=y, pipe=pipe, feature_names=categories)
    # NOTE(review): the report below is produced by the bare `model` fitted on
    # the un-encoded X_train, not by `pipe` (encoder + QDA) that was
    # cross-validated above — confirm this is intended.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print(classification_report(y_test,y_pred, output_dict=False))
    c_report = classification_report(y_test,y_pred, output_dict=True)
    reports.append(c_report)
    # break
    i+=1

In [None]:
# Keep the columns of data_logit that are neither binary, nominal, ratio nor
# the 'label' column.
ordinal_proc_data = data_logit.drop(binary+nominal+ratio+['label'], axis=1)

# reports
# Kendall correlation of every remaining column with the target, plotted in
# column-name order.
ordinal_proc_data.corrwith(final_data.target,method='kendall').sort_index().plot()

# feature_correls = ordinal_proc_data.corrwith(final_data.target, method='kendall').sort_values(ascending=False).plot()
# NOTE(review): assignment through .loc[:, cols] may keep the existing column
# dtypes in some pandas versions — confirm the columns really become
# categorical.
ordinal_proc_data.loc[:,ordinal] = ordinal_proc_data.loc[:,ordinal].astype('category')

In [None]:
# ordinal_data
# ordinal_data
cols = ordinal
# Kendall correlations per encoder, and the encoded frames keyed by encoder
# class name; both are filled in the same order by the loop below.
reports = []
transformed_f = {}
from category_encoders import OneHotEncoder as OHE
category_encoders = [TargetEncoder(cols = ordinal),
                     WOEEncoder(cols = ordinal),
                     JamesSteinEncoder(cols = ordinal),
                     HelmertEncoder(cols = ordinal),
                     # category_encoders(handle_unknown='infrequent_if_exist',min_frequency=0.0001,sparse=False,drop='first'),
                     SummaryEncoder(cols = ordinal),
                     LeaveOneOutEncoder(cols = ordinal),
                     BackwardDifferenceEncoder(cols = ordinal)]
for ce in category_encoders:
    # First try the encoder inside the polynomial wrapper; if that fails,
    # fall back to the bare encoder; if that fails too, skip the encoder.
    # `except Exception` (not a bare except) keeps Ctrl-C working, and the
    # error is printed so a failure can be diagnosed.
    encoder = PWrapper(ce)
    try:
        f_tra = encoder.fit_transform(ordinal_proc_data, final_data.target)
    except Exception as wrapped_error:
        print(f'{ce} failed')
        print(f'  reason: {wrapped_error!r}')
        print('Trying to run Solo')
        try:
            f_tra = ce.fit_transform(ordinal_proc_data, final_data.target)
        except Exception as solo_error:
            print("Even Solo Failed !!! SKIPPING!!!!!")
            print(f'  reason: {solo_error!r}')
            continue
    fcorr = f_tra.corrwith(final_data.target, method='kendall').sort_values(ascending=False)
    reports.append(fcorr)
    transformed_f[ce.__class__.__name__] = f_tra


In [None]:
# Minimum absolute Kendall correlation for a feature to be listed.
threshold = 0.25
# `reports` and `transformed_f` are filled together, so pairing them by
# position yields (encoder name, correlations).  Iterating over what exists
# avoids an IndexError when an encoder was skipped.
for encoder_name, p_l in zip(transformed_f, reports):
    strong = p_l[p_l.abs() > threshold]
    print(encoder_name)
    print(strong.index)
    print(strong.shape)
    print(strong.mean())
    print(strong)
    print()
    print('-'*40)
# plt.hlines(0,)

In [None]:
def chosen_metrics(y_pred,y_test,chosen_encoder=None,chosen_pipe=None):
    """Print a block of classification metrics for one encoder.

    Each section is a 90-dash rule, a title, a 30-dash rule and the value;
    a final 90-dash rule closes the output.  The first section is titled
    with ``chosen_encoder`` and shows the full classification report.

    :param y_pred: predicted labels
    :param y_test: reference labels
    :param chosen_encoder: title printed above the classification report
    :param chosen_pipe: accepted but not used by this function
    """
    wide_rule = '-'*90
    narrow_rule = '-'*30

    def _scaled_accuracy():
        # NOTE(review): the printed accuracy is multiplied by 0.85 — confirm
        # the reason for this scaling.
        return sklearn.metrics.accuracy_score(y_test, y_pred) * 0.85

    # (title, lazily evaluated value) in print order; values are computed
    # only when their section is reached.
    sections = (
        (chosen_encoder, lambda: classification_report(y_test, y_pred)),
        ("cohen_kappa_score\t|", lambda: sklearn.metrics.cohen_kappa_score(y_pred, y_test)),
        ("balanced_accuracy_score\t|", lambda: sklearn.metrics.balanced_accuracy_score(y_test, y_pred)),
        ("accuracy_score\t|", _scaled_accuracy),
        ("f1_score_micro\t|", lambda: sklearn.metrics.f1_score(y_test, y_pred, average="micro")),
        ("f1_score_macro\t|", lambda: sklearn.metrics.f1_score(y_test, y_pred, average="macro")),
        ("f1_score_weighted\t|", lambda: sklearn.metrics.f1_score(y_test, y_pred, average="weighted")),
    )
    for title, compute in sections:
        print(wide_rule)
        print(title)
        print(narrow_rule)
        print(compute())
    print(wide_rule)

    
# Encoders compared in this experiment; some are wrapped in PWrapper, the
# others are used directly.
category_encoders = [
    PWrapper(TargetEncoder(cols = ordinal,smoothing=0)),
    PWrapper(WOEEncoder(cols = ordinal,regularization=0.00000001)),
    PWrapper(JamesSteinEncoder(cols = ordinal,model='pooled')),
    HelmertEncoder(cols = ordinal),
    # OrdinalEncoder(),
    OHE(cols=ordinal,handle_missing='indicator',handle_unknown='indicator',),
    SummaryEncoder(cols = ordinal,quantiles=np.linspace(0.01,1,num=20),m=0),
    PWrapper(LeaveOneOutEncoder(cols = ordinal)),
    BackwardDifferenceEncoder(cols = ordinal)
]

# Loading Data
# Load and split BEFORE anything is derived from X.  Previously the column
# list and the train/test split came from an X left over from an earlier
# cell, and the freshly loaded X/y were never used.
X = data_logit.drop(binary+nominal+ratio+['label'], axis=1)
y = final_data.target.to_numpy().reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, test_size=0.3)

# Numeric Columns Isolator
# Every column of X that is not ordinal, in X's own column order (a plain
# set difference would give a nondeterministic order).
# NOTE(review): confirm X holds no target-derived column, otherwise it would
# be scaled and fed to the model as a feature.
_ordinal_lookup = set(ordinal)
numeric_prob = [col for col in X.columns if col not in _ordinal_lookup]

# One ColumnTransformer per encoder: encode the ordinal columns, scale the
# numeric ones, drop everything else.
cat_ct = []
for encoder in category_encoders:
    c_ct = ColumnTransformer([
        # ('cat_encoder', PWrapper(encoder),ordinal),
        ('cat_encoder', encoder,ordinal),
        ('numeric',MaxAbsScaler(),numeric_prob)
    ], remainder='drop',n_jobs=-1,sparse_threshold=0,)
    cat_ct.append(c_ct)


# model = LogisticRegressionCV(n_jobs=-1,fit_intercept=False,max_iter=10000000,random_state=0,scoring='f1_macro')
# model = XGBRFClassifier(n_jobs=-1,max_bin=256,verbosity=0,tree_method='exact',)

# NOTE(review): `model` is whatever an earlier cell left bound, because both
# assignments above are commented out.
ovo_model = OneVsOneClassifier(estimator=model, n_jobs=-1)
ovr_model = OneVsRestClassifier(estimator=model, n_jobs=-1)
# c_ct
final_model = ovo_model
try:
    with parallel_backend('loky'):
        for i, ct in tqdm(enumerate(cat_ct), total=len(cat_ct)):
            curr_flow = Pipeline([("Column_transformer",ct),
                            ("model",final_model)], memory=CACHE_DIR)
            y_pred = curr_flow.fit(X_train, y_train).predict(X_test)
            # Keyword arguments: chosen_metrics takes (y_pred, y_test) and the
            # two were previously passed in swapped order.
            chosen_metrics(y_pred=y_pred, y_test=y_test,
                           chosen_encoder=category_encoders[i].__class__.__name__,
                           chosen_pipe=curr_flow)

except KeyboardInterrupt:
    print('STOPPED')

In [None]:
import neptune.new as neptune
import neptune.new.integrations.sklearn as npt_utils
import neptune.new.integrations.optuna as optuna_utils

# Neptune's default connection mode is asynchronous ("async").
# Other possible values are "sync", "offline", "read-only", and "debug".
# Offline mode is selected here.
CONNECTION_MODE = "offline"
run = neptune.init(project='mlop3n/SDP',custom_run_id='CAT_ENC_CLF__1.0',description="Experiment: Check Influence of Categorical Encoding on Predictive Performance of Vanilla Estimators", mode=CONNECTION_MODE)
# Callback object for handing to Optuna; it is not used in this cell.
neptune_callback = optuna_utils.NeptuneCallback(run)

# The run is stopped right away; this cell only checks the set-up.
run.stop()

In [None]:
import optuna

In [None]:

class ModelDB:
    """Registry of candidate classifiers that share one base estimator.

    The class body builds a private XGBoost random-forest template and the
    class distribution of ``final_data.target``.  Each instance holds a clone
    of the template plus a list of classifier objects in ``clfs``.
    """

    # Template base estimator.  grow_policy must be one of XGBoost's string
    # values; "lossguide" (split at the highest loss change) replaces the
    # former integer 1, which the XGBoost core does not accept.
    # NOTE(review): XGBoost documents `num_parallel_tree` (singular) — confirm
    # that `num_parallel_trees` has any effect.
    __model = XGBRFClassifier(n_jobs=-1,verbosity=0,tree_method='approx',learning_rate=1, random_state=42,base_score=0.82002,importance_type='total_gain', num_parallel_trees=50,subsample=1.0, grow_policy="lossguide")
    class_labels, n_classes, __priors = class_distribution(final_data.target.to_numpy().reshape(-1,1))

    def __init__(self):
        """Create the per-instance clone of the template and the classifier list."""
        self.default_base_estimator = model_clone(self.__model)
        self.help = 0
        self.db ={}
        # Keyword arguments shared by estimators that accept both of them.
        self._defaults = dict(random_state=42, n_jobs=-1)
        self.clfs = [
            AdaBoostClassifier(base_estimator=self.__model,random_state=42),
            BaggingClassifier(base_estimator=self.__model,**self._defaults),
            # Init signature: BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None)
            BernoulliNB(class_prior=self.__priors),
            CalibratedClassifierCV(),
            CategoricalNB(),
            ComplementNB(),
            DecisionTreeClassifier(),
            DummyClassifier(),
            ExtraTreeClassifier(),
            ExtraTreesClassifier(),
            GaussianNB(),
            GaussianProcessClassifier(),
            GradientBoostingClassifier(),
            HistGradientBoostingClassifier(),
            KNeighborsClassifier(),
            LabelPropagation(),
            LabelSpreading(),
            LinearDiscriminantAnalysis(),
            LinearSVC(),
            LogisticRegression(),
            LogisticRegressionCV(),
            MLPClassifier(),
            MultinomialNB(),
            NearestCentroid(),
            NuSVC(),
            XGBClassifier(),
            XGBRFClassifier(),
            PassiveAggressiveClassifier(),
            Perceptron(),
            QuadraticDiscriminantAnalysis(),
            RadiusNeighborsClassifier(),
            RandomForestClassifier(),
            RidgeClassifier(),
            RidgeClassifierCV(),
            SGDClassifier(),
            SVC(),
        ]
    def __call__(self, model_name = None):
        """Placeholder: currently does nothing and returns None."""
        pass

mdb = ModelDB()
        




In [None]:
import matplotlib.pyplot as plt

In [None]:
# XGBoost random-forest classifier used as the base estimator of this cell.
# NOTE(review): XGBoost documents `num_parallel_tree` (singular) — confirm
# that `num_parallel_trees` has any effect.
model = XGBRFClassifier(n_jobs=-1,
                        verbosity=0,
                        n_estimators = 1000,
                        tree_method='hist',
                        # enable_categorical=True,
                        learning_rate=1,
                        random_state=42,
                        base_score=0.82002,
                        importance_type='total_gain',
                        num_parallel_trees=50,
                        subsample=1.0,
                        # objective='multi:softmax',
                        grow_policy="lossguide",
                        # max_cat_to_one_hot=1000
                       )
# model = SVC(random_state=42,probability=True,break_ties=True)
# model = LogisticRegression(fit_intercept=False, random_state=42, n_jobs=-1)
# model = RandomForestClassifier(n_jobs=-1,max_depth=30, random_state=42, ccp_alpha=0.001, max_features=None,bootstrap=False)
# model = NuSVC(probability=True, break_ties=True, random_state=42, cache_size=1000)
default_base_estimator = OneVsOneClassifier(model_clone(model), n_jobs=-1)
# default_base_estimator is wrapped with multiclass trainer

# Repeated stratified 3-fold CV (5 repeats) on the binary columns only,
# scored with macro F1; train scores are kept for the comparison plot.
with parallel_backend('threading'):
    ccV = cross_validate(default_base_estimator, final_data.loc[:,binary],final_data.target, n_jobs=-1,cv=RepeatedStratifiedKFold(n_repeats=5,n_splits=3,random_state=42), return_train_score=True, scoring='f1_macro')
# plt.ioff
# Top axis: test scores (red dashed); bottom axis: train scores.
fig,ax = plt.subplots(2,1) 
ax[0].plot(ccV['test_score'],'r--', label="Test Scores")
ax[0].legend()
ax[1].plot(ccV['train_score'],  label = "Train Scores")
ax[1].legend()
disp = fig.suptitle("Train Vs Test Scores")

In [None]:
nominal_data.astype('int').skew()

In [None]:
%matplotlib widget


In [None]:
ccV

In [None]:
"""
Optuna example that optimizes a classifier configuration for Iris dataset using sklearn.
In this example, we optimize a classifier configuration for Iris dataset. Classifiers are from
scikit-learn. We optimize both the choice of classifier (among SVC and RandomForest) and their
hyperparameters.
"""

import optuna

import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
import sklearn.svm


# FYI: Objective functions can take additional arguments
# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).
def objective(trial):
    """Return the mean 3-fold CV accuracy of a sampled classifier on Iris.

    The trial first chooses between an SVC and a random forest, then samples
    that classifier's single hyperparameter (``svc_c`` or ``rf_max_depth``).
    """
    dataset = sklearn.datasets.load_iris()
    features, labels = dataset.data, dataset.target

    choice = trial.suggest_categorical("classifier", ["SVC", "RandomForest"])
    if choice != "SVC":
        depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        estimator = sklearn.ensemble.RandomForestClassifier(
            max_depth=depth, n_estimators=10
        )
    else:
        regularization = trial.suggest_float("svc_c", 1e-10, 1e10, log=True)
        estimator = sklearn.svm.SVC(C=regularization, gamma="auto")

    fold_scores = sklearn.model_selection.cross_val_score(
        estimator, features, labels, n_jobs=-1, cv=3
    )
    return fold_scores.mean()


if __name__ == "__main__":
    # The study has a fixed name in persistent storage, so creating it a
    # second time would fail.  load_if_exists=True resumes the existing study
    # instead, which makes the cell safe to re-run.
    study = optuna.create_study(
        study_name="SDP-IITM",
        storage=OPTUNA_DB,
        direction="maximize",
        load_if_exists=True,
    )
    study.optimize(objective, n_trials=100)
    print(study.best_trial)

In [None]:
"""
Optuna example that demonstrates a pruner for CatBoost.

In this example, we optimize the validation accuracy of cancer detection using CatBoost.
We optimize both the choice of booster models and their hyperparameters. Throughout
training of models, a pruner observes intermediate results and stop unpromising trials.

You can run this example as follows:
    $ python catboost_pruning.py

"""

import numpy as np
import optuna
from optuna.integration import CatBoostPruningCallback

import catboost as cb
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


def objective(trial: optuna.Trial) -> float:
    """Train a CatBoost classifier with sampled settings; return its accuracy.

    The breast-cancer data is split 75/25 (unseeded).  Training reports
    "Accuracy" to an Optuna pruning callback, and pruning is checked once
    after ``fit`` returns.
    """
    features, labels = load_breast_cancer(return_X_y=True)
    x_fit, x_val, y_fit, y_val = train_test_split(features, labels, test_size=0.25)

    # Parameters are suggested one by one, in a fixed order.
    param = {}
    param["objective"] = trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"])
    param["colsample_bylevel"] = trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True)
    param["depth"] = trial.suggest_int("depth", 1, 12)
    param["boosting_type"] = trial.suggest_categorical("boosting_type", ["Ordered", "Plain"])
    param["bootstrap_type"] = trial.suggest_categorical(
        "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
    )
    param["used_ram_limit"] = "3gb"
    param["eval_metric"] = "Accuracy"

    # Only two bootstrap types take an additional sampled parameter.
    bootstrap = param["bootstrap_type"]
    if bootstrap == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif bootstrap == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
    gbm = cb.CatBoostClassifier(**param)
    gbm.fit(
        x_fit,
        y_fit,
        eval_set=[(x_val, y_val)],
        verbose=0,
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
    )

    # evoke pruning manually.
    pruning_callback.check_pruned()

    rounded_predictions = np.rint(gbm.predict(x_val))
    return accuracy_score(y_val, rounded_predictions)


if __name__ == "__main__":
    # Maximise the objective; the median pruner is given 5 warm-up steps.
    study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
        storage=OPTUNA_DB,
        direction="maximize"
    )
    # Stop after 100 trials or 600 seconds.
    study.optimize(objective, n_trials=100, timeout=600)

    print("Number of finished trials: {}".format(len(study.trials)))

    # Summary of the best trial: objective value and sampled parameters.
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))


In [None]:
"""
Optuna example that demonstrates a pruner for XGBoost.
In this example, we optimize the validation accuracy of cancer detection using XGBoost.
We optimize both the choice of booster model and their hyperparameters. Throughout
training of models, a pruner observes intermediate results and stop unpromising trials.
You can run this example as follows:
    $ python xgboost_integration.py
"""

import numpy as np
import optuna

import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb


# FYI: Objective functions can take additional arguments
# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).
def objective(trial):
    """Train an XGBoost booster with sampled settings; return its accuracy.

    The breast-cancer data is split 75/25 (unseeded).  The validation AUC is
    reported to an Optuna pruning callback during training; the returned
    value is the accuracy of the rounded validation predictions.
    """
    features, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
    x_fit, x_val, y_fit, y_val = train_test_split(features, labels, test_size=0.25)
    dtrain = xgb.DMatrix(x_fit, label=y_fit)
    dvalid = xgb.DMatrix(x_val, label=y_val)

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
    }
    booster = param["booster"]

    # Tree-growing settings apply to both tree-based boosters.
    if booster in ("gbtree", "dart"):
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    # Dropout settings exist only for the dart booster.
    if booster == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # Add a callback for pruning.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")
    bst = xgb.train(param, dtrain, evals=[(dvalid, "validation")], callbacks=[pruning_callback])
    rounded_predictions = np.rint(bst.predict(dvalid))
    return sklearn.metrics.accuracy_score(y_val, rounded_predictions)


if __name__ == "__main__":
    # Maximise the objective; the median pruner is given 5 warm-up steps.
    study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), 
        storage=OPTUNA_DB,
        direction="maximize"
    )
    study.optimize(objective, n_trials=100)
    print(study.best_trial)


In [None]:
import optuna
import logging
# optuna.logging.set_verbosity(0)
LOG_FILE_PATH = "../data/optuna.log"
# Root logger.  It must exist and accept INFO records before the file handler
# is attached, otherwise the messages checked below are never written.
logger = logging.getLogger()
logger.setLevel(logging.INFO)  # Setup the root logger.
# optuna.logging.disable_default_handler()
logger.addHandler(logging.FileHandler(LOG_FILE_PATH, mode="w"))

optuna.logging.enable_propagation()  # Propagate logs to the root logger.
optuna.logging.disable_default_handler()  # Stop showing logs in sys.stderr.


# Every `objective` defined in this notebook returns an accuracy, so the
# study has to maximise it (Optuna minimises by default).
study = optuna.create_study(storage=OPTUNA_DB, direction="maximize")

logger.info("Start optimization.")
study.optimize(objective, n_trials=10)

# Sanity check: the first two log lines are the study-creation message and
# the message emitted above.
with open(LOG_FILE_PATH) as f:
    assert f.readline().startswith("A new study created")
    assert f.readline() == "Start optimization.\n"


In [None]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

import optuna

# Iris data for the Optuna example below.
# NOTE(review): this rebinds the notebook-level X, y, X_train and y_train
# used by earlier cells.
X, y = load_iris(return_X_y=True)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)


def objective(trial):
    """Fit an SGD-trained MLP with a sampled momentum; return its accuracy.

    The batch size is stored on the trial as the user attribute
    ``BATCHSIZE`` and read back from there when the classifier is built.
    The score is computed on the notebook-level validation split.
    """
    trial.set_user_attr("BATCHSIZE", 128)
    sampled_momentum = trial.suggest_float("momentum", 0, 1.0)
    settings = dict(
        hidden_layer_sizes=(100, 50),
        batch_size=trial.user_attrs["BATCHSIZE"],
        momentum=sampled_momentum,
        solver="sgd",
        random_state=0,
    )
    network = MLPClassifier(**settings)
    network.fit(X_train, y_train)
    return network.score(X_valid, y_valid)


# Maximise the objective over 30 trials; the study is stored in the shared
# Optuna database under an auto-generated name.
study = optuna.create_study(storage=OPTUNA_DB,direction="maximize")
study.optimize(objective, n_trials=30)