In [7]:
from pymongo import MongoClient
import optuna
import os

os.environ["NEPTUNE_PROJECT"] = "mlop3n/SDP"
os.environ[
    "NEPTUNE_NOTEBOOK_PATH"
] = "PycharmProjects/sdpiit/notebooks/Pipeline_components_builder.ipynb"
import warnings
from sklearnex import patch_sklearn

patch_sklearn()
import numpy as np
import pandas as pd
from category_encoders import (
    BackwardDifferenceEncoder,
    BaseNEncoder,
    BinaryEncoder,
    CatBoostEncoder,
    CountEncoder,
    GLMMEncoder,
    HelmertEncoder,
    JamesSteinEncoder,
    LeaveOneOutEncoder,
    MEstimateEncoder,
    QuantileEncoder,
    SummaryEncoder,
    TargetEncoder,
    WOEEncoder,
)
from sklearn import set_config
from sklearn.base import clone as model_clone
from sklearn.cluster import *
from sklearn.compose import *
from sklearn.cross_decomposition import *
from sklearn.decomposition import *
from sklearn.ensemble import *
from sklearn.feature_selection import *
from sklearn.gaussian_process import *
from sklearn.linear_model import *
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.multioutput import *
from sklearn.multiclass import *
from sklearn.naive_bayes import *
from sklearn.neighbors import *
from sklearn.neural_network import *
from sklearn.pipeline import *
from sklearn.preprocessing import *
from sklearn.svm import *
from sklearn.tree import *
from sklearn.utils import *
from sklearn.dummy import *
from sklearn.semi_supervised import *
from sklearn.discriminant_analysis import *
import sklearnex, daal4py

from tqdm import tqdm, trange
from xgboost import XGBClassifier, XGBRFClassifier
from BorutaShap import BorutaShap

from sklearn.calibration import *
import joblib
pd.options.plotting.backend = "plotly"
pd.options.display.max_columns = 50
set_config(display="diagram")
warnings.filterwarnings("ignore")
import pickle
from collections import defaultdict

import matplotlib.pyplot as plt
import seaborn as sns
from joblib import parallel_backend
from joblib.memory import Memory

sns.set()
from pprint import pprint
from helpers import PolynomialWrapper as PWrapper
from helpers import NestedCVWrapper as NCVWrapper
from helpers import ColumnSelectors
import sklearn

from helpers import DFCollection
from helpers import plot_mean_std_max
from helpers import CustomMetrics
import gc

%matplotlib inline
# Shared joblib Memory that caches under ../data/joblib_memory/.
# NOTE: despite its name, CACHE_DIR is a Memory instance, not a path string.
CACHE_DIR = Memory(location="../data/joblib_memory/")
# OPTUNA_DB = "postgresql+psycopg2://<user>:<password>@localhost:5433/optuna"  # credentials redacted; never commit secrets
from REDIS_CONFIG import REDIS_URL

# Presumably the storage URL handed to Optuna (inferred from the name) — confirm at the point of use.
OPTUNA_DB = REDIS_URL


def allow_stopping(func):
    """Decorate ``func`` so that a ``KeyboardInterrupt`` stops it quietly.

    The returned wrapper forwards any positional and keyword arguments to
    ``func`` and returns its result. If the call is interrupted with
    ``KeyboardInterrupt`` the message "Program Stopped" is printed and
    ``None`` is returned instead of propagating the exception.

    Garbage collection is triggered after every call, whether it finished
    normally or was interrupted.

    Parameters
    ----------
    func : callable
        The function to protect.

    Returns
    -------
    callable
        A wrapper with the same name/docstring as ``func``.
    """
    import functools  # local import: functools is not imported at file level

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except KeyboardInterrupt:
            print("Program Stopped")
            return None
        finally:
            # `finally` guarantees the collection also runs on the success
            # path, which an early `return` would otherwise skip.
            gc.collect()

    return wrapper

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [6]:
# Instantiate the project helpers (imported from the local `helpers` module)
# that expose the datasets and the column groupings used below.
db = DFCollection()
column_selector = ColumnSelectors()
# classifiers = [f() for f in cls_names]
dtype_info = column_selector.dtype_info
# Column groupings read from ColumnSelectors; presumably lists of column
# names per feature type — confirm in helpers.
ordinal = column_selector.ordinal_cols
nominal = column_selector.nominal_cols
binary = column_selector.binary_cols
ratio = column_selector.ratio_cols


# Datasets exposed by DFCollection; their exact contents are defined in helpers.
final_data = db.final_data
final_pred_data = db.final_pred_data
baseline_prediction_data = db.baseline_prediction_data
data_logit = db.data_logits
prediction_data = db.prediction_data
master_data = db.master
given_data = db.data

# categorise_data() returns four objects, unpacked here in the order
# ordinal, nominal, binary, ratio.
ordinal_data, nominal_data, binary_data, ratio_data = db.categorise_data()
nominal_categories = db.nominal_categories
ordinal_categories = db.ordinal_categories
# The target is reshaped to a single column before being summarised.
# NOTE(review): class_distribution is not imported by name in this notebook;
# presumably it arrives through one of the star imports — TODO confirm it
# resolves on a fresh kernel.
class_labels, n_classes, class_priors = class_distribution(
    final_data.target.to_numpy().reshape(-1, 1)
)

# One-hot encode the nominal features (first level of each feature dropped),
# then filter the resulting indicator columns with a 0.001 variance threshold.
# NOTE(review): `sparse=` was renamed `sparse_output=` in newer scikit-learn
# releases — update once the pinned version allows it.
encoder = OneHotEncoder(sparse=False, drop="first")
variance_thr = VarianceThreshold(0.001)
nominal_ohe_pipe = Pipeline(
    steps=[("ohe", encoder), ("var_th", variance_thr)],
    # CACHE_DIR is already a joblib Memory instance, so hand it to the
    # pipeline directly. Wrapping it in another Memory(location=...) passes a
    # Memory object where a path is expected, which leaves the pipeline
    # without a usable cache.
    memory=CACHE_DIR,
)
# Two nominal columns are excluded from the encoding.
ohe_nominal_data = nominal_ohe_pipe.fit_transform(
    nominal_data.drop(["nominal__v_12", "nominal__v_21"], axis=1)
)
# Re-attach column names and the original row index to the encoded array.
n1df = pd.DataFrame(
    ohe_nominal_data,
    columns=nominal_ohe_pipe.get_feature_names_out(),
    index=nominal_data.index,
)

# Load two parquet datasets from ../data; judging by the names these are the
# one-hot-encoded training and test frames — confirm against how they were written.
final_data_ohe = pd.read_parquet('../data/final_data_ohe')
final_data_ohe_test = pd.read_parquet('../data/final_data_ohe_test')

def train_test(X, y, test_size, random_state=10):
    """Split ``X`` and ``y`` into train and test partitions, stratified on ``y``.

    Parameters
    ----------
    X : features to split.
    y : labels to split; also used as the stratification key.
    test_size : forwarded unchanged to ``train_test_split``.
    random_state : seed forwarded to ``train_test_split``. Defaults to 10,
        the value that was previously hard-coded, so existing calls produce
        the same split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, test_size=test_size, stratify=y
    )
    return X_train, X_test, y_train, y_test

In [31]:
# Load two persisted estimators from ../models.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts from a trusted source.
# joblib.dump(tmp,'../models/clf_xgb_ohe_cat')
clf_xgb_ohe_cat = joblib.load('../models/clf_xgb_ohe_cat')
clf_hist_ord_nom_cat = joblib.load('../models/clf_hist_ord_nom_cat')

In [9]:
# Display the loaded estimator; set_config(display="diagram") was enabled in the setup cell.
clf_xgb_ohe_cat

In [32]:
# Separate the label from the features before splitting. The frame still
# carries its `target` column, so it is dropped from X here to rule out label
# leakage into any step that does not select columns explicitly.
target = final_data_ohe.target
X_train, X_test, y_train, y_test = train_test(
    final_data_ohe.drop(columns="target"), target, test_size=0.3
)
# Fit the loaded estimator on the training partition under joblib's loky backend.
with parallel_backend('loky'):
    clf_hist_ord_nom_cat.fit(X_train, y_train)

In [28]:
# Hyper-parameters for the histogram gradient-boosting classifier.
hist_params = dict(
    random_state=10,
    early_stopping=True,
    l2_regularization=0.9108730609384752,
    learning_rate=0.28848551291300845,
    max_depth=2,
    max_iter=453,
    max_leaf_nodes=176,
    min_samples_leaf=360,
    validation_fraction=0.24074025944392363,
    scoring="f1_macro",
)

# Column selector driven by a regular expression over column names.
c_select = make_column_selector(pattern="ordinal*|nominal*")
clf = HistGradientBoostingClassifier(**hist_params)
# Pass the selected columns through untouched; sparse_threshold=0 is forwarded
# to the column transformer.
ct = make_column_transformer(("passthrough", c_select), sparse_threshold=0)
# Workflow: column selection followed by the classifier.
wf = make_pipeline(ct, clf)
wf

In [39]:
# Stack the two loaded estimators under a random-forest final estimator.
# Each label names the estimator it is paired with ("HIST" for the hist model,
# "XGB" for the xgb model); the estimator order is kept as hist first, xgb second.
est = StackingClassifier(
    [("HIST", clf_hist_ord_nom_cat), ("XGB", clf_xgb_ohe_cat)],
    final_estimator=RandomForestClassifier(random_state=42),
)

In [38]:
# Set the objective on the nested estimator. XGBoost's probability-output
# multiclass objective is spelled 'multi:softprob' (there is no 'multi:softproba').
clf_xgb_ohe_cat.set_params(estimator__objective='multi:softprob')

In [None]:
# Fit the stacking classifier on the training partition.
est.fit(X_train,y_train)

________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(Pipeline(steps=[('featureunion',
                 FeatureUnion(n_jobs=-1,
                              transformer_list=[('ohe',
                                                 ColumnTransformer(transformers=[('passthrough',
                                                                                  'passthrough',
                                                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f70ad745160>)])),
                                                ('set1_cat_encs',
                                                 ColumnTransformer(n_jobs=-1,
                                                             ..., 
       binary__v_1  binary__v_11  binary__v_14  binary__v_26  binary__v_27  \
index                                                       

In [None]:
# Predict on the held-out partition and compute the macro-averaged F1 score.
y_pred = est.predict(X_test)
score = f1_score(y_test,y_pred,average='macro')

In [None]:
# Report the macro F1 score computed in the previous cell.
print(score)