In [2]:
from pymongo import MongoClient
import optuna
import os

os.environ["NEPTUNE_PROJECT"] = "mlop3n/SDP"
os.environ[
    "NEPTUNE_NOTEBOOK_PATH"
] = "PycharmProjects/sdpiit/notebooks/Pipeline_components_builder.ipynb"
import warnings
from sklearnex import patch_sklearn

patch_sklearn()
import numpy as np
import pandas as pd
from category_encoders import (
    BackwardDifferenceEncoder,
    BaseNEncoder,
    BinaryEncoder,
    CatBoostEncoder,
    CountEncoder,
    GLMMEncoder,
    HelmertEncoder,
    JamesSteinEncoder,
    LeaveOneOutEncoder,
    MEstimateEncoder,
    QuantileEncoder,
    SummaryEncoder,
    TargetEncoder,
    WOEEncoder,
)
from sklearn import set_config
from sklearn.base import clone as model_clone
from sklearn.cluster import *
from sklearn.compose import *
from sklearn.cross_decomposition import *
from sklearn.decomposition import *
from sklearn.ensemble import *
from sklearn.feature_selection import *
from sklearn.gaussian_process import *
from sklearn.linear_model import *
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.multioutput import *
from sklearn.multiclass import *
from sklearn.naive_bayes import *
from sklearn.neighbors import *
from sklearn.neural_network import *
from sklearn.pipeline import *
from sklearn.preprocessing import *
from sklearn.svm import *
from sklearn.tree import *
from sklearn.utils import *
from sklearn.dummy import *
from sklearn.semi_supervised import *
from sklearn.discriminant_analysis import *
import sklearnex, daal4py

from tqdm import tqdm, trange
from xgboost import XGBClassifier, XGBRFClassifier
from BorutaShap import BorutaShap

from sklearn.calibration import *

pd.options.plotting.backend = "plotly"
pd.options.display.max_columns = 50
set_config(display="diagram")
warnings.filterwarnings("ignore")
import pickle
from collections import defaultdict

import matplotlib.pyplot as plt
import seaborn as sns
from joblib import parallel_backend
from joblib.memory import Memory

sns.set()
from pprint import pprint
from helpers import PolynomialWrapper as PWrapper
from helpers import NestedCVWrapper as NCVWrapper
from helpers import ColumnSelectors
import sklearn

from helpers import DFCollection
from helpers import plot_mean_std_max
from helpers import CustomMetrics
import gc

%matplotlib inline
# Shared joblib cache rooted at ../data/joblib_memory/.
# NOTE: despite its name, CACHE_DIR is a joblib ``Memory`` object, not a path string.
CACHE_DIR = Memory(location="../data/joblib_memory/")
# Previous storage backend, kept for reference. Credentials redacted: never commit
# real passwords; read them from the environment instead.
# OPTUNA_DB = "postgresql+psycopg2://<user>:<password>@localhost:5433/optuna"
from REDIS_CONFIG import REDIS_URL

# The Optuna storage URL is whatever the local REDIS_CONFIG module exports.
OPTUNA_DB = REDIS_URL


def allow_stopping(func):
    """Decorate ``func`` so that a ``KeyboardInterrupt`` stops it quietly.

    The returned wrapper forwards every positional and keyword argument to
    ``func`` (the previous wrapper accepted none, so only zero-argument
    callables could be decorated).

    Parameters
    ----------
    func : callable
        The function to protect.

    Returns
    -------
    callable
        A wrapper that returns ``func``'s result, or ``None`` when the call
        was interrupted from the keyboard (after printing "Program Stopped").
        A garbage collection pass runs after every call, whether it finished
        normally, was interrupted, or raised another exception.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import functools

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except KeyboardInterrupt:
            print("Program Stopped")
            return None
        finally:
            # Previously this ran only on the interrupted path because the
            # success path returned before reaching it.
            gc.collect()

    return wrapper

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
# Project helpers (from the local `helpers` module): `db` exposes the data
# frames used below, `column_selector` exposes the column groupings.
db = DFCollection()
column_selector = ColumnSelectors()
# classifiers = [f() for f in cls_names]
# Column groupings by measurement level.
# NOTE(review): presumably lists of column names (they are concatenated with
# `+` in later cells) -- confirm against helpers.ColumnSelectors.
dtype_info = column_selector.dtype_info
ordinal = column_selector.ordinal_cols
nominal = column_selector.nominal_cols
binary = column_selector.binary_cols
ratio = column_selector.ratio_cols


# Data sets exposed by DFCollection. `final_data` carries a `target` column
# (read below); the contents of the others are not visible from this notebook.
final_data = db.final_data
final_pred_data = db.final_pred_data
baseline_prediction_data = db.baseline_prediction_data
data_logit = db.data_logits
prediction_data = db.prediction_data
master_data = db.master
given_data = db.data

# Per-level views of the data plus the category listings for the nominal and
# ordinal columns.
ordinal_data, nominal_data, binary_data, ratio_data = db.categorise_data()
nominal_categories = db.nominal_categories
ordinal_categories = db.ordinal_categories
# The target is reshaped to a single-column 2-D array before being handed to
# `class_distribution`.
# NOTE(review): `class_distribution` is not imported by name; it resolves
# through one of the wildcard imports -- confirm which.
class_labels, n_classes, class_priors = class_distribution(
    final_data.target.to_numpy().reshape(-1, 1)
)

# One-hot encode the nominal columns, then drop near-constant dummies.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the installed version before upgrading.
encoder = OneHotEncoder(sparse=False, drop="first")
variance_thr = VarianceThreshold(0.001)
nominal_ohe_pipe = Pipeline(
    steps=[("ohe", encoder), ("var_th", variance_thr)],
    # CACHE_DIR is already a joblib Memory. Wrapping it again as
    # Memory(location=CACHE_DIR) hands joblib a non-path location (documented
    # as str / pathlib.Path / None), which leaves the pipeline without a usable
    # cache; the resulting warning is hidden by the global warnings filter.
    memory=CACHE_DIR,
)
# Two nominal columns are excluded from this exploratory encoding.
ohe_nominal_data = nominal_ohe_pipe.fit_transform(
    nominal_data.drop(["nominal__v_12", "nominal__v_21"], axis=1)
)
# Wrap the dense result in a frame that keeps the source row index.
n1df = pd.DataFrame(
    ohe_nominal_data,
    columns=nominal_ohe_pipe.get_feature_names_out(),
    index=nominal_data.index,
)


def train_test(X, y, test_size, random_state=10):
    """Split ``X`` and ``y`` into stratified train and test partitions.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix.
    y : array-like
        Labels; also used as the stratification key so both partitions keep
        the class proportions of ``y``.
    test_size : float or int
        Forwarded unchanged to ``train_test_split``.
    random_state : int, default=10
        Seed for the shuffle. The default reproduces the split this notebook
        has always used; pass another value to draw a different split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, test_size=test_size, stratify=y
    )
    return X_train, X_test, y_train, y_test

In [7]:
target = final_data.target
# Same encoder as before, but categories rarer than `min_frequency` are pooled
# into a single infrequent bucket before the variance filter runs.
encoder = OneHotEncoder(sparse=False, drop="first", min_frequency=0.0001)
variance_thr = VarianceThreshold(0.001)
nominal_ohe_pipe = Pipeline(
    steps=[("ohe", encoder), ("var_th", variance_thr)],
    # CACHE_DIR is already a joblib Memory. Memory(location=CACHE_DIR) would
    # pass a Memory object where joblib expects a path (str / pathlib.Path /
    # None), leaving the pipeline without a usable cache.
    memory=CACHE_DIR,
)


# Score functions paired with the alpha-based selectors (SelectFpr / SelectFdr /
# SelectFwe) in update_selectors; keys become the prefix of the selector name.
scf = {"F": f_classif, "CHI": chi2}

# Score functions paired with SelectKBest / SelectPercentile; same as `scf`
# plus mutual information.
scf2 = {"MIC": mutual_info_classif, "F": f_classif, "CHI": chi2}


# Defaults for update_selectors.
K_BEST = 40  # `k` for SelectKBest
ALPHA = 0.05  # `alpha` for SelectFpr / SelectFdr / SelectFwe
# Registry filled by update_selectors: "<criterion>-<SelectorClass>" -> selector.
all_selectors = {}
PERCENTILE = 5  # `percentile` for SelectPercentile


def update_selectors(alpha_=ALPHA, k_best=K_BEST, percentile=PERCENTILE):
    """(Re)register the univariate feature selectors in ``all_selectors``.

    Every entry is keyed ``"<criterion>-<SelectorClass>"`` (for example
    ``"CHI-SelectFdr"``), so calling this again overwrites the existing
    entries with selectors built from the new settings.

    Parameters
    ----------
    alpha_ : float
        ``alpha`` for ``SelectFpr``, ``SelectFdr`` and ``SelectFwe``; one of
        each is built per score function in ``scf``.
    k_best : int
        ``k`` for ``SelectKBest``; one is built per score function in ``scf2``.
    percentile : int
        ``percentile`` for ``SelectPercentile``; one per score function in
        ``scf2``.

    Returns
    -------
    None
        The module-level ``all_selectors`` dict is updated in place.
    """
    for criterion, score_func in scf.items():
        alpha_selectors = (
            SelectFpr(score_func, alpha=alpha_),
            SelectFdr(score_func, alpha=alpha_),
            SelectFwe(score_func, alpha=alpha_),
        )
        for selector in alpha_selectors:
            all_selectors[criterion + "-" + type(selector).__name__] = selector

    for criterion, score_func in scf2.items():
        # Use the arguments here: the previous body read the K_BEST and
        # PERCENTILE globals, so `k_best` / `percentile` were silently ignored.
        ranking_selectors = (
            SelectKBest(score_func, k=k_best),
            SelectPercentile(score_func, percentile=percentile),
        )
        for selector in ranking_selectors:
            all_selectors[criterion + "-" + type(selector).__name__] = selector


# Initialize the selectors
update_selectors(alpha_=ALPHA, k_best=K_BEST)

# One-hot encode the nominal columns, then run every registered selector side
# by side; the union concatenates whatever each selector keeps.
ct = ColumnTransformer(
    transformers=[("one_hot_enc", nominal_ohe_pipe, nominal)],
    sparse_threshold=0,
    n_jobs=-1,
)
ohe_nominal_feature_selection = make_pipeline(
    ct, FeatureUnion(transformer_list=list(all_selectors.items()), n_jobs=-1)
)
# To persist the unfitted pipeline:
# with open('../data/pipelines/ohe_nominal_features.pkl', 'wb') as fp:
#     pickle.dump(ohe_nominal_feature_selection, fp, protocol=-1)

with parallel_backend("loky"):
    elite_ohe_nominal_features_train = ohe_nominal_feature_selection.fit_transform(
        final_data, target
    )
    elite_ohe_nominal_features_test = ohe_nominal_feature_selection.transform(
        final_pred_data
    )

f_names = ohe_nominal_feature_selection.get_feature_names_out()

# Keep the source row labels so later index-aligned operations (pd.concat with
# final_data, .loc with a split's index) line up even when the index is not a
# plain 0..n-1 range.
elite_onf_df = pd.DataFrame(
    elite_ohe_nominal_features_train, columns=f_names, index=final_data.index
)
# Several selectors keep the same dummy column; dropping duplicate rows of the
# transpose removes those repeated columns (the first occurrence is kept).
elite_onf_df = (
    elite_onf_df.transpose()
    .drop_duplicates(
        ignore_index=False,
    )
    .transpose()
)

# Map each surviving column name to "<2nd part>__<4th part>" of its
# "__"-separated full name (e.g. "one_hot_enc__v_18_1" in the output below).
f_names_t = {
    x: x.split("__")[1] + "__" + x.split("__")[3] for x in elite_onf_df.columns
}
elite_onf_df = elite_onf_df.rename(columns=f_names_t)

# The prediction frame must expose exactly the columns chosen on the training
# data. De-duplicating it independently (as before) could drop or keep a
# different set, because which columns coincide depends on the rows at hand,
# and would leave some columns without their short name.
elite_onf_df_test = pd.DataFrame(
    elite_ohe_nominal_features_test, columns=f_names, index=final_pred_data.index
)
elite_onf_df_test = elite_onf_df_test.loc[:, list(f_names_t)].rename(
    columns=f_names_t
)

In [9]:
# Inspect the de-duplicated, renamed one-hot feature frame built for the
# training data.
elite_onf_df

Unnamed: 0,one_hot_enc__v_18_1,one_hot_enc__v_18_5,one_hot_enc__v_18_7,one_hot_enc__v_18_42,one_hot_enc__v_18_52,one_hot_enc__v_18_56,one_hot_enc__v_18_66,one_hot_enc__v_18_67,one_hot_enc__v_18_69,one_hot_enc__v_20_11,one_hot_enc__v_20_30,one_hot_enc__v_20_35,one_hot_enc__v_20_42,one_hot_enc__v_20_46,one_hot_enc__v_20_59,one_hot_enc__v_21_1,one_hot_enc__v_21_2,one_hot_enc__v_21_8,one_hot_enc__v_25_3,one_hot_enc__v_25_5,one_hot_enc__v_25_12,one_hot_enc__v_25_17,one_hot_enc__v_25_21,one_hot_enc__v_25_32,one_hot_enc__v_25_42,...,one_hot_enc__v_32_26,one_hot_enc__v_32_47,one_hot_enc__v_32_54,one_hot_enc__v_4_13,one_hot_enc__v_4_14,one_hot_enc__v_4_22,one_hot_enc__v_4_32,one_hot_enc__v_4_34,one_hot_enc__v_4_37,one_hot_enc__v_4_77,one_hot_enc__v_12_14,one_hot_enc__v_18_24,one_hot_enc__v_18_28,one_hot_enc__v_18_44,one_hot_enc__v_20_45,one_hot_enc__v_20_58,one_hot_enc__v_20_60,one_hot_enc__v_20_69,one_hot_enc__v_3_16,one_hot_enc__v_3_38,one_hot_enc__v_3_80,one_hot_enc__v_32_10,one_hot_enc__v_32_50,one_hot_enc__v_4_6,one_hot_enc__v_4_55
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3794,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Append the selected one-hot columns to the training and prediction frames
# (column-wise, aligned on the row index).
final_data_ohe = pd.concat(objs=[final_data, elite_onf_df], axis=1)
final_data_ohe_test = pd.concat(objs=[final_pred_data, elite_onf_df_test], axis=1)

# Persist both frames as brotli-compressed parquet via fastparquet.
final_data_ohe.to_parquet(
    '../data/final_data_ohe',
    engine='fastparquet',
    compression='brotli',
)
final_data_ohe_test.to_parquet(
    '../data/final_data_ohe_test',
    engine='fastparquet',
    compression='brotli',
)

In [42]:
target = final_data.target

# First family of supervised encoders for the nominal + ordinal columns.
# NOTE(review): PWrapper is helpers.PolynomialWrapper; presumably it adapts the
# binary-target encoders to the multi-class target -- confirm in helpers.
categorical_target_encoders_1 = FeatureUnion(
    transformer_list=[
        (
            "target_enc",
            PWrapper(
                TargetEncoder(cols=nominal + ordinal, drop_invariant=True, smoothing=0)
            ),
        ),
        ("woe_enc", PWrapper(WOEEncoder(cols=nominal + ordinal, drop_invariant=True))),
        (
            "jame_enc",
            PWrapper(JamesSteinEncoder(cols=nominal + ordinal, drop_invariant=True)),
        ),
    ],
    n_jobs=-1,
)

# Second family: summary encoding plus a wrapped GLMM encoder.
categorical_target_encoders_2 = FeatureUnion(
    transformer_list=[
        ("summary_enc", SummaryEncoder(cols=nominal + ordinal, drop_invariant=True)),
        (
            "glmm_enc",
            PWrapper(GLMMEncoder(cols=nominal + ordinal, drop_invariant=True)),
        ),
    ],
    n_jobs=-1,
)

# Recursive feature elimination, one feature per step, scored by macro F1.
# The CV splitter is seeded so the selected feature set is reproducible across
# kernel restarts (it was unseeded before, so every run drew different folds).
feature_selector = RFECV(
    estimator=DecisionTreeClassifier(random_state=10),
    scoring="f1_macro",
    cv=RepeatedStratifiedKFold(n_repeats=2, random_state=10),
    step=1,
    n_jobs=-1,
)

# Restrict each encoder family to the categorical columns and force dense output.
cat_feature_embedding_1 = ColumnTransformer(
    transformers=[
        ("cat_enc_1", categorical_target_encoders_1, nominal + ordinal),
    ],
    sparse_threshold=0,
    n_jobs=-1,
)
cat_feature_embedding_2 = ColumnTransformer(
    transformers=[
        ("cat_enc_2", categorical_target_encoders_2, nominal + ordinal),
    ],
    sparse_threshold=0,
    n_jobs=-1,
)

# Pass the pre-computed one-hot columns straight through.
# make_column_selector treats `pattern` as a regular expression searched in the
# column name. The old value 'one_hot_enc*' was a glob: as a regex it means
# "one_hot_en" followed by any number of "c", anywhere in the name. Anchor on
# the actual prefix instead.
ohe_features = make_column_selector(pattern=r"^one_hot_enc")
ohe_passthrough = make_column_transformer(('passthrough', ohe_features))

# Concatenate the one-hot passthrough with both encoder families ...
cat_features_best = FeatureUnion(
    transformer_list=[
        ('ohe', ohe_passthrough),
        ("set1_cat_encs", cat_feature_embedding_1),
        ("set2_cat_encs", cat_feature_embedding_2),
    ],
    n_jobs=-1,
)

# ... and let RFECV prune the combined feature set.
cat_features_la_creme = make_pipeline(cat_features_best, feature_selector)
# Alias used by the persistence / fitting cells; both names are the same object.
cat_features_with_ohe = cat_features_la_creme

In [28]:
# Write the categorical feature pipeline to disk with pickle's default protocol.
with open('../transformers/cat_features_with_ohe', mode='wb') as fp:
    pickle.dump(cat_features_with_ohe, fp)

In [43]:
# Fit the categorical feature pipeline on the training split and keep the
# selected features.
# NOTE(review): X_train / y_train are not assigned anywhere above this cell in
# file order (they are created by cells further down); this cell fails on
# Restart & Run All unless those cells are moved up.
with parallel_backend('loky'):
    X_t = cat_features_with_ohe.fit_transform(X_train,y_train)

In [44]:
# Apply the already-fitted feature pipeline to the held-out split.
# NOTE(review): X_test is assigned by a cell further down the file.
test_x_t = cat_features_with_ohe.transform(X_test)

In [94]:
# Baseline: XGBoost on the raw nominal + ordinal columns only.
clf = XGBClassifier()
clf.load_model(fname='../models/xgb_clf_ohe_cat.json')
# Unfitted copy of the loaded estimator. `model_clone` is the name the import
# cell binds sklearn.base.clone to; the bare name `clone` is never imported
# explicitly and only resolves if a wildcard import happens to leak it.
clf2 = model_clone(clf)
# clf2 = HistGradientBoostingClassifier(max_iter=300,categorical_features=cats,random_state=9)
# clf2 = OneVsRestClassifier(clf2_, n_jobs=-1)
X_train, X_test, y_train, y_test = train_test(
    final_data_ohe.drop(['target'], axis=1).loc[:, nominal + ordinal],
    target,
    test_size=0.2,
)
with parallel_backend('loky'):
    y_pred = clf2.fit(X_train.to_numpy(), y_train).predict(X_test.to_numpy())
# f1_score expects (y_true, y_pred); the arguments were passed the other way
# round. Pass the ground truth first so the call matches the documented contract.
f1_scores = sklearn.metrics.f1_score(y_test, y_pred, average='macro')
f1_scores

0.6277407514730029

In [105]:
# Macro F1 of the full pipeline `tmp` on the held-out split.
# f1_score expects (y_true, y_pred); the arguments were previously swapped.
# NOTE(review): `tmp` is built by a cell that appears further down this file.
f1_scores = sklearn.metrics.f1_score(
    y_test, tmp.fit(X_train, y_train).predict(X_test), average='macro'
)

In [106]:
# Display the macro F1 computed in the previous cell.
f1_scores

0.6528894221917477

In [97]:
# (rows, selected features) of the fitted pipeline's training output.
X_t.shape

(3036, 57)

In [98]:
# End-to-end model: categorical feature pipeline followed by XGBoost.
clf = XGBClassifier()
clf.load_model(fname='../models/xgb_clf_ohe_cat.json')
# Unfitted copy of the loaded estimator. `model_clone` is the name the import
# cell binds sklearn.base.clone to; the bare name `clone` is never imported
# explicitly and only resolves if a wildcard import happens to leak it.
clf2 = model_clone(clf)
# Fitted transformers are cached on disk under ../data/joblib_memory/.
tmp = Pipeline([
    ('categorical_features_with_ohe', cat_features_with_ohe),
    ('estimator', clf2)
], memory=Memory('../data/joblib_memory/'))


In [107]:
# Serialise the whole pipeline `tmp` (feature pipeline + estimator) with joblib.
# NOTE(review): this import belongs in the import cell at the top of the notebook.
import joblib
joblib.dump(tmp,'../models/clf_xgb_ohe_cat')

['../models/clf_xgb_ohe_cat']

{'memory': Memory(location=../data/joblib_memory/joblib),
 'steps': [('categorical_features_with_ohe',
   Pipeline(steps=[('featureunion',
                    FeatureUnion(n_jobs=-1,
                                 transformer_list=[('ohe',
                                                    ColumnTransformer(transformers=[('passthrough',
                                                                                     'passthrough',
                                                                                     <sklearn.compose._column_transformer.make_column_selector object at 0x7f166164faf0>)])),
                                                   ('set1_cat_encs',
                                                    ColumnTransformer(n_jobs=-1,
                                                                      sparse_threshold=0,
                                                                      transformers=[('cat_enc_1',
                                              

In [104]:
# Fit the end-to-end pipeline on whatever split is currently in the kernel.
# NOTE(review): the commented-out split below would feed every column of
# final_data_ohe (minus the target) to the pipeline; without it this cell
# reuses the X_train / y_train left behind by an earlier-run cell.
# X_train, X_test, y_train, y_test = train_test(final_data_ohe.drop(['target'],axis=1), target, test_size=0.2)
with parallel_backend('loky'):
    tmp.fit(X=X_train,y=y_train)

In [18]:
# Workflow check: split, encode the categorical columns, then glue the
# pre-computed one-hot block in front of the encoded features.
"""
TEST
"""
target = final_data.target
X_train, X_test, y_train, y_test = train_test(final_data, target, test_size=0.2)

# Row labels of each partition, used to pull the matching one-hot rows.
ohe_ref_train, ohe_ref_test = X_train.index, X_test.index
ohe_data_train = elite_onf_df.loc[ohe_ref_train].to_numpy()
ohe_data_test = elite_onf_df.loc[ohe_ref_test].to_numpy()

# Fit on the training partition only; the test partition is transformed with
# the fitted pipeline.
with parallel_backend("loky"):
    X_train_enc = cat_features_la_creme.fit_transform(X_train, y_train)
    X_test_enc = cat_features_la_creme.transform(X_test)

# Column-wise concatenation: one-hot block first, encoded features after.
X_train_complete = np.hstack((ohe_data_train, X_train_enc))
X_test_complete = np.hstack((ohe_data_test, X_test_enc))