In [1]:
# import widen_notebook
from mysetup import NotebookFinder
import sys

sys.meta_path.append(NotebookFinder())

from setup_transform import *

%matplotlib inline
cl_weight = sklearn.utils.compute_class_weight(
    class_weight="balanced", classes=[0, 1, 2], y=y
)
CLASS_WEIGHTS = {i: cl_weight[i] for i in range(3)}
def_cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=5, random_state=42)
lgr_params = dict(
    class_weight=CLASS_WEIGHTS,
    fit_intercept=False,
    multi_class="ovr",
    max_iter=2000000,
    random_state=42,
    n_jobs=24,
    #     penalty="elasticnet",
    cv=def_cv,
    scoring="f1_macro",
    solver="lbfgs",
    Cs=100,
    #     l1_ratios=np.linspace(0, 1, endpoint=False, num=100),
)
from sklearnex import unpatch_sklearn

import copy
import warnings

warnings.filterwarnings("ignore")
discrete = nominal + discrete_ordinal + discrete_binary
X_master = pd.concat(
    [raw_data.loc[:, raw_data_eval.columns], raw_data_eval], ignore_index=True, axis=0
)
class_priors = (raw_data.target.value_counts() / 3796).to_numpy()
raw_data[discrete] = raw_data[discrete].astype(np.uint32)
X_master[discrete] = X_master[discrete].astype(np.uint32)

importing Jupyter notebook from setup_transform.ipynb


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [149]:
qda = LinearDiscriminantAnalysis(
    solver="svd", store_covariance=True, priors=class_priors
)

pca = FastICA(
    n_components=100,
    whiten="arbitrary-variance",
    random_state=42,
    max_iter=10000,
    tol=1e-09,
    fun="cube",
)
X = pd.get_dummies(raw_data[nominal], columns=nominal)
y = raw_data.target
# pca = PCA(svd_solver='full')
clf = make_pipeline(qda)

scores = cross_validate(
    clf,
    raw_data[raw_data_eval.columns],
    raw_data.target,
    scoring="f1_macro",
    n_jobs=24,
    cv=def_cv,
    return_train_score=True,
)
scores["test_score"].mean()

0.6565509085627457

In [13]:
from sklearn.utils.validation import has_fit_parameter

# has_fit_parameter(clf, "warm_start")
clf_list = sklearn.utils.all_estimators(type_filter="classifier")
props = {"warm_start": [], "partial_fit": []}
errors = []
for name, clf in clf_list:
    try:
        if "warm_start" in clf().get_params():
            props["warm_start"].append(clf)
        try:
            clf().partial_fit(
                raw_data[raw_data_eval.columns].abs(),
                raw_data.target,
                classes=[0, 1, 2],
            )
            props["partial_fit"].append(clf)
        except AttributeError:
            errors.append(name)

    except TypeError:
        errors.append(name)

In [16]:
props

{'warm_start': [sklearn.ensemble._bagging.BaggingClassifier,
  sklearn.ensemble._forest.ExtraTreesClassifier,
  sklearn.gaussian_process._gpc.GaussianProcessClassifier,
  sklearn.ensemble._gb.GradientBoostingClassifier,
  sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier,
  daal4py.sklearn.linear_model.logistic_path.LogisticRegression,
  sklearn.neural_network._multilayer_perceptron.MLPClassifier,
  sklearn.linear_model._passive_aggressive.PassiveAggressiveClassifier,
  sklearn.linear_model._perceptron.Perceptron,
  daal4py.sklearn.ensemble._forest.RandomForestClassifier,
  sklearn.linear_model._stochastic_gradient.SGDClassifier],
 'partial_fit': [sklearn.naive_bayes.BernoulliNB,
  sklearn.naive_bayes.CategoricalNB,
  sklearn.naive_bayes.ComplementNB,
  sklearn.naive_bayes.GaussianNB,
  sklearn.neural_network._multilayer_perceptron.MLPClassifier,
  sklearn.naive_bayes.MultinomialNB,
  sklearn.linear_model._passive_aggressive.PassiveAggressiveClas

In [60]:
def get_logit_ordinal(X):
    probit = {c: {} for c in discrete_ordinal}

    for c in discrete_ordinal:
        vc = X_master[c].value_counts()
        N = X_master.shape[0]
        vc = vc / N
        probit_value = {val: 0 for val in X_master[c].unique()}
        for val in X_master[c].unique():
            probit_value[val] = np.log1p(
                vc[vc.index <= val].sum() / (vc[vc.index > val].sum() + 1e-09)
            )
        probit[c] = probit_value

    def transform_ordinal_to_logit(row):
        for c in discrete_ordinal:
            row[c] = probit[c][row[c]]
        return row

    probit_x = pd.DataFrame(X, columns=discrete_ordinal)
    probit_x.apply(transform_ordinal_to_logit, axis=1)
    return probit_x[discrete_ordinal].to_numpy()


base_n = BaseNEncoder(cols=nominal)
numeric = list(
    np.setdiff1d(raw_data_eval.columns, discrete_binary + discrete_ordinal + nominal)
)
tree_ready = make_column_transformer(
    (base_n, nominal),
    (
        Normalizer(),
        numeric,
    ),
    ("passthrough", discrete_binary + discrete_ordinal),
    remainder="passthrough",
)

tree_ready

linear_ready = make_column_transformer(
    #     (OneHotEncoder(sparse=False),nominal),
    (BaseNEncoder(cols=nominal), nominal),
    (
        FunctionTransformer(get_logit_ordinal, feature_names_out="one-to-one"),
        discrete_ordinal,
    ),
    (StandardScaler(), numeric),
    remainder="passthrough",
)


bayesian_prep = make_column_transformer(
    (OneHotEncoder(sparse=False), nominal),
    ("passthrough", discrete_binary + discrete_ordinal),
    (make_pipeline(FunctionTransformer(np.absolute), Normalizer()), numeric),
    remainder="drop",
    sparse_threshold=0,
)

In [59]:
X = raw_data[raw_data_eval.columns]
y = raw_data.target
X_train, X_test, y_train, y_test = gen_train_test(X, y, test_size=0.3)
c_msk = np.array([True] * 44 + [False] * 15 + [True] * 21)

c_msk = c_msk.reshape(
    80,
)
svc = SVC(
    probability=True,
    class_weight="balanced",
    random_state=42,
    tol=1e-06,
    kernel="rbf",
    gamma="scale",
    break_ties=True,
)
sgd = SGDClassifier(
    warm_start=True,
    average=True,
    random_state=42,
    fit_intercept=False,
    n_iter_no_change=50,
    loss="squared_hinge",
    max_iter=10000,
)
hgb = HistGradientBoostingClassifier(
    warm_start=True,
    categorical_features=c_msk,
    random_state=42,
    scoring="f1_macro",
    max_iter=200,
    early_stopping="auto",
    n_iter_no_change=50,
)
rfc = RandomForestClassifier(
    warm_start=True,
    min_impurity_decrease=0.00055,
    n_jobs=24,
    random_state=42,
    max_features=None,
    class_weight="balanced_subsample",
)
efc = ExtraTreesClassifier(
    warm_start=True,
    min_impurity_decrease=0.00055,
    n_jobs=24,
    random_state=42,
    max_features=None,
    class_weight="balanced_subsample",
)
ALPHA = 1.0e-10
cnb = CategoricalNB(alpha=ALPHA, fit_prior=True, min_categories=2)
conb = ComplementNB(alpha=ALPHA, fit_prior=True, norm=False)
benb = BernoulliNB(binarize=False, alpha=ALPHA, fit_prior=True)
munb = MultinomialNB(alpha=ALPHA, fit_prior=True)


clfs = []
for tree_learner in [hgb, rfc, efc]:
    clf_ = BaggingClassifier(
        base_estimator=tree_learners,
        n_estimators=50,
        random_state=42,
        n_jobs=24,
        warm_start=True,
    )
    plug = make_pipeline(linear_ready, OneVsRestClassifier(clf_))
    clfs.append((tree_learner.__class__.__name__, tree_learner))

for linear_learner in [svc, sgd]:
    clf_ = BaggingClassifier(
        base_estimator=linear_learner,
        random_state=42,
        n_jobs=24,
        warm_start=True,
    )
    plug = make_pipeline(tree_ready, OneVsRestClassifier(clf_))
    clfs.append((linear_learner.__class__.__name__, linear_learner))

for bayesian_learner in [cnb, conb, benb, munb]:
    clf_ = BaggingClassifier(
        base_estimator=linear_learner,
        random_state=42,
        n_jobs=24,
        warm_start=True,
    )
    plug = make_pipeline(bayesian_prep, OneVsRestClassifier(clf_))
    clfs.append((bayesian_learner.__class__.__name__, linear_learner))

VP = VotingClassifier(
    estimators=clfs,
    voting="soft",
)
clf_ = BaggingClassifier(
    base_estimator=hgb,
    random_state=42,
    n_jobs=24,
    warm_start=True,
)
plug = make_pipeline(tree_ready, OneVsRestClassifier(clf_))
y_pred_base = plug.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred_base))


### Re- Init CLF
plug = make_pipeline(tree_ready, OneVsRestClassifier(clf_))


cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2)

for train_index, test_index in cv.split(X_train, y_train):
    X_train_, X_test_ = X.loc[train_index, :], X.loc[test_index, :]
    y_train_, y_test_ = y.loc[train_index], y.loc[test_index]
    #     plug.fit(X_test_, y_test_)
    plug.fit(X_train_, y_train_)
y_pred = plug.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.77      0.65      0.70       373
         1.0       0.72      0.94      0.81       559
         2.0       0.51      0.22      0.31       207

    accuracy                           0.71      1139
   macro avg       0.66      0.60      0.61      1139
weighted avg       0.70      0.71      0.69      1139

              precision    recall  f1-score   support

         0.0       0.87      0.83      0.85       373
         1.0       0.84      0.94      0.89       559
         2.0       0.84      0.63      0.72       207

    accuracy                           0.85      1139
   macro avg       0.85      0.80      0.82      1139
weighted avg       0.85      0.85      0.85      1139

