In [1]:
import gc
import pickle

import numpy as np
from joblib import parallel_backend
from sklearn.linear_model import (
    ARDRegression,
    BayesianRidge,
    ElasticNet,
    GammaRegressor,
    Lasso,
    LassoLars,
    LassoLarsIC,
    LinearRegression,
    PassiveAggressiveRegressor,
    PoissonRegressor,
    QuantileRegressor,
    RANSACRegressor,
    SGDRegressor,
    TheilSenRegressor,
    TweedieRegressor,
)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler
from sklearn.svm import SVR, LinearSVR, NuSVR
from sklearnex import patch_sklearn
from tqdm import tqdm


def conn(path="data_np.pkl"):
    """Load and return the object pickled at ``path``.

    Parameters
    ----------
    path : str or path-like, default "data_np.pkl"
        Location of the pickle file. The default keeps the original
        zero-argument call ``conn()`` working unchanged.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    """
    # NOTE: pickle.load can execute arbitrary code; only use this on files
    # produced by a trusted source.
    with open(path, "rb") as fp:
        return pickle.load(fp)


import warnings

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# cdef np.ndarray X__ = conn()
X__ = conn()

# patch_sklearn() swaps accelerated implementations into the sklearn modules,
# but names that were imported *before* the call keep pointing at the stock
# classes. Re-import after patching so the names used below pick up any
# replacement; names sklearnex does not replace are simply rebound to the
# same objects.
patch_sklearn()
from sklearn.linear_model import (  # noqa: E402
    ARDRegression,
    BayesianRidge,
    ElasticNet,
    GammaRegressor,
    Lasso,
    LassoLars,
    LassoLarsIC,
    LinearRegression,
    PassiveAggressiveRegressor,
    PoissonRegressor,
    QuantileRegressor,
    RANSACRegressor,
    SGDRegressor,
    TheilSenRegressor,
    TweedieRegressor,
)
from sklearn.model_selection import train_test_split  # noqa: E402
from sklearn.svm import SVR, LinearSVR, NuSVR  # noqa: E402


def data_n_miss(n, data=None, flag_col=80):
    """Return the feature columns of the rows whose flag column equals ``n``.

    Parameters
    ----------
    n : scalar
        Value to match in column ``flag_col``.
        (Presumably the number of simultaneously missing features per row --
        TODO confirm against the code that built the pickle.)
    data : np.ndarray, optional
        2-D array to filter. Defaults to the module-level ``X__``.
    flag_col : int, default 80
        Index of the column holding the flag. Columns ``[0, flag_col)`` are
        returned as the features.

    Returns
    -------
    np.ndarray
        ``data[rows, :flag_col]`` for the matching rows, in original row order.
        Has zero rows when nothing matches.
    """
    if data is None:
        data = X__
    # cdef np.ndarray t, idx
    row_mask = data[:, flag_col] == n
    return data[row_mask, :flag_col]


# ms: rows of X__ whose column 80 equals 1; dcl: rows whose column 80 equals 0.
# (Presumably "one feature missing" vs. "complete" rows -- TODO confirm.)
ms = data_n_miss(1)
dcl = data_n_miss(0)

# Shared iteration cap and convergence tolerance for the estimator
# configurations defined below.
iter_ = 10000
tol = 0.000001

# Estimators evaluated by roll() and, by default, clean_learning().
# Only the uncommented entries are active; the rest are earlier candidates
# kept for reference.
estimators = [
    LassoLarsIC(normalize=False, precompute=True, criterion="bic"),
    # LinearRegression(),
    # SGDRegressor(learning_rate='adaptive'),
    # ARDRegression(n_iter=1000,compute_score=True,tol=tol),
    # BayesianRidge(lambda_init=0.001,n_iter=iter_,tol=tol,compute_score=True),
    # PassiveAggressiveRegressor(C=0.5, max_iter=iter_,tol=tol,early_stopping=True,validation_fraction=0.3,n_iter_no_change=20),
    # LinearSVR(tol=tol,max_iter=iter_,random_state=0,C=0.5),
    # NuSVR(kernel='linear',tol=tol),
    SVR(kernel="sigmoid", tol=tol),
    # Lasso(precompute=True,max_iter=iter_,tol=tol,selection='random'),
    # LassoLars(precompute=True,max_iter=iter_),RANSACRegressor(base_estimator=LassoLars(precompute=True,max_iter=iter_),max_trials=1000),TheilSenRegressor(n_jobs=-1),ElasticNet(precompute=True)
]

# One TweedieRegressor per variance power, in the order 0, 1, 1.5, 2, 3.
tweed_estimators = [TweedieRegressor(power=variance_power) for variance_power in (0, 1, 1.5, 2, 3)]


def feat_n_miss():
    """Return the column indices of the module-level ``ms`` that contain NaN.

    Returns
    -------
    np.ndarray
        Ascending integer indices of every column of ``ms`` with at least one
        NaN entry.
    """
    # cdef np.ndarray feed, feat_idx
    nan_count_per_column = np.isnan(ms).sum(axis=0)
    has_nan = nan_count_per_column > 0
    return np.nonzero(has_nan)[0]


def feat_x_nan_idx(feature, cc):
    """Return the row indices of ``cc`` where column ``feature`` is NaN.

    Parameters
    ----------
    feature : int
        Column index to inspect.
    cc : np.ndarray
        2-D array (e.g. the output of ``data_n_miss``).

    Returns
    -------
    np.ndarray
        Ascending row indices whose value in ``feature`` is NaN.
    """
    missing_mask = np.isnan(cc[:, feature])
    return np.nonzero(missing_mask)[0]


# # clean data


def get_clean_data(target_f, dirty_f_idx, size=0.5):
    """Sample evaluation features/targets from the module-level ``dcl`` array.

    Columns ``target_f`` and ``dirty_f_idx`` are removed from the first 80
    columns to form the features; column ``target_f`` is the target.

    Parameters
    ----------
    target_f : int
        Column index used as the regression target.
    dirty_f_idx : int
        Column index excluded from the features alongside the target.
    size : float, default 0.5
        Passed to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple of np.ndarray
        ``(features, targets)`` of the *first* partition returned by
        ``train_test_split`` (the one it labels "train", i.e. the
        ``1 - size`` share); the second partition is discarded.
    """
    excluded = np.array([target_f, dirty_f_idx])
    kept_columns = np.setdiff1d(np.arange(80), excluded)
    features, _, targets, _ = train_test_split(
        dcl[:, kept_columns],
        dcl[:, target_f],
        test_size=size,
        random_state=0,
        shuffle=True,
    )
    return features, targets


from collections import defaultdict

# clean_results[target column][estimator class name] -> MSE, filled by clean_learning().
clean_results = defaultdict(dict)


def clean_learning(target_f, size=0.7, exx=estimators):
    """Fit each estimator on the module-level ``dcl`` rows and record its test MSE.

    Column ``target_f`` is the target and the remaining columns among the
    first 80 are the features. Results are written to
    ``clean_results[target_f][<estimator class name>]``.

    Parameters
    ----------
    target_f : int
        Column index to predict.
    size : float, default 0.7
        Passed to ``train_test_split`` as ``test_size``.
    exx : iterable of estimators, default ``estimators``
        Estimators to evaluate. They are fitted in place, so the objects in
        the shared default list are left in a fitted state.

    Returns
    -------
    None
    """
    d_features = np.setdiff1d(np.arange(80), np.array(target_f))
    X_train, X_test, y_train, y_test = train_test_split(
        dcl[:, d_features], dcl[:, target_f], test_size=size, random_state=0, shuffle=True
    )

    for e in tqdm(exx):
        with parallel_backend("loky"):
            e.fit(X_train, y_train)
            y_pred = e.predict(X_test)
            # Release fit/predict temporaries before and after scoring.
            gc.collect()
            # mean_squared_error expects (y_true, y_pred).
            clean_results[target_f][e.__class__.__name__] = mean_squared_error(y_test, y_pred)
            gc.collect()


def dirty_df(target_f, dirty_f_idx, ms=ms):
    """Build a training set from the rows of ``ms`` where ``dirty_f_idx`` is NaN.

    Parameters
    ----------
    target_f : int
        Column index used as the regression target.
    dirty_f_idx : int
        Column index whose NaN rows are selected; also dropped from the
        features together with ``target_f``.
    ms : np.ndarray, default module-level ``ms``
        Array to draw rows from (the default is bound at definition time).

    Returns
    -------
    tuple of np.ndarray
        ``(features, target)``: the selected rows restricted to the first 80
        columns minus the two excluded ones, and their ``target_f`` values.
    """
    # cdef np.ndarray clash
    excluded = np.array([target_f, dirty_f_idx])
    kept_columns = np.setdiff1d(np.arange(80), excluded)
    nan_rows = feat_x_nan_idx(dirty_f_idx, ms)
    selected_rows = ms[nan_rows, :]
    features = selected_rows[:, kept_columns]
    target = ms[nan_rows, target_f]
    return features, target


# dirty_results[target column][dirty column][estimator class name] -> MSE, filled by roll().
dirty_results = defaultdict(dict)
# Counterpart container for the Tweedie estimators; only referenced by the
# commented-out roll_tweed() below.
dirty_tweed_results = defaultdict(dict)

# # @jit(parallel=True)
def roll(target_f, dirty_f_idx):
    """Train every entry of ``estimators`` on "dirty" rows and score it on clean rows.

    Training data comes from ``dirty_df`` (rows of ``ms`` where
    ``dirty_f_idx`` is NaN); evaluation data comes from ``get_clean_data``.
    The MSE of each estimator is stored in
    ``dirty_results[target_f][dirty_f_idx][<estimator class name>]``; any
    earlier results for the same ``(target_f, dirty_f_idx)`` pair are reset.

    Parameters
    ----------
    target_f : int
        Column index to predict.
    dirty_f_idx : int
        Column index whose missingness defines the training rows.

    Returns
    -------
    None
        Note that the shared ``estimators`` objects are fitted in place.
    """
    dirty_results[target_f][dirty_f_idx] = {}
    x_train, y_train = dirty_df(target_f, dirty_f_idx)
    gc.collect()
    # workflow = make_pipeline((), estimator)
    xtest, ytest = get_clean_data(target_f, dirty_f_idx)
    for workflow in tqdm(estimators):
        with parallel_backend("loky", n_jobs=4):
            print("=" * 30 + f"{workflow.__class__.__name__} started" + "=" * 30)
            workflow.fit(x_train, y_train)
            y_pred = workflow.predict(xtest)
            # mean_squared_error expects (y_true, y_pred).
            dirty_results[target_f][dirty_f_idx][workflow.__class__.__name__] = mean_squared_error(ytest, y_pred)


# def roll_tweed(target_f,dirty_f_idx):
#     global tweed_estimators
#     dirty_tweed_results[target_f][dirty_f_idx]={}
#     x_train, y_train = dirty_df(target_f,dirty_f_idx)
#     # workflow = make_pipeline((), estimator)
#     xtest, ytest = get_clean_data(target_f, dirty_f_idx)
#     for i,workflow in tqdm(enumerate(tweed_estimators)):
#         try:
#             with parallel_backend('threading',n_jobs=-1):
#                 workflow.fit(x_train, y_train)
#                 y_pred = workflow.predict(xtest)
#                 safer = gc.collect()
#                 dirty_tweed_results[target_f][dirty_f_idx][workflow.__class__.__name__+str(i)] = mean_squared_error(y_pred, ytest)
#         except:
#             continue

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
import pandas as pd

# Columns of `ms` that contain missing values.
g = feat_n_miss()
# Target column to predict in this experiment.
yu = 2
# Evaluate the first 13 incomplete columns as the "dirty" feature, skipping
# the target column itself.
for i in tqdm(g[:13]):
    # clean_learning(i)
    if i != yu:
        roll(yu, i)

# Rows: estimator, columns: dirty feature index. Keyed by `yu` so the table
# always matches the target that was actually evaluated above.
pd.DataFrame(dirty_results[yu])

  0%|                                                    | 0/13 [00:00<?, ?it/s]
  0%|                                                     | 0/2 [00:00<?, ?it/s][A




100%|█████████████████████████████████████████████| 2/2 [01:10<00:00, 35.40s/it][A
  8%|███▍                                        | 1/13 [01:11<14:15, 71.27s/it]
  0%|                                                     | 0/2 [00:00<?, ?it/s][A




100%|█████████████████████████████████████████████| 2/2 [01:09<00:00, 34.60s/it][A
 15%|██████▊                                     | 2/13 [02:20<12:53, 70.36s/it]
  0%|                                                     | 0/2 [00:00<?, ?it/s][A




100%|█████████████████████████████████████████████| 2/2 [01:11<00:00, 35.60s/it][A
 31%|█████████████▌                              | 4/13 [03:32<07:18, 48.72s/it]
  0%|                                                     | 0/2 [00:00<?, ?it/s][A




100%|█████████████████████████████████████████████| 2/2 [01:20<00:00, 40.13s/it][A
 38%|████████████████▉                           | 5/13 [04:53<07:48, 58.61s/it]
  0%|                                                     | 0/2 [00:00<?, ?it/s][A




100%|█████████████████████████████████████████████| 2/2 [01:55<00:00, 57.79s/it][A
 46%|████████████████████▎                       | 6/13 [06:49<08:53, 76.24s/it]
  0%|                                                     | 0/2 [00:00<?, ?it/s][A




100%|█████████████████████████████████████████████| 2/2 [02:04<00:00, 62.44s/it][A
 54%|███████████████████████▋                    | 7/13 [08:54<09:07, 91.20s/it]
  0%|                                                     | 0/2 [00:00<?, ?it/s][A




100%|█████████████████████████████████████████████| 2/2 [02:29<00:00, 74.72s/it][A
 62%|██████████████████████████▍                | 8/13 [11:24<09:05, 109.02s/it]
  0%|                                                     | 0/2 [00:00<?, ?it/s][A




100%|█████████████████████████████████████████████| 2/2 [02:13<00:00, 66.94s/it][A
 69%|█████████████████████████████▊             | 9/13 [13:39<07:46, 116.70s/it]
  0%|                                                     | 0/2 [00:00<?, ?it/s][A




100%|█████████████████████████████████████████████| 2/2 [02:28<00:00, 74.18s/it][A
 77%|████████████████████████████████▎         | 10/13 [16:08<06:19, 126.40s/it]
  0%|                                                     | 0/2 [00:00<?, ?it/s][A




100%|█████████████████████████████████████████████| 2/2 [02:08<00:00, 64.28s/it][A
 85%|███████████████████████████████████▌      | 11/13 [18:17<04:14, 127.23s/it]
  0%|                                                     | 0/2 [00:00<?, ?it/s][A




100%|█████████████████████████████████████████████| 2/2 [01:21<00:00, 40.85s/it][A
 92%|██████████████████████████████████████▊   | 12/13 [19:39<01:53, 113.71s/it]
  0%|                                                     | 0/2 [00:00<?, ?it/s][A




100%|█████████████████████████████████████████████| 2/2 [01:13<00:00, 36.92s/it][A
100%|███████████████████████████████████████████| 13/13 [20:53<00:00, 96.46s/it]


Unnamed: 0,0,1,3,4,5,6,7,8,9,10,11,12
LassoLarsIC,0.997421,0.995632,0.995431,0.99573,0.995443,0.995454,0.995652,0.995414,0.995987,0.995512,0.995416,0.995436
SVR,1459.760117,1338.649096,1516.337904,1469.474201,1459.642579,1321.500459,1317.010702,1392.947091,1325.283934,1391.48488,1349.613733,1402.756681


In [None]:
#!/usr/bin/env python
# coding: utf-8

import gc
import pickle
import warnings
from collections import defaultdict

import dpctl
import numpy as np
import pandas as pd
from joblib import parallel_backend
from sklearn import set_config
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import (
    AdaBoostRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    StackingRegressor,
)
from sklearn.feature_selection import (
    SelectFromModel,
    SelectKBest,
    SelectPercentile,
    SequentialFeatureSelector,
    mutual_info_regression,
)
from sklearn.linear_model import (
    ARDRegression,
    BayesianRidge,
    ElasticNet,
    GammaRegressor,
    Lasso,
    LassoLars,
    LassoLarsIC,
    LinearRegression,
    MultiTaskElasticNet,
    MultiTaskLasso,
    PassiveAggressiveRegressor,
    PoissonRegressor,
    QuantileRegressor,
    RANSACRegressor,
    RidgeCV,
    SGDRegressor,
    TheilSenRegressor,
    TweedieRegressor,
)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, PowerTransformer, QuantileTransformer, StandardScaler
from sklearn.svm import SVR, LinearSVR, NuSVR
from sklearn.tree import DecisionTreeRegressor
from sklearnex import patch_sklearn
from sklearnex.ensemble import RandomForestRegressor
from tqdm import tqdm
from xgboost import XGBRFRegressor as xrb

# Display and logging configuration.
pd.options.display.max_columns = 90
pd.options.display.max_rows = 90
warnings.filterwarnings("ignore")
set_config(display="diagram")

# Result containers keyed by target column.
dirty_results = defaultdict(dict)
reference_metadata = defaultdict(dict)
clean_results = defaultdict(dict)


# patch_sklearn() swaps accelerated implementations into the sklearn modules,
# but names imported before the call keep pointing at the stock classes.
# Re-import after patching so the names used below pick up any replacement;
# names sklearnex does not replace are simply rebound to the same objects.
patch_sklearn()
from sklearn.ensemble import (  # noqa: E402
    AdaBoostRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    StackingRegressor,
)
from sklearn.linear_model import (  # noqa: E402
    ARDRegression,
    BayesianRidge,
    ElasticNet,
    GammaRegressor,
    Lasso,
    LassoLars,
    LassoLarsIC,
    LinearRegression,
    MultiTaskElasticNet,
    MultiTaskLasso,
    PassiveAggressiveRegressor,
    PoissonRegressor,
    QuantileRegressor,
    RANSACRegressor,
    RidgeCV,
    SGDRegressor,
    TheilSenRegressor,
    TweedieRegressor,
)
from sklearn.model_selection import train_test_split  # noqa: E402
from sklearn.svm import SVR, LinearSVR, NuSVR  # noqa: E402

# Shared iteration cap and convergence tolerance for the estimators in gen_stack().
iter_ = 10000
tol = 0.000001


def gen_stack():
    """Build an unfitted ``StackingRegressor`` over linear and tree pipelines.

    Each base learner is a three-step pipeline: a ``ColumnTransformer``
    preprocessor, a ``SelectFromModel`` feature selector (ElasticNet, median
    threshold) and the regressor itself. The base learners are stacked with
    3-fold CV and a ``RidgeCV`` final estimator. Reads the module-level
    ``tol`` and ``iter_`` settings.

    Returns
    -------
    StackingRegressor
        A fresh, unfitted stack; each base learner is named after the class
        of its final regressor.
    """
    # Category Selector: every column whose dtype is not float32.
    # NOTE(review): assumes numeric features are float32 and categorical ones
    # uint8; columns of any other dtype would match BOTH selectors -- confirm
    # against the dtypes in data.pkl.
    cat_selector = make_column_selector(dtype_exclude=np.float32)
    # Number Selector: every column whose dtype is not uint8.
    numerical_selector = make_column_selector(dtype_exclude=np.uint8)
    # Feature Selector
    sel = SelectFromModel(estimator=ElasticNet(precompute=True), threshold="median")
    numeric_scaler = StandardScaler()
    cat_scaler = OneHotEncoder(sparse=True)
    linear_prep = ColumnTransformer(
        transformers=[("num", numeric_scaler, numerical_selector), ("categ", cat_scaler, cat_selector)]
    )
    # Fix: the transformer must be an OrdinalEncoder *instance*; passing the
    # class itself cannot be cloned/fitted by ColumnTransformer.
    tree_prep = ColumnTransformer(
        transformers=[("num", numeric_scaler, numerical_selector), ("categ", OrdinalEncoder(), cat_selector)]
    )
    # Numeric columns only; categorical columns are dropped (ColumnTransformer default).
    lasso_linear_prep = ColumnTransformer(transformers=[("num", numeric_scaler, numerical_selector)])
    modis = [
        make_pipeline(lasso_linear_prep, sel, LassoLarsIC(normalize=False, precompute=True, criterion="bic")),
        make_pipeline(lasso_linear_prep, sel, ARDRegression(n_iter=1000, compute_score=True, tol=tol)),
        make_pipeline(linear_prep, sel, BayesianRidge(lambda_init=0.001, n_iter=iter_, tol=tol, compute_score=True)),
        make_pipeline(linear_prep, sel, Lasso(precompute=True, max_iter=iter_, tol=tol, selection="cyclic")),
        make_pipeline(linear_prep, sel, LassoLars(precompute=True, max_iter=iter_)),
        make_pipeline(linear_prep, sel, TweedieRegressor(power=0)),
        make_pipeline(
            linear_prep,
            sel,
            RANSACRegressor(
                min_samples=500,
                base_estimator=LassoLarsIC(normalize=False, precompute=True, criterion="aic"),
                max_trials=10000,
            ),
        ),
        make_pipeline(
            linear_prep,
            sel,
            ElasticNet(
                precompute=True,
            ),
        ),
        make_pipeline(tree_prep, sel, HistGradientBoostingRegressor(max_iter=1000, max_depth=500)),
        make_pipeline(tree_prep, sel, GradientBoostingRegressor(random_state=0, max_depth=30)),
        make_pipeline(tree_prep, sel, DecisionTreeRegressor()),
        make_pipeline(
            tree_prep,
            sel,
            ExtraTreesRegressor(n_jobs=-1),
        ),
        make_pipeline(tree_prep, sel, AdaBoostRegressor(base_estimator=Lasso(precompute=True))),
    ]
    # Name each base learner after its final regressor; look the step up from
    # the end so the naming survives a change in the number of pipeline steps.
    stacked_estimators = [(q.steps[-1][1].__class__.__name__, q) for q in modis]
    learning_stack = StackingRegressor(estimators=stacked_estimators, cv=3, n_jobs=-1, final_estimator=RidgeCV())
    return learning_stack


# Load the pickled DataFrame.
# NOTE: pickle.load can execute arbitrary code; only use with a trusted data.pkl.
with open("data.pkl", "rb") as fp:
    df = pickle.load(fp)

# Keep only rows whose `missing_cols` value is 0, then drop that bookkeeping
# column; X_y holds the features and candidate targets together.
trd = df[df.missing_cols == 0].copy()
X_y = trd.drop(["missing_cols"], axis=1)


def save_pipeline(c, p):
    """Pickle ``p`` to ``stacking_models/stack_<c>.pkl``.

    Parameters
    ----------
    c : str
        Identifier embedded in the file name (the target column name).
    p : object
        Picklable object to store, e.g. a fitted pipeline.

    Returns
    -------
    None
        An existing file with the same name is overwritten.
    """
    import os

    out_dir = "stacking_models"
    # Create the folder on first use so a long fit is not lost to a
    # FileNotFoundError at save time.
    os.makedirs(out_dir, exist_ok=True)
    # Write-only binary mode is sufficient; nothing is read back here.
    with open(f"{out_dir}/stack_{c}.pkl", "wb") as fp:
        pickle.dump(p, fp)


def get_data_feed(c):
    """Split the module-level ``X_y`` frame into train/test sets for target ``c``.

    Parameters
    ----------
    c : str
        Name of the column to predict; every other column is a feature.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` from an 80/20
        ``train_test_split`` with ``random_state=0``.
    """
    features = X_y.drop([c], axis=1)
    target = X_y[c]
    return tuple(train_test_split(features, target, test_size=0.2, random_state=0))


# if
#
# Target column names: F_1_0..F_1_14, F_3_0..F_3_24, F_4_0..F_4_14 (55 in total).
trgs = [
    f"F_{group}_{position}"
    for group, width in ((1, 15), (3, 25), (4, 15))
    for position in range(width)
]

# Manual run switch: the training block below only executes while `start` is 3.
start = 3
if start == 3:
    for cl in trgs:
        # with dpctl.device_context("opencl:gpu"):
        with parallel_backend("threading", n_jobs=1):
            gc.collect()
            # 80/20 split with `cl` as the target column.
            X_train, X_test, y_train, y_test = get_data_feed(cl)
            new_stack = gen_stack()
            gc.collect()

            # Fit the stack, score it on the held-out split, and persist it.
            new_stack.fit(X_train, y_train)
            yp = new_stack.predict(X_test)
            save_pipeline(cl, new_stack)
            print(mean_squared_error(yp, y_test))
        # NOTE(review): this unconditional break ends the loop after the first
        # target in trgs, so only one stack is trained -- presumably a
        # debugging leftover; confirm before relying on the other targets.
        break


#
# 	LassoLarsIC 	BayesianRidge 	Lasso 	LassoLars 	TweedieRegressor 	RANSACRegressor 	ElasticNet
# 0 	1.003801   	1.003801 	1.003801 	1.003801 	1.003801 	1.004282 	1.003801
# 1 	1.000459 	1.000494 	1.000459 	1.000459 	1.000464 	1.002849 	1.000459
# 2 	0.998591 	0.998694 	0.998591 	0.998591 	0.998607 	0.999959 	0.998591

# In[ ]:


# Disabled experiment: target_f is 0, so the `== 9899` guard below is never
# true and the body does not run as written.
# NOTE(review): the body reads `dcl`, which is not defined anywhere in this
# cell -- it would need the array built in the first cell (or a NameError
# results on a fresh kernel).
target_f = 0
if target_f == 9899:
    # Features: the first 80 columns except the target column.
    d_features = np.setdiff1d(np.arange(80), np.array(target_f))
    xf = dcl[:, d_features]
    yf = dcl[:, target_f]
    with dpctl.device_context("opencl:gpu"):
        with parallel_backend("multiprocessing"):
            X_train, X_test, y_train, y_test = train_test_split(xf, yf, test_size=0.5, random_state=0)
            # mod = xrb(
            #     n_estimators=100,
            #     learning_rate=0.001,
            #     booster="gbtree",
            #     n_jobs=-1,
            #     gamma=0.00001,
            #     random_state=0,
            #     importance_type="total_gain",
            #     num_parallel_tree=10,
            #     tree_method="approx",
            # )
            mod = HistGradientBoostingRegressor(max_leaf_nodes=900)
            work = make_pipeline(StandardScaler(), mod)
            y_pred = work.fit(X_train, y_train).predict(X_test)
            # squared=False makes this the root mean squared error.
            print(mean_squared_error(y_pred, y_test, squared=False))