In [1]:
# базовые библиотеки
from sklearn.model_selection import cross_validate
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings("ignore") 
from tqdm import tqdm_notebook
from scipy.sparse import hstack, vstack, csc_matrix
import os, re, sys, gc, pickle, time
from collections import defaultdict
import joblib

# валидация, оптимизация гиперпараметров
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score,\
                                    KFold, train_test_split, cross_validate, ParameterGrid
from sklearn.base import BaseEstimator, TransformerMixin,  clone
# пайплайн
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin,  clone
from sklearn.metrics import make_scorer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, OneHotEncoder

# дамми-регрессор
from sklearn.dummy import DummyRegressor

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
import time
from sklearn.feature_selection import SelectFromModel
import shutil

# нейронные сети 
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import Adam
from keras.metrics import RootMeanSquaredError
from tensorflow.keras.callbacks import EarlyStopping


from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectFromModel
# вспомогательные модули
from bil_ml_tools import SklearnHelperMulticollinearityReducer, SklearnHelperRegressorValidator,\
                         SklearnHelperMetaFeaturesRegressor,\
                         SklearnHelperFeatureSelector, SklearnHelperTargetEncoder,\
                         SklearnHelperColumnSelector, SklearnHelperLabelEncoder,\
                         train_hold_test_split, convert_dtypes

In [2]:
# Notebook-wide constants.
# SEED is reused for every seeded splitter/estimator created below.
SEED = 13
# Sentinel value; not referenced in the visible cells — presumably used to fill
# missing values when the pickled feature matrices were built (TODO confirm).
FILL_NA = -9999
# Cross-validation scheme: 3 shuffled folds with a fixed seed.
KF = KFold(3, random_state = SEED, shuffle = True)
# Quality metric.
def NEG_RMSE_SCORING_FUNC(y_true, y_pred):
    """Return the negated root-mean-squared error between targets and predictions.

    The sign is flipped so that a larger value means a better fit.
    """
    residuals = y_true - y_pred
    mean_squared_error = np.mean(np.square(residuals))
    return -np.sqrt(mean_squared_error)
# Scorer object built from the negated-RMSE function above; it is passed as
# `scoring` to every tuner created later in the notebook.
NEG_RMSE_SCORER = make_scorer(NEG_RMSE_SCORING_FUNC)

In [3]:
# Load the raw table. `convert_dtypes` comes from bil_ml_tools; the `.dt`
# accessors below assume it parses the three date columns into datetimes
# (TODO confirm against the helper).
df = convert_dtypes(pd.read_csv('datasets/autos.csv'))
df_c = df.copy()

# Add calendar features derived from the timestamp columns.
# NOTE(review): `.dt.weekofyear` is deprecated since pandas 1.1 and removed in
# pandas 2.0 (replacement: `.dt.isocalendar().week`, which returns a different
# dtype). It is kept as-is here so the resulting feature values do not change.
for col in ['DateCrawled', 'LastSeen']:
    for part in ['year', 'month', 'day', 'dayofweek',
                 'hour', 'minute', 'weekofyear', 'quarter']:
        df_c[col + '.' + part] = getattr(df[col].dt, part)

# 'DateCreated' only contributes its date components.
for part in ['year', 'month', 'day']:
    df_c['DateCreated.' + part] = getattr(df['DateCreated'].dt, part)

# Drop the raw timestamps. `columns=` replaces the positional `axis` argument,
# which newer pandas versions no longer accept.
df_c = df_c.drop(columns=['DateCreated', 'DateCrawled', 'LastSeen'])

# Release the raw frame; only the enriched copy is used from here on.
del df
gc.collect()

# Remove columns that hold exactly one distinct value.
df_c = df_c.loc[:, df_c.nunique() != 1]
FEATURES, TARGET = df_c.drop(columns='Price'), df_c['Price']

In [4]:
# Split features/target into train, hold-out and test parts with a fixed seed.
# NOTE(review): how `tr_size` / `ho_size` / `use_test` translate into the final
# proportions is defined in bil_ml_tools.train_hold_test_split — confirm there.
(features_tr, features_ho, features_te,
 target_tr, target_ho, target_te) = train_hold_test_split(
    FEATURES,
    TARGET,
    tr_size=.9,
    ho_size=.1,
    shuffle=True,
    random_state=SEED,
    stratify=False,
    use_test=True,
)

In [5]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # produced by a trusted run of this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pre-computed feature matrices for the linear models ...
X_lin_tr = _load_pickle('X_lin_tr.pkl')
X_lin_ho = _load_pickle('X_lin_ho.pkl')
X_lin_te = _load_pickle('X_lin_te.pkl')

# ... and for the tree-based models.
X_tree_tr = _load_pickle('X_tree_tr.pkl')
X_tree_ho = _load_pickle('X_tree_ho.pkl')
X_tree_te = _load_pickle('X_tree_te.pkl')

In [6]:
# Targets as plain NumPy arrays (train / hold-out / test).
y_tr = target_tr.values
y_ho = target_ho.values
y_te = target_te.values

In [7]:
class SklearnHelperRegressorHPTuner(BaseEstimator, TransformerMixin):
    """Model-specific hyper-parameter search for a fixed set of regressors.

    The search strategy is chosen from the class name of ``model``:

    * ``LGBMRegressor`` / ``XGBRegressor`` -- grid search over tree-structure
      parameters, then over row/column subsampling (both with 10 trees), then a
      coarse-to-fine learning-rate search using the model's own
      ``n_estimators``.
    * ``RandomForestRegressor`` / ``ExtraTreesRegressor`` -- grid search over
      depth and leaf size with 10 trees; the winner is then re-scored by
      cross-validation with the model's own ``n_estimators``.
    * ``DecisionTreeRegressor`` / ``ExtraTreeRegressor``, ``Ridge`` / ``Lasso``,
      ``LinearSVR``, ``KNeighborsRegressor`` -- a single grid search.

    Parameters
    ----------
    model : estimator
        Unfitted regressor of one of the supported classes.
    cv : cross-validation splitter
        Passed to ``GridSearchCV`` / ``cross_val_score``.
    scoring : scorer
        Scorer where a greater value is better.

    Attributes (set by ``fit``)
    ---------------------------
    best_estimator_ : the tuned estimator, fitted on the full ``X``, ``y``.
    best_score_ : mean cross-validated score of the selected configuration.
    coef_imp : flattened ``coef_`` or ``feature_importances_`` of
        ``best_estimator_``; ``None`` when the estimator exposes neither.
    """

    # Coarse learning-rate candidates, scanned in increasing order.
    _COARSE_LEARNING_RATES = (.005, .006, .007, .008, .009,
                              .01, .02, .03, .04, .05, .06, .07, .08, .09,
                              .1, .2, .3, .4, .5)
    # Parameters overridden during the cheap grid searches and restored to the
    # user's values afterwards.
    _RESTORED_PARAMS = ('n_estimators', 'random_state', 'n_jobs')

    def __init__(self, model, cv, scoring):
        self.model = model
        self.cv = cv
        self.scoring = scoring

    def info(self):
        """Placeholder kept for interface compatibility; does nothing."""
        pass

    def fit(self, X, y=None):
        """Run the search that matches ``self.model`` and fit the winner on ``X``, ``y``.

        Raises
        ------
        AssertionError
            If ``X`` is neither a NumPy array nor a CSC sparse matrix.
        ValueError
            If the model class is not supported, or if no learning rate yields
            a finite cross-validation score.
        """
        assert (isinstance(X, np.ndarray)) or (X.getformat() == 'csc')
        model_name = type(self.model).__name__

        if model_name in ('LGBMRegressor', 'XGBRegressor'):
            self._fit_boosting(X, y, model_name)
        elif model_name in ('RandomForestRegressor', 'ExtraTreesRegressor'):
            self._fit_forest(X, y)
        else:
            param_grid = self._single_stage_grid(model_name)
            if param_grid is None:
                # Previously this fell through and failed later with an
                # AttributeError on `best_estimator_`.
                raise ValueError(
                    'Unsupported model type for tuning: {}'.format(model_name))
            gs = self._grid_search(clone(self.model), param_grid, X, y)
            self.best_estimator_ = gs.best_estimator_
            self.best_score_ = gs.best_score_

        self.best_estimator_.fit(X, y)
        self.coef_imp = self._extract_importances(self.best_estimator_)
        return self

    def predict(self, X):
        """Predict with the tuned estimator."""
        return self.best_estimator_.predict(X)

    # ------------------------------------------------------------------ helpers

    def _grid_search(self, estimator, param_grid, X, y):
        """Run an exhaustive grid search with the tuner's CV and scorer."""
        gs = GridSearchCV(estimator,
                          param_grid=param_grid,
                          cv=self.cv,
                          scoring=self.scoring,
                          n_jobs=-1, verbose=1)
        gs.fit(X, y)
        return gs

    @staticmethod
    def _single_stage_grid(model_name):
        """Return the one-shot parameter grid for ``model_name`` (``None`` if unsupported)."""
        if model_name in ('DecisionTreeRegressor', 'ExtraTreeRegressor'):
            return {'max_depth': np.arange(7, 41), 'min_samples_leaf': [2, 20, 200]}
        if model_name in ('Ridge', 'Lasso'):
            # NOTE(review): the `normalize` parameter was removed from
            # scikit-learn linear models in release 1.2; this grid only works
            # on older versions. Kept unchanged to preserve the tuned results.
            return {'alpha': [.001, .002, .003, .004, .005,
                              .01, .02, .03, .04, .05, .06, .07, .08, .09,
                              .1, .2, .3, .4, .5, .6, .7, .8, .9,
                              1, 2, 3, 4, 5],
                    'normalize': [True]}
        if model_name == 'LinearSVR':
            return {'C': [.5, 1, 2, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
                          150, 200, 250, 300, 350, 400, 450, 500]}
        if model_name == 'KNeighborsRegressor':
            return {'n_neighbors': range(2, 11)}
        return None

    def _initial_params(self):
        """Return the user-supplied values of the parameters restored after grid search."""
        init_params = self.model.get_params()
        return {name: init_params[name] for name in self._RESTORED_PARAMS}

    def _fit_boosting(self, X, y, model_name):
        """Three-stage search shared by LightGBM and XGBoost."""
        if model_name == 'LGBMRegressor':
            structure_grid = {'max_depth': np.arange(4, 21).tolist(),
                              'num_leaves': [32, 64, 128, 256, 512, 1024],
                              'min_child_samples': [20, 50]}
            # NOTE(review): LightGBM documents that row subsampling only takes
            # effect when `subsample_freq` > 0 — verify this grid is not a no-op.
            sampling_grid = {'subsample': np.linspace(.1, 1, 10),
                             'colsample_bytree': np.linspace(.1, 1, 10)}
        else:
            structure_grid = {'max_depth': np.arange(4, 21).tolist(),
                              'min_child_weight': [20, 50]}
            sampling_grid = {'subsample': [.5, .6, .7, .8, .9, 1],
                             'colsample_bytree': [.5, .6, .7, .8, .9, 1]}

        restored = self._initial_params()
        estimator = clone(self.model)

        # Stage 1: tree structure, evaluated cheaply with 10 trees. The seed is
        # taken from the model itself rather than from a notebook global.
        stage_one_grid = dict(structure_grid)
        stage_one_grid.update({'n_estimators': [10],
                               'n_jobs': [-1],
                               'random_state': [restored['random_state']]})
        best_params = dict(self._grid_search(estimator, stage_one_grid, X, y).best_params_)
        estimator.set_params(**best_params)

        # Stage 2: row / column subsampling on top of the stage-1 winner.
        best_params.update(self._grid_search(estimator, sampling_grid, X, y).best_params_)
        best_params.update(restored)
        estimator.set_params(**best_params)

        # Stage 3: learning rate, with the user's n_estimators restored.
        best_lr, best_score = self._search_learning_rate(estimator, X, y)
        self.best_estimator_ = estimator.set_params(learning_rate=best_lr)
        self.best_score_ = best_score

    def _fit_forest(self, X, y):
        """Grid search for bagged tree ensembles, then re-score with the real ensemble size."""
        restored = self._initial_params()
        estimator = clone(self.model)
        gs = self._grid_search(estimator,
                               {'max_depth': np.arange(5, 21),
                                'min_samples_leaf': [2, 20],
                                'n_estimators': [10],
                                'n_jobs': [-1],
                                'random_state': [restored['random_state']]},
                               X, y)
        best_params = dict(gs.best_params_)
        best_params.update(restored)
        self.best_estimator_ = estimator.set_params(**best_params)
        self.best_score_ = cross_val_score(self.best_estimator_,
                                           X, y,
                                           cv=self.cv, scoring=self.scoring, n_jobs=-1).mean()

    def _greedy_lr_scan(self, estimator, X, y, candidates):
        """Scan ``candidates`` in order and stop at the first non-improving score.

        Each candidate is evaluated on a clone, so ``estimator`` is never
        mutated. Returns ``(best_lr, best_score)``; ``best_lr`` is ``None`` if
        not even the first candidate produced a score greater than ``-inf``.
        """
        best_lr, best_score = None, -np.inf
        for lr in tqdm_notebook(candidates):
            candidate = clone(estimator).set_params(learning_rate=float(lr))
            mean_cv_score = cross_val_score(candidate, X, y,
                                            cv=self.cv, scoring=self.scoring).mean()
            if mean_cv_score > best_score:
                best_lr, best_score = float(lr), mean_cv_score
            else:
                break
        return best_lr, best_score

    def _search_learning_rate(self, estimator, X, y):
        """Coarse scan followed by a fine scan around the coarse optimum."""
        coarse_lr, coarse_score = self._greedy_lr_scan(
            estimator, X, y, self._COARSE_LEARNING_RATES)
        if coarse_lr is None:
            raise ValueError('Learning-rate search produced no finite CV score.')

        # Learning rates must be positive; the window around a small coarse
        # optimum (e.g. 0.005) would otherwise include non-positive values.
        fine_grid = [lr for lr in np.linspace(coarse_lr - .009, coarse_lr + .009, 50)
                     if lr > 0]
        fine_lr, fine_score = self._greedy_lr_scan(estimator, X, y, fine_grid)

        # Never return something worse than what the coarse scan already found.
        if fine_lr is not None and fine_score > coarse_score:
            return fine_lr, fine_score
        return coarse_lr, coarse_score

    @staticmethod
    def _extract_importances(estimator):
        """Return flattened coefficients or feature importances, or ``None`` if unavailable."""
        for attribute in ('coef_', 'feature_importances_'):
            values = getattr(estimator, attribute, None)
            if values is not None:
                return np.asarray(values).flatten()
        return None
    
class SklearnHelperFeatureSelector(BaseEstimator, TransformerMixin):
    '''Greedy forward feature selection driven by cross-validation.

    Note: this definition shadows the class of the same name imported from
    ``bil_ml_tools`` at the top of the notebook.

    Procedure used by ``fit``:

    1. Score every column on its own and rank columns by that score.
    2. Start from the best single column; walk through the remaining columns
       in rank order and keep each one that improves the mean CV score.
    3. Re-try the rejected columns until a full pass adds nothing.

    Parameters
    ----------
    model : estimator scored with ``cross_val_score``.
    cv : cross-validation splitter.
    scoring : scorer where a greater value is better.
    show_progress : bool, print every new best score when true.

    Attributes (set by ``fit``)
    ---------------------------
    best_features : list of selected column indices, in order of selection.
    best_cv_score : mean CV score of the selected column set.
    '''
    def __init__(self, model, cv, scoring, show_progress):
        self.model = model
        self.cv = cv
        self.scoring = scoring
        self.show_progress = show_progress

    def _mean_cv_score(self, X_subset, y):
        """Mean cross-validated score of ``self.model`` on the given columns."""
        return cross_val_score(self.model, X_subset, y,
                               cv=self.cv, scoring=self.scoring, n_jobs=-1).mean()

    def fit(self, X, y=None):
        """Select the column subset; returns ``self`` so ``fit_transform`` works."""
        assert (isinstance(X, np.ndarray)) or (X.getformat() == 'csc')
        if X.shape[1] == 0:
            raise ValueError('X has no columns to select from.')

        # Step 1: score each column individually.
        cv_scores = []
        for i in tqdm_notebook(range(X.shape[1])):
            column = X[:, i]
            if hasattr(column, 'toarray'):
                # Sparse column -> dense, as single-column input for the model.
                column = column.toarray()
            cv_scores.append(self._mean_cv_score(column.reshape(-1, 1), y))

        order = [int(i) for i in np.argsort(cv_scores)[::-1]]

        # Step 2/3: start from the best single column. Its score is the
        # baseline every extension has to beat (previously the baseline was
        # -inf, so the first candidate was accepted even when it hurt).
        best_features = [order[0]]
        best_cv_score = cv_scores[order[0]]
        pending = order[1:]
        while True:
            rejected = []
            for i in tqdm_notebook(pending):
                curr_features = best_features + [i]
                mean_cv_score = self._mean_cv_score(X[:, curr_features], y)
                if mean_cv_score > best_cv_score:
                    best_cv_score = mean_cv_score
                    best_features = curr_features
                    if self.show_progress:
                        print('new best score = {:.5f}'.format(best_cv_score))
                else:
                    rejected.append(i)
            # Stop once a full pass over the candidates added nothing.
            if rejected == pending:
                break
            pending = rejected

        self.best_features = best_features
        self.best_cv_score = best_cv_score
        return self

    def transform(self, X):
        """Return ``X`` restricted to the selected columns."""
        return X[:, self.best_features]

In [8]:
def _make_tuner(model):
    """Wrap `model` in a tuner that uses the notebook-wide CV splitter and scorer."""
    return SklearnHelperRegressorHPTuner(model=model, cv=KF, scoring=NEG_RMSE_SCORER)


# Every estimator that accepts a seed gets SEED, so repeated runs are
# reproducible (previously only Ridge, LightGBM and XGBoost were seeded).
ridge_tuner = _make_tuner(Ridge(random_state=SEED))
linearsvr_tuner = _make_tuner(LinearSVR(random_state=SEED))
knn_tuner = _make_tuner(KNeighborsRegressor())
dt_tuner = _make_tuner(DecisionTreeRegressor(random_state=SEED))
et_tuner = _make_tuner(ExtraTreeRegressor(random_state=SEED))
rf_tuner = _make_tuner(RandomForestRegressor(random_state=SEED))
ets_tuner = _make_tuner(ExtraTreesRegressor(random_state=SEED))
lgb_tuner = _make_tuner(LGBMRegressor(n_jobs=-1, random_state=SEED))
xgb_tuner = _make_tuner(XGBRegressor(n_jobs=-1, random_state=SEED))

# Model names (same order as the tuners below).
L_best_estimators_names = ['Ridge', 'LinearSVR', 'KNeighborsRegressor', 'DecisionTree',
                           'ExtraTree', 'RandomForest', 'ExtraTrees', 'Lightgbm', 'XGBoost']

# Hyper-parameter tuners.
L_hptuners = [ridge_tuner, linearsvr_tuner, knn_tuner, dt_tuner, et_tuner,
              rf_tuner, ets_tuner, lgb_tuner, xgb_tuner]

# Feature matrices, position-aligned with the tuners: the two linear models
# get the "lin" matrices, everything else the "tree" matrices.
# NOTE(review): KNeighborsRegressor is paired with the tree features — confirm
# this is intended, since distance-based models are sensitive to feature scale.
L_base_X_tr = (X_lin_tr,) * 2 + (X_tree_tr,) * 7
L_base_X_ho = (X_lin_ho,) * 2 + (X_tree_ho,) * 7
L_base_X_te = (X_lin_te,) * 2 + (X_tree_te,) * 7

# The tuning loop zips these sequences; zip would silently drop models on a
# length mismatch, so fail loudly here instead.
assert (len(L_best_estimators_names) == len(L_hptuners)
        == len(L_base_X_tr) == len(L_base_X_ho) == len(L_base_X_te))

In [9]:
# Tune every model, then record its CV score, hold-out score and wall time.
D_tuners = {}
L_cvAB = []
tuning_jobs = zip(L_best_estimators_names, L_hptuners, L_base_X_tr, L_base_X_ho)
for name, tuner, X_tr, X_ho in tqdm_notebook(tuning_jobs,
                                             total=len(L_best_estimators_names)):
    # Wall-clock duration of the whole search, rounded to whole seconds.
    started_at = time.time()
    tuner.fit(X_tr, y_tr)
    duration = round(time.time() - started_at)

    D_tuners[name] = tuner

    mean_cv_score = tuner.best_score_
    ho_score = NEG_RMSE_SCORING_FUNC(y_ho, tuner.predict(X_ho))
    L_cvAB.append((name, mean_cv_score, ho_score, duration))

# Scores are negated RMSE values; abs() turns them into plain RMSE before
# sorting by the hold-out error (best model first).
cvAB = (
    pd.DataFrame(L_cvAB, columns=['model', 'cv', 'ho', 'duration'])
    .set_index('model')
    .abs()
    .sort_values('ho')
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))

Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  84 out of  84 | elapsed:   14.6s finished


Fitting 3 folds for each of 23 candidates, totalling 69 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  69 out of  69 | elapsed:   36.5s finished


Fitting 3 folds for each of 102 candidates, totalling 306 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 306 out of 306 | elapsed:  2.6min finished


Fitting 3 folds for each of 102 candidates, totalling 306 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   59.6s
[Parallel(n_jobs=-1)]: Done 306 out of 306 | elapsed:  1.7min finished


Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:  4.5min finished


Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:  3.9min finished


Fitting 3 folds for each of 204 candidates, totalling 612 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   43.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 612 out of 612 | elapsed:  2.8min finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   54.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.7min finished


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=19.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))


Fitting 3 folds for each of 34 candidates, totalling 102 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done 102 out of 102 | elapsed:  3.1min finished


Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  4.1min finished


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=19.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))





In [17]:
# Hard-coded benchmark table, presumably copied from an earlier tuning run
# (it lists 8 models). Each row is [cv score, hold-out score, duration].
L_columns = ['cv', 'ho', 'duration']
_RECORDED_RESULTS = [
    ('XGBoost',      [1658.3696729058192, 1651.033588994316, 1840.0]),
    ('Lightgbm',     [1659.3959979856463, 1652.6949683713735, 552.0]),
    ('ExtraTrees',   [1696.7469431322731, 1686.8034695407114, 529.0]),
    ('RandomForest', [1731.746899524733, 1711.96235265943, 694.0]),
    ('DecisionTree', [1987.450608882015, 1968.9164993050645, 160.0]),
    ('ExtraTree',    [2036.5300070397982, 2020.1216114481401, 105.0]),
    ('Ridge',        [2104.8194806322613, 2131.1950747669393, 16.0]),
    ('LinearSVR',    [2175.164966299662, 2200.292824822986, 52.0]),
]
L_index = [model_name for model_name, _ in _RECORDED_RESULTS]
L_values = [row for _, row in _RECORDED_RESULTS]

In [18]:
# Rebuild the comparison table from the hard-coded values; this rebinds the
# `cvAB` produced by the tuning loop earlier in the notebook.
cvAB = pd.DataFrame(data=L_values, index=L_index, columns=L_columns)

In [19]:
cvAB

Unnamed: 0,cv,ho,duration
XGBoost,1658.369673,1651.033589,1840.0
Lightgbm,1659.395998,1652.694968,552.0
ExtraTrees,1696.746943,1686.80347,529.0
RandomForest,1731.7469,1711.962353,694.0
DecisionTree,1987.450609,1968.916499,160.0
ExtraTree,2036.530007,2020.121611,105.0
Ridge,2104.819481,2131.195075,16.0
LinearSVR,2175.164966,2200.292825,52.0


In [174]:
class SklearnHelperStackingRegressor(BaseEstimator, TransformerMixin):
    """Builds stacking meta-features from out-of-fold predictions.

    ``fit`` runs two stages:

    1. *Classic stacking* -- every base model is trained with K-fold CV on its
       own feature matrix; the out-of-fold predictions form one meta-feature
       column per base model.
    2. *Randomised stacking* (only if ``n_iterations`` is not ``None``) -- in
       each iteration a random base model, a random share of its columns and a
       bootstrap sample of the training rows are used, producing one extra
       meta-feature column per iteration.

    Every fold model is pickled into ``path_to_folder`` so that ``predict``
    can rebuild the same columns for unseen data by averaging the fold models.

    Parameters
    ----------
    L_base_models : list of estimators (fitted in place during ``fit``).
    n_iterations : int or None, number of randomised iterations.
    nfolds : int, number of CV folds.
    seed : int, base seed; model/iteration ``i`` uses ``seed + i``.
    path_to_folder : str, directory for the pickled fold models.
        WARNING: ``fit`` deletes this directory recursively before writing.

    Attributes (set by ``fit``)
    ---------------------------
    nrows : number of training rows.
    Z : meta-features for the training data, one column per base model
        followed by one column per randomised iteration.
    Z2 : the randomised-stage columns only (when that stage produced any).
    saved_models_ : list (one entry per column of ``Z``) of lists with the
        pickle paths of that column's fold models.
    """
    def __init__(self, L_base_models, n_iterations, nfolds, seed, path_to_folder):
        # The constructor only stores parameters. The folder is reset in
        # `fit`, so constructing or cloning an instance can no longer wipe
        # previously saved models.
        self.L_base_models = L_base_models
        self.n_iterations = n_iterations
        self.nfolds = nfolds
        self.seed = seed
        self.path_to_folder = path_to_folder

    def _reset_model_folder(self):
        """Delete and recreate the directory that holds the pickled fold models."""
        if os.path.isdir(self.path_to_folder):
            shutil.rmtree(self.path_to_folder)
        os.makedirs(self.path_to_folder)

    def _save_fold_model(self, basename, model, columns_to_use, source_idx, fold_idx):
        """Pickle ``(model, columns_to_use, source_idx, fold_idx)`` and return the file path.

        ``source_idx`` is the position in ``L_X`` of the feature matrix the
        model was trained on; ``columns_to_use`` is ``None`` for all columns.
        """
        filename = os.path.join(self.path_to_folder, basename)
        with open(filename, 'wb') as f:
            pickle.dump((model, columns_to_use, source_idx, fold_idx), f)
        return filename

    def fit(self, L_X, y=None):
        """Train all fold models and build the training meta-features ``self.Z``.

        ``L_X`` is a sequence of feature matrices, position-aligned with
        ``L_base_models``; ``y`` is the target vector (required).
        """
        if y is None:
            raise ValueError('y is required to fit the stacking regressor.')
        # Positional indexing below must not depend on a pandas index.
        y = np.asarray(y)

        self._reset_model_folder()
        self.nrows = L_X[0].shape[0]
        self.saved_models_ = []

        print('classic stacking ...')
        # Classic stacking.
        classic_columns = []
        for i, (model, X) in tqdm_notebook(enumerate(zip(self.L_base_models, L_X)),
                                           total=len(self.L_base_models)):
            kf = KFold(self.nfolds, random_state=i + self.seed, shuffle=True)
            oof = np.zeros(y.shape[0])
            fold_files = []
            for j, (tr_idx, val_idx) in tqdm_notebook(enumerate(kf.split(X, y)),
                                                      total=self.nfolds):
                model.fit(X[tr_idx], y[tr_idx])
                # The pickle is a snapshot of this fold's fit; the same model
                # object is refitted on the next fold.
                fold_files.append(self._save_fold_model(
                    f'model_{i+1}_{j+1}.pickle', model, None, i, j))
                oof[val_idx] = np.ravel(model.predict(X[val_idx]))
            classic_columns.append(oof)
            self.saved_models_.append(fold_files)

        self.Z = np.column_stack(classic_columns)

        if self.n_iterations is not None:
            print('random model + features + oversampling stacking ...')
            indexes = np.arange(len(self.L_base_models))
            subsamples = [.5, .6, .7, .8, .9]
            random_columns = []
            for i in tqdm_notebook(range(self.n_iterations)):
                current_seed = i + self.seed
                # Dedicated generator: every random choice of this iteration
                # is reproducible from `current_seed`.
                rng = np.random.RandomState(current_seed)

                # Pick the base model and the share of columns it will see.
                current_idx = int(rng.choice(indexes))
                subsample = rng.choice(subsamples)
                kf = KFold(self.nfolds, random_state=current_seed, shuffle=True)

                current_model = self.L_base_models[current_idx]
                X_full = L_X[current_idx]
                nrows, ncols = X_full.shape

                # Random column subset (at least one column).
                n_columns = max(1, int(np.around(ncols * subsample)))
                columns_to_use = rng.choice(np.arange(ncols), n_columns, replace=False)
                X_subset = X_full[:, columns_to_use]

                oof = np.zeros(y.shape[0])
                fold_files = []
                for j, (tr_idx, val_idx) in tqdm_notebook(enumerate(kf.split(X_subset, y)),
                                                          total=self.nfolds):
                    # Bootstrap `nrows` rows from the training fold. Sampling
                    # the absolute row positions keeps features and targets
                    # aligned.
                    rows_to_use = rng.choice(tr_idx, nrows, replace=True)
                    current_model.fit(X_subset[rows_to_use], y[rows_to_use])

                    fold_files.append(self._save_fold_model(
                        f'model2_{i+1}_{j+1}.pickle',
                        current_model, columns_to_use, current_idx, j))

                    oof[val_idx] = np.ravel(current_model.predict(X_subset[val_idx]))

                random_columns.append(oof)
                self.saved_models_.append(fold_files)

            if random_columns:
                self.Z2 = np.column_stack(random_columns)
                self.Z = np.column_stack([self.Z, self.Z2])
        return self

    def predict(self, L_X):
        """Return the meta-feature matrix for ``L_X``.

        If the first matrix has as many rows as the training data, the stored
        out-of-fold matrix ``self.Z`` is returned (the input is assumed to BE
        the training data — a different data set of equal length would get the
        training meta-features too). Otherwise each column is rebuilt by
        averaging the predictions of that column's saved fold models, giving
        the same column layout as ``self.Z``.
        """
        if self.nrows == L_X[0].shape[0]:
            return self.Z

        meta_columns = []
        for fold_files in tqdm_notebook(self.saved_models_):
            fold_predictions = []
            for filename in fold_files:
                # NOTE: these pickles are written by `fit`; never point
                # `path_to_folder` at files from an untrusted source.
                with open(filename, 'rb') as f:
                    model, columns_to_use, source_idx, _ = pickle.load(f)
                X = L_X[source_idx]
                if columns_to_use is not None:
                    X = X[:, columns_to_use]
                fold_predictions.append(np.ravel(model.predict(X)))
            # One meta-feature per base model / iteration: mean over its folds.
            meta_columns.append(np.mean(fold_predictions, axis=0))

        X_meta = np.column_stack(meta_columns)
        return X_meta

In [175]:
# Base learners for the stacking run: two linear models, both fed the
# linear-model feature matrices.
L_base_models = [Ridge(), LinearSVR()]
# NOTE: these rebind the L_base_X_* tuples that were defined for the tuners.
L_base_X_tr = (X_lin_tr,) * 2
L_base_X_ho = (X_lin_ho,) * 2
L_base_X_te = (X_lin_te,) * 2

In [176]:
# The stacker deletes and recreates `path_to_folder`, so it must be a
# dedicated scratch directory. A path relative to the working directory keeps
# the notebook portable (previously an absolute, user-specific Windows path).
stacking_reg = SklearnHelperStackingRegressor(
    L_base_models=L_base_models,
    n_iterations=2,
    nfolds=5,
    seed=SEED,
    path_to_folder='stacking',
)

In [None]:
# Train all fold models and build the training meta-features.
stacking_reg.fit(L_base_X_tr, y_tr)

classic stacking ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))



random model + features + oversampling stacking ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))

In [None]:
# Same row count as the data used in fit, so predict returns the stored
# out-of-fold meta-feature matrix instead of re-predicting.
stacking_reg.predict(L_base_X_tr)

In [None]:
# Meta-features for the hold-out matrices; when the row count differs from
# the training data, they are computed from the fold models saved on disk.
stacking_reg.predict(L_base_X_ho)

In [None]:
# Meta-features for the test matrices; when the row count differs from the
# training data, they are computed from the fold models saved on disk.
stacking_reg.predict(L_base_X_te)