In [8]:
import pandas as pd
import numpy as np
from scipy import  stats
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm_notebook
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csc_matrix, hstack
from sklearn.model_selection import cross_validate, train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import pickle
from sklearn.preprocessing import OneHotEncoder
from collections import defaultdict
from matplotlib_venn import venn2
import time
import os
from sklearn.model_selection import KFold

from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [9]:
class StackingRegressor():
    """Fits every base model on each K-fold training split and exposes the
    predictions of all fitted copies as meta-features.

    After `fit`, `fitted_estimators` holds len(models) * n_folds independent
    estimators, ordered model-major (all folds of the first model, then all
    folds of the second, ...).

    NOTE: `get_metafeatures` predicts with every fold model on every row it is
    given. When called on the training matrix, each fold model has seen
    (n_folds - 1) / n_folds of those rows, so the result is NOT a strict
    out-of-fold prediction.
    """
    def __init__(self, models, n_folds, seed):
        '''
        models  - list of (unfitted) base estimators forming the ensemble
        n_folds - number of K-fold splits
        seed    - random seed used to shuffle the K-fold split
        '''
        self.models = models
        self.n_folds = n_folds
        self.seed = seed

    def fit(self, X, y):
        '''
        1) fit a fresh copy of every base model on each fold's training part
        2) keep the fitted copies in `self.fitted_estimators`

        X must support row indexing with an integer array (numpy array or
        scipy sparse matrix); y is converted to a numpy array so that rows are
        always selected by position. Returns self.
        '''
        # Positional indexing: a pandas Series would otherwise be indexed by label.
        y_arr = np.asarray(y)

        # shuffle=True is required for random_state to have any effect; recent
        # scikit-learn versions raise if random_state is passed without it.
        splitter = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.seed)
        # Compute the folds once so that every base model sees identical splits.
        folds = list(splitter.split(y_arr))

        estimators = []
        for model in tqdm_notebook(self.models):
            for tr_idx, _ in folds:
                # Clone per fold: appending `model` itself would store the same
                # object n_folds times, leaving only the last fold's fit.
                fold_model = clone(model)
                fold_model.fit(X[tr_idx], y_arr[tr_idx])
                estimators.append(fold_model)
        self.fitted_estimators = estimators
        return self

    def get_metafeatures(self, X):
        '''
        Build meta-features with the fitted estimators.

        Returns an array with one row per row of X and one column per fitted
        estimator (len(models) * n_folds columns). Requires a prior `fit`.
        '''
        predictions = [estimator.predict(X)
                       for estimator in tqdm_notebook(self.fitted_estimators)]
        return np.column_stack(predictions)

In [None]:
# Directory that contains the pickled feature files.
# Set the ALICE_FEATURES_DIR environment variable to run on another machine;
# the default is the original machine-specific location.
PATH_TO_FEATURES = os.environ.get(
    'ALICE_FEATURES_DIR',
    r'C:\Users\Sergey\anaconda3\Scripts\alice\selection_results',
)

In [10]:
# Features for the logit (x1) and boosting (x2) models, train (TR) and hold-out (HOLD).
logits_tr, logits_hold, lgbs_tr, lgbs_hold = [], [], [], []


def _load_pickle(path):
    """Return the object unpickled from `path`."""
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


# (model tag, split tag, destination list); checked in order, first match wins.
_FEATURE_ROUTES = [
    ('x1', 'TR', logits_tr),
    ('x1', 'HOLD', logits_hold),
    ('x2', 'TR', lgbs_tr),
    ('x2', 'HOLD', lgbs_hold),
]

# os.listdir returns entries in arbitrary order. Sorting makes the load order
# deterministic, so the i-th TR matrix lines up with the i-th HOLD matrix
# (presumably the file names differ only in the TR/HOLD token - verify).
for filename in sorted(os.listdir(PATH_TO_FEATURES)):
    for model_tag, split_tag, bucket in _FEATURE_ROUTES:
        if model_tag in filename and split_tag in filename:
            bucket.append(_load_pickle(os.path.join(PATH_TO_FEATURES, filename)))
            break

# Targets are read from the current working directory.
_y_TR = _load_pickle('target_TR')
_y_HOLD = _load_pickle('target_HOLD')

In [11]:
# Random seed shared by every seeded base model.
SEED = 13

# Base models used for stacking on the logit feature sets.
BASE_MODELS_LINEAR = [
    Lasso(random_state=SEED),
    Ridge(random_state=SEED),
    KNeighborsRegressor(),
    LinearSVR(random_state=SEED),
]

# Base models used for stacking on the boosting feature sets.
BASE_MODELS_TREE = [
    RandomForestRegressor(random_state=SEED),
    LGBMRegressor(random_state=SEED),
    XGBRegressor(random_state=SEED),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=SEED),
]

In [12]:
# Report how many train feature sets were loaded into logits_tr.
n_feature_sets = len(logits_tr)
print('число датасетов равно {}'.format(n_feature_sets))

число датасетов равно 12


In [13]:
# Stacking hyper-parameters.

# Number of folds used inside the stacker.
N_FOLDS_STACKING = 5
# Number of times the whole stacking procedure is repeated.
N_ITERATIONS = 5
# Number of random feature subsamples drawn per feature set.
N_SUBSAMPLES = 5

# Bounds of the uniform distribution for the share of features that is kept.
uniform_LOW = .5
uniform_HIGH = 1

In [None]:
# Stacking of the linear base models.
# Result layout: one entry per stacking iteration; each entry is a list with
# one meta-feature matrix per (feature set, feature subsample) pair.
_logit_TRs, _logit_HOLDs = [], []

# Repeat the whole stacking procedure N_ITERATIONS times.
for seed1 in tqdm_notebook(range(1, N_ITERATIONS+1)):

    # One stacker per iteration; SEED + seed1 is the seed handed to its fold split.
    stacking_reg_linear = StackingRegressor(models = BASE_MODELS_LINEAR, n_folds=N_FOLDS_STACKING, seed = SEED+seed1)

    # Meta-feature matrices collected during this iteration.
    L_metas_TR, L_metas_HOLD = [], []

    # Selected feature sets (the original note says "for folds 3 to 6" - TODO confirm).
    for features_tr, features_hold in tqdm_notebook(zip(logits_tr, logits_hold), total = len(logits_tr)):

        # Total number of features in this feature set.
        nfeat_total = features_tr.shape[1]

        # Draw N_SUBSAMPLES random feature subsets.
        for seed2 in tqdm_notebook(range(1, N_SUBSAMPLES+1)):

            # Seed that is unique per (seed1, seed2) pair. The previous
            # SEED+seed1+seed2 collided (e.g. (1, 2) and (2, 1)), so most
            # subsets were repeated across iterations.
            np.random.seed(SEED + seed1*N_SUBSAMPLES + seed2)

            # Share of features to keep, uniform on [uniform_LOW, uniform_HIGH).
            feat_share = np.random.uniform(uniform_LOW, uniform_HIGH)
            # Number of features to keep; clamped so that rounding can never
            # yield an empty subset (np.around(0.5) == 0).
            nfeat_to_select = min(nfeat_total, max(1, int(np.around(feat_share*nfeat_total))))
            # Column indices of the kept features.
            feat_idxs_subset = np.random.choice(np.arange(nfeat_total), nfeat_to_select, replace = False)

            # Slice once and reuse for both fitting and prediction.
            X_tr_subset = features_tr[:, feat_idxs_subset]
            X_hold_subset = features_hold[:, feat_idxs_subset]

            # Meta-features for train and hold-out.
            stacking_reg_linear.fit(X_tr_subset, _y_TR)
            L_metas_TR.append(stacking_reg_linear.get_metafeatures(X_tr_subset))
            L_metas_HOLD.append(stacking_reg_linear.get_metafeatures(X_hold_subset))

    _logit_TRs.append(L_metas_TR)
    _logit_HOLDs.append(L_metas_HOLD)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

In [None]:
# Stacking of the tree-based base models.
# Result layout: one entry per stacking iteration; each entry is a list with
# one meta-feature matrix per (feature set, feature subsample) pair.
_lgb_TRs, _lgb_HOLDs = [], []

# Repeat the whole stacking procedure N_ITERATIONS times.
for seed1 in tqdm_notebook(range(1, N_ITERATIONS+1)):

    # One stacker per iteration; SEED + seed1 is the seed handed to its fold split.
    stacking_reg_tree= StackingRegressor(models = BASE_MODELS_TREE, n_folds=N_FOLDS_STACKING, seed = SEED+seed1)

    # Meta-feature matrices collected during this iteration.
    L_metas_TR, L_metas_HOLD = [], []

    # Selected feature sets (the original note says "for folds 3 to 6" - TODO confirm).
    for features_tr, features_hold in tqdm_notebook(zip(lgbs_tr, lgbs_hold), total = len(lgbs_tr)):

        # Total number of features in this feature set.
        nfeat_total = features_tr.shape[1]

        # Draw N_SUBSAMPLES random feature subsets.
        for seed2 in tqdm_notebook(range(1, N_SUBSAMPLES+1)):

            # Seed that is unique per (seed1, seed2) pair. The previous
            # SEED+seed1+seed2 collided (e.g. (1, 2) and (2, 1)), so most
            # subsets were repeated across iterations.
            np.random.seed(SEED + seed1*N_SUBSAMPLES + seed2)

            # Share of features to keep, uniform on [uniform_LOW, uniform_HIGH).
            feat_share = np.random.uniform(uniform_LOW, uniform_HIGH)
            # Number of features to keep; clamped so that rounding can never
            # yield an empty subset (np.around(0.5) == 0).
            nfeat_to_select = min(nfeat_total, max(1, int(np.around(feat_share*nfeat_total))))
            # Column indices of the kept features.
            feat_idxs_subset = np.random.choice(np.arange(nfeat_total), nfeat_to_select, replace = False)

            # Slice once and reuse for both fitting and prediction.
            X_tr_subset = features_tr[:, feat_idxs_subset]
            X_hold_subset = features_hold[:, feat_idxs_subset]

            # Meta-features for train and hold-out.
            stacking_reg_tree.fit(X_tr_subset, _y_TR)
            L_metas_TR.append(stacking_reg_tree.get_metafeatures(X_tr_subset))
            L_metas_HOLD.append(stacking_reg_tree.get_metafeatures(X_hold_subset))

    _lgb_TRs.append(L_metas_TR)
    _lgb_HOLDs.append(L_metas_HOLD)