In [1]:
import pandas as pd
import numpy as np
from scipy import  stats
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm_notebook
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csc_matrix, hstack
from sklearn.model_selection import cross_validate, train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import pickle
from sklearn.preprocessing import OneHotEncoder
from collections import defaultdict
from matplotlib_venn import venn2
import time
import os
from sklearn.model_selection import KFold

from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [2]:
class FeatureSelector():
    """Greedy forward selector of feature columns and, optionally, of single column values.

    Every candidate is first scored on its own with ``cross_val_score``; the
    candidates are then tried one by one, best first, and a candidate is kept
    only if it strictly improves the cross-validated score of the matrix
    selected so far.
    """
    def __init__(self, estimator,
                       metric,
                       larger_is_better,
                       cv,
                       use_values,
                       use_recursion,
                       fill_na,
                       show_progress,
                       early_stopping = None):
        '''
        Initialise the selector.

        Parameters:
            1) estimator - model passed to cross_val_score
            2) metric - value passed as ``scoring`` to cross_val_score
               (sklearn metric name or a custom scorer)
            3) larger_is_better - True if a larger score is an improvement
            4) cv - validation scheme passed as ``cv`` to cross_val_score
            5) use_values - indices of columns whose individual values are
               selected instead of the whole column (None to disable)
            6) use_recursion - re-try rejected candidates until a full pass
               accepts none of them
            7) fill_na - value written in place of non-selected values of a
               value-selected column
            8) show_progress - print every new best score
            9) early_stopping - number of consecutive non-improving candidates
               after which the first pass stops (None disables it)
        Methods:
            1) fit - runs the selection
            2) transform - keeps only the selected columns / values
            3) return_self - returns the selector, which exposes
                - best_features - selected column indices (list)
                - D_best_features - selected values ({column index: values})
                - best_score - best cross-validated score reached
        '''
        self.estimator = estimator
        self.metric = metric
        self.cv = cv
        self.use_values = use_values
        self.use_recursion = use_recursion
        self.show_progress = show_progress
        self.early_stopping = early_stopping
        self.fill_na = fill_na
        self.larger_is_better = larger_is_better

    def _score(self, X, Y):
        """Mean cross-validated score of the estimator on (X, Y)."""
        return cross_val_score(self.estimator, X, Y,
                               scoring = self.metric, cv = self.cv).mean()

    @staticmethod
    def _column(X, idx, sparse):
        """Column ``idx`` of X as a dense 1-D array."""
        if sparse:
            return np.asarray(X[:, idx].todense()).ravel()
        return np.asarray(X[:, idx]).ravel()

    def _masked_column(self, X, idx, allowed, sparse):
        """Column ``idx`` with every value outside ``allowed`` replaced by fill_na."""
        column = pd.Series(self._column(X, idx, sparse))
        return column.apply(lambda x: x if x in allowed else self.fill_na)

    def _assemble(self, X, features, value_map, sparse):
        """Build the matrix made of whole ``features`` columns followed by masked value columns."""
        value_columns = [self._masked_column(X, k, v, sparse)
                         for k, v in value_map.items() if len(v) > 0]
        has_features = len(features) > 0
        if not has_features and not value_columns:
            # nothing selected: zero-column matrix with the same number of rows
            return X[:, np.zeros(0, dtype = int)]
        if not value_columns:
            return X[:, features]
        values_block = np.column_stack(value_columns)
        if sparse:
            if has_features:
                return csc_matrix(hstack([X[:, features], csc_matrix(values_block)]))
            return csc_matrix(values_block)
        if has_features:
            return np.column_stack([X[:, features], values_block])
        return values_block

    @staticmethod
    def _add(candidate, features, value_map):
        """Add a (column, value-or-None) candidate to the current selection."""
        feature, feature_value = candidate
        if feature_value is None:
            features.append(feature)
        else:
            value_map[feature].append(feature_value)

    @staticmethod
    def _remove(candidate, features, value_map):
        """Undo ``_add`` for a candidate that did not improve the score."""
        feature, feature_value = candidate
        if feature_value is None:
            features[:] = [val for val in features if val != feature]
        else:
            remaining = [val for val in value_map[feature] if val != feature_value]
            if remaining:
                value_map[feature] = remaining
            else:
                # drop the key so that no constant fill_na column is left behind
                value_map.pop(feature, None)

    def _try_candidates(self, X, Y, candidates, features, value_map, best_score, sparse, patience = None):
        """Try candidates in order, keeping those that strictly improve ``best_score``.

        Returns the updated best score and the list of rejected candidates.
        Stops early after ``patience`` consecutive rejections (if not None).
        """
        rejected = []
        misses = 0
        for candidate in tqdm_notebook(candidates):
            self._add(candidate, features, value_map)
            current_score = self._score(self._assemble(X, features, value_map, sparse), Y)
            if self.larger_is_better:
                improved = current_score > best_score
            else:
                improved = current_score < best_score
            if improved:
                best_score = current_score
                misses = 0
                if self.show_progress:
                    print('new best_score = {}'.format(best_score))
            else:
                misses += 1
                self._remove(candidate, features, value_map)
                rejected.append(candidate)
                if patience is not None and misses == patience:
                    break
        return best_score, rejected

    def fit(self, X, Y):
        """Run the selection on X (2-D numpy array or csc_matrix) against target Y.

        Sets ``best_features``, ``D_best_features``, ``best_score`` and ``flag``
        (True when X is a csc_matrix) and returns ``self``.
        """
        flag = isinstance(X, csc_matrix)
        # (column, value or None, individual score) for every candidate
        column_value_score = []
        for i in tqdm_notebook(range(X.shape[1])):
            ser = self._column(X, i, flag)
            if self.use_values is not None and i in self.use_values:
                # score each unique value through its 0/1 indicator
                for val in np.unique(ser):
                    _x = np.int32(ser == val).reshape(-1, 1)
                    column_value_score.append((i, val, self._score(_x, Y)))
            else:
                # score the column itself (the original passed a stale `_x` here)
                column_value_score.append((i, None, self._score(ser.reshape(-1, 1), Y)))

        # best individual score first; kept as tuples so that column indices
        # are not coerced to floats by an intermediate numpy array
        column_value_score.sort(key = lambda x: x[-1], reverse = bool(self.larger_is_better))
        order = [(feature, feature_value) for feature, feature_value, _ in column_value_score]

        best_features = []
        D_best_features = defaultdict(list)
        # start from the worst possible value so the metric may be negative
        best_score = -np.inf if self.larger_is_better else np.inf

        best_score, to_drop = self._try_candidates(X, Y, order, best_features, D_best_features,
                                                   best_score, flag, patience = self.early_stopping)

        if self.use_recursion:
            # re-try only the still-rejected candidates until a pass accepts none
            while to_drop:
                best_score, to_drop_after = self._try_candidates(X, Y, to_drop, best_features,
                                                                 D_best_features, best_score, flag)
                if len(to_drop_after) == len(to_drop):
                    break
                to_drop = to_drop_after

        self.best_features = best_features
        self.D_best_features = D_best_features
        self.best_score = best_score
        self.flag = flag
        return self

    def transform(self, X):
        """Return X reduced to the selected columns followed by the masked value columns.

        Returns a matrix with zero columns when nothing was selected.
        """
        return self._assemble(X, list(self.best_features), self.D_best_features, self.flag)

    def return_self(self):
        """Return the selector itself."""
        return self


class StackingRegressor():
    """Fits every base model on each K-fold training part and exposes their predictions as metafeatures."""
    def __init__(self, models, n_folds, seed):
        '''
        models - list of base models of the ensemble
        n_folds - number of folds used to fit the base models
        seed - random seed of the fold split
        '''
        self.models = models
        self.n_folds = n_folds
        self.seed = seed

    def fit(self, X, y):
        '''
        1) fit a fresh copy of every model on the training part of every fold
        2) keep the fitted copies in ``fitted_estimators``

        Returns self.
        '''
        targets = np.asarray(y)
        # shuffle=True is required for random_state to take effect; recent
        # scikit-learn raises if random_state is set while shuffle is False
        folds = KFold(self.n_folds, shuffle = True, random_state = self.seed)
        estimators = []
        for model in tqdm_notebook(self.models):
            for tr_idx, _ in folds.split(targets):
                # clone: otherwise every list entry is the same object and
                # each refit overwrites the previous folds' model
                fitted = clone(model)
                fitted.fit(X[tr_idx], targets[tr_idx])
                estimators.append(fitted)
        self.fitted_estimators = estimators
        return self

    def get_metafeatures(self, X):
        '''
        Predict X with every fitted estimator; one column per estimator.
        '''
        predictions = [estimator.predict(X) for estimator in tqdm_notebook(self.fitted_estimators)]
        return np.column_stack(predictions)

In [3]:
# Directory holding the pickled feature-selection results.
# The ALICE_FEATURES_DIR environment variable overrides the original hard-coded location.
PATH_TO_FEATURES = os.environ.get('ALICE_FEATURES_DIR',
                                  r'C:\Users\Sergey\anaconda3\Scripts\alice\selection_results')

In [4]:
# Feature matrices for the logit ('x1') and boosting ('x2') models, train + holdout
logits_tr, logits_hold, lgbs_tr, lgbs_hold = [], [], [], []

# (feature-set tag, split tag) -> destination list; checked in this order, first match wins
_destinations = [(('x1', 'TR'), logits_tr),
                 (('x1', 'HOLD'), logits_hold),
                 (('x2', 'TR'), lgbs_tr),
                 (('x2', 'HOLD'), lgbs_hold)]

# os.listdir returns names in arbitrary order, while the TR and HOLD lists are
# zipped pairwise later; sorting keeps them aligned
# (assumes matching files differ only in the TR/HOLD tag - TODO confirm)
for filename in sorted(os.listdir(PATH_TO_FEATURES)):
    for (set_tag, split_tag), destination in _destinations:
        if set_tag in filename and split_tag in filename:
            # NOTE: pickle.load can execute arbitrary code - only load trusted files
            with open(os.path.join(PATH_TO_FEATURES, filename), 'rb') as f:
                destination.append(pickle.load(f))
            break

# every train matrix needs its holdout counterpart
assert len(logits_tr) == len(logits_hold), 'x1: TR/HOLD file counts differ'
assert len(lgbs_tr) == len(lgbs_hold), 'x2: TR/HOLD file counts differ'

# targets are read from the current working directory
with open('target_TR', 'rb') as f:
    _y_TR = pickle.load(f)

with open('target_HOLD', 'rb') as f:
    _y_HOLD = pickle.load(f)

In [5]:
# random seed shared by the base models
SEED = 13

# base regressors stacked on the logit ('x1') feature sets
BASE_MODELS_LINEAR = [
    Lasso(random_state=SEED),
    Ridge(random_state=SEED),
    KNeighborsRegressor(),
    LinearSVR(random_state=SEED),
]

# base regressors stacked on the boosting ('x2') feature sets
BASE_MODELS_TREE = [
    RandomForestRegressor(random_state=SEED),
    LGBMRegressor(random_state=SEED),
    XGBRegressor(random_state=SEED),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=SEED),
]

In [6]:
# report how many 'x1' train feature sets were loaded
n_datasets = len(logits_tr)
print('число датасетов равно {}'.format(n_datasets))

число датасетов равно 12


In [7]:
# stacking hyperparameters
N_FOLDS_STACKING = 5  # folds used when fitting the base models
N_ITERATIONS = 5      # how many times the whole stacking is repeated
N_SUBSAMPLES = 5      # random feature subsets drawn per feature set
# bounds of the uniform distribution of the share of features used
uniform_LOW = .5
uniform_HIGH = 1

In [None]:
# stacking of the linear models
_logit_TRs, _logit_HOLDs = [], []

# repeat the stacking N_ITERATIONS times
for seed1 in tqdm_notebook(range(1, N_ITERATIONS+1)):

    # fix the fold scheme of this iteration
    stacking_reg_linear = StackingRegressor(models = BASE_MODELS_LINEAR, n_folds=N_FOLDS_STACKING, seed = SEED+seed1)

    # metafeatures collected during this iteration
    L_metas_TR, L_metas_HOLD = [], []

    # one (train, holdout) pair per selected feature set
    for features_tr, features_hold in tqdm_notebook(zip(logits_tr, logits_hold), total = len(logits_tr)):

        # densify once per feature set instead of once per subsample
        __X_tr = np.array(features_tr.todense())
        __X_hold = np.array(features_hold.todense())
        # total number of features
        nfeat_total = __X_tr.shape[1]

        # draw N_SUBSAMPLES random feature subsets
        for seed2 in tqdm_notebook(range(1, N_SUBSAMPLES+1)):

            # seed unique per (seed1, seed2): the plain sum SEED+seed1+seed2
            # collides, e.g. (1, 2) and (2, 1) drew the same subset
            np.random.seed(SEED + seed1*(N_SUBSAMPLES+1) + seed2)

            # share of features to use (uniform between uniform_LOW and uniform_HIGH)
            feat_share = np.random.uniform(uniform_LOW, uniform_HIGH)
            # number of features to use; at least one so the models always get input
            nfeat_to_select = max(1, int(np.around(feat_share*nfeat_total)))
            # indices of the features to use
            feat_idxs_subset = np.random.choice(np.arange(nfeat_total), nfeat_to_select, replace = False)

            # metafeatures
            stacking_reg_linear.fit(__X_tr[:, feat_idxs_subset], _y_TR)
            L_metas_TR.append(stacking_reg_linear.get_metafeatures(__X_tr[:, feat_idxs_subset]))
            L_metas_HOLD.append(stacking_reg_linear.get_metafeatures(__X_hold[:, feat_idxs_subset]))

    _logit_TRs.append(L_metas_TR)
    _logit_HOLDs.append(L_metas_HOLD)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

In [None]:
# stacking of the tree models
_lgb_TRs, _lgb_HOLDs = [], []

# repeat the stacking N_ITERATIONS times
for seed1 in tqdm_notebook(range(1, N_ITERATIONS+1)):

    # fix the fold scheme of this iteration
    stacking_reg_tree= StackingRegressor(models = BASE_MODELS_TREE, n_folds=N_FOLDS_STACKING, seed = SEED+seed1)

    # metafeatures collected during this iteration
    L_metas_TR, L_metas_HOLD = [], []

    # one (train, holdout) pair per selected feature set
    for features_tr, features_hold in tqdm_notebook(zip(lgbs_tr, lgbs_hold), total = len(lgbs_tr)):

        # densify once per feature set instead of once per subsample
        __X_tr = np.array(features_tr.todense())
        __X_hold = np.array(features_hold.todense())
        # total number of features
        nfeat_total = __X_tr.shape[1]

        # draw N_SUBSAMPLES random feature subsets; this loop was missing, so
        # `seed2` was a leftover from the linear stacking cell
        for seed2 in tqdm_notebook(range(1, N_SUBSAMPLES+1)):

            # seed unique per (seed1, seed2): the plain sum SEED+seed1+seed2 collides
            np.random.seed(SEED + seed1*(N_SUBSAMPLES+1) + seed2)

            # share of features to use (uniform between uniform_LOW and uniform_HIGH)
            feat_share = np.random.uniform(uniform_LOW, uniform_HIGH)
            # number of features to use; at least one so the models always get input
            nfeat_to_select = max(1, int(np.around(feat_share*nfeat_total)))
            # indices of the features to use
            feat_idxs_subset = np.random.choice(np.arange(nfeat_total), nfeat_to_select, replace = False)

            # metafeatures
            stacking_reg_tree.fit(__X_tr[:, feat_idxs_subset], _y_TR)
            L_metas_TR.append(stacking_reg_tree.get_metafeatures(__X_tr[:, feat_idxs_subset]))
            L_metas_HOLD.append(stacking_reg_tree.get_metafeatures(__X_hold[:, feat_idxs_subset]))

    _lgb_TRs.append(L_metas_TR)
    _lgb_HOLDs.append(L_metas_HOLD)

In [None]:
# Each *_TRs / *_HOLDs entry is itself a list of 2-D metafeature blocks (one per
# feature subset). Flatten before stacking: np.column_stack on the nested lists
# would build a 3-D array instead of a (n_samples, n_metafeatures) matrix.
X1_TR = np.column_stack([block for run in _logit_TRs for block in run])
X2_TR = np.column_stack([block for run in _lgb_TRs for block in run])
X1_HOLD = np.column_stack([block for run in _logit_HOLDs for block in run])
X2_HOLD = np.column_stack([block for run in _lgb_HOLDs for block in run])
# the original cell bound this matrix as `X2_tr`, while the next cell reads `X2_TR`;
# keep the old name as an alias
X2_tr = X2_TR

In [None]:
# build pairwise interactions of the metafeatures

def _pairwise_interactions(matrix, n_columns):
    """For every column pair i < j (i, j < n_columns): arithmetic mean, then sqrt of the product."""
    features = []
    first_idx, second_idx = np.triu_indices(n_columns, k=1)
    for i, j in zip(first_idx, second_idx):
        left, right = matrix[:, i], matrix[:, j]
        features.append((left+right) / 2)
        # NOTE(review): sqrt of a negative product yields NaN - confirm the
        # metafeatures are non-negative
        features.append(np.sqrt(left*right))
    return features

# holdout pairs are enumerated with the train column count, as in the train loop
interactions1_TR = _pairwise_interactions(X1_TR, X1_TR.shape[1])
interactions1_HOLD = _pairwise_interactions(X1_HOLD, X1_TR.shape[1])

interactions2_TR = _pairwise_interactions(X2_TR, X2_TR.shape[1])
interactions2_HOLD = _pairwise_interactions(X2_HOLD, X2_TR.shape[1])

In [None]:
# stack the interaction columns into the final feature matrices
X1_final_TR = np.column_stack(interactions1_TR)
# note: this name ends in lowercase `tr`, unlike the other three
X2_final_tr = np.column_stack(interactions2_TR)
X1_final_HOLD = np.column_stack(interactions1_HOLD)
X2_final_HOLD = np.column_stack(interactions2_HOLD)

In [None]:
# NOTE(review): `tscv` and `FILL_NA` are not defined in the visible notebook -
# they must be created in an earlier cell before this one runs.

def _make_selector(estimator):
    """FeatureSelector with the settings shared by all four selectors."""
    return FeatureSelector(estimator = estimator,
                           metric = 'roc_auc',
                           larger_is_better = True,
                           cv = tscv,
                           use_values = None,
                           use_recursion = False,
                           fill_na = FILL_NA,
                           show_progress = False)

lgb_selector = _make_selector(LGBMRegressor(random_state = SEED))
knn_selector = _make_selector(KNeighborsRegressor())
# the keyword is `random_state`; `random_State` raised TypeError
lasso_selector = _make_selector(Lasso(random_state = SEED))
ridge_selector = _make_selector(Ridge(random_state = SEED))

In [None]:
# NOTE(review): the original cell transformed `X1_final_TE`, which is not
# defined in the visible notebook; `X1_final_HOLD` is assumed to be the intended
# holdout matrix - confirm.

def _fit_selector(selector):
    """Fit the selector on the train matrix; return (train, holdout, best score)."""
    selector.fit(X1_final_TR, _y_TR)
    return (selector.transform(X1_final_TR),
            selector.transform(X1_final_HOLD),
            selector.return_self().best_score)

_lgb_tr, _lgb_hold, lgb_best_score = _fit_selector(lgb_selector)
_knn_tr, _knn_hold, knn_best_score = _fit_selector(knn_selector)
_lasso_tr, _lasso_hold, lasso_best_score = _fit_selector(lasso_selector)
_ridge_tr, _ridge_hold, ridge_best_score = _fit_selector(ridge_selector)

# train rows stacked on top of holdout rows (np.vstack is the non-deprecated
# name of np.row_stack); the original lists were missing the separating commas
X_lgb = np.vstack([_lgb_tr, _lgb_hold])
X_knn = np.vstack([_knn_tr, _knn_hold])
X_lasso = np.vstack([_lasso_tr, _lasso_hold])
X_ridge = np.vstack([_ridge_tr, _ridge_hold])

# blending weights proportional to each selector's best score
scores = pd.Series([lgb_best_score, knn_best_score, lasso_best_score, ridge_best_score])
weights = (scores / scores.sum()).values.flatten()
Xs = (X_lgb, X_knn, X_lasso, X_ridge)

# per-selector train / holdout matrices, in the same order as `weights`
# (presumably what the blending cell reads as Xstr / Xste - confirm)
Xstr = (_lgb_tr, _knn_tr, _lasso_tr, _ridge_tr)
Xste = (_lgb_hold, _knn_hold, _lasso_hold, _ridge_hold)

In [None]:
# NOTE(review): this cell expects Xstr / Xste (per-selector train / holdout
# matrices) and the classifiers logit_clf / lgb_clf to be defined by an earlier
# cell; none of them is defined in the original notebook as seen here.
L1, L2 = [], []
for Xtr, Xte, weight in zip(Xstr, Xste, weights):
    # scale by this selector's own weight (the original multiplied by the whole
    # `weights` vector) and train on the loaded train target `_y_TR`
    L1.append(logit_clf.fit(Xtr, _y_TR).predict_proba(Xte)[:, 1].flatten()*weight)
    L2.append(lgb_clf.fit(Xtr, _y_TR).predict_proba(Xte)[:, 1].flatten()*weight)

# sum over selectors (axis 0) to get one blended probability per sample
final_blend = np.sum(L1, 0)*.7 + .3 *np.sum(L2, 0)