### Вспомогательные функции

#### выбирает колонки для пайплайна

In [2]:
class SklearnHelperColumnSelector(BaseEstimator, TransformerMixin):
    '''Select the columns that are passed on to the pipeline.'''

    def __init__(self, columns):
        # Key(s) later used to index X in transform().
        self.columns = columns

    def fit(self, X, y=None):
        '''Nothing is learned; return the transformer itself.'''
        return self

    def transform(self, X, y=None):
        '''Return ``X`` indexed by the configured ``columns``.'''
        selected = X[self.columns]
        return selected

#### факторизация категорий

In [None]:
class SklearnHelperLabelEncoder(TransformerMixin, BaseEstimator):
    '''Factorize categories: replace each distinct value of a column with an integer code.'''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        '''
        Learn one ``{stringified value: code}`` mapping per column of ``X``.

        Codes are assigned in order of first appearance; the mappings are
        stored in ``self.d1`` keyed by column name.
        '''
        as_text = X.astype(str)
        self.d1 = {
            name: {level: code
                   for code, level in enumerate(as_text[name].dropna().unique())}
            for name in as_text.columns
        }
        return self

    def transform(self, X):
        '''
        Return a stringified copy of ``X`` whose fitted columns are replaced by
        their integer codes (values absent from the mapping are mapped to NaN
        by ``Series.map``).
        '''
        encoded = X.astype(str)
        for name, mapping in self.d1.items():
            encoded[name] = encoded[name].map(mapping)
        return encoded



#### кодирование категорий с помощью целевой переменной

In [None]:
class SklearnHelperTargetEncoder(BaseEstimator, TransformerMixin):
    '''
    Encode categories with the mean of the target variable.

        1. split the training data into folds
        2. encode every validation fold with category means computed on the
           remaining folds (out-of-fold estimate)
        3. repeat n_iter times with a different shuffle and average

    Parameters
    ----------
    n_iter : number of repetitions of the K-fold procedure
    n_folds : number of folds per repetition
    min_samples_leaf : minimum number of training rows a category needs for
        its mean to be used; rarer categories fall back to the global mean
    seed : base random seed; repetition ``i`` uses ``seed + i``

    Both the training and the test branch of ``transform`` return a frame with
    one column per encoded feature.
    '''
    def __init__(self, n_iter, n_folds, min_samples_leaf, seed):
        self.n_iter = n_iter
        self.n_folds = n_folds
        self.min_samples_leaf = min_samples_leaf
        self.seed = seed

    def fit(self, X, y=None):
        '''
        Compute out-of-fold encodings for the training rows and keep every
        per-fold ``{category: mean}`` dictionary for encoding unseen data.

        ``X`` is a DataFrame of categorical columns; ``y`` is assumed to be a
        pandas Series aligned with ``X`` (TODO confirm against callers).

        Raises ValueError if ``y`` is not provided.
        '''
        if y is None:
            raise ValueError('SklearnHelperTargetEncoder.fit requires the target y')
        self.y_mean = y.mean()
        # axis must be passed by keyword: pandas >= 2.0 rejects it positionally.
        _df_tr = pd.concat([X, y], axis=1)
        target_col = _df_tr.columns[-1]
        to_encode = _df_tr.columns[:-1]

        self.L_d_encs = []
        oof_sum = np.zeros((_df_tr.shape[0], len(to_encode)))
        for i in tqdm_notebook(range(self.n_iter)):
            # NaN marks rows whose category was unseen / too rare in the training folds.
            oof = np.full(oof_sum.shape, np.nan)
            for col_pos, col in enumerate(to_encode):
                kf = KFold(self.n_folds, shuffle=True, random_state=self.seed + i)
                for tr_idx, val_idx in kf.split(_df_tr):
                    grp = _df_tr.iloc[tr_idx].groupby(col)[target_col].agg(['mean', 'count'])
                    d_enc = grp.loc[grp['count'] >= self.min_samples_leaf, 'mean'].to_dict()
                    self.L_d_encs.append((col, d_enc))
                    # Positional assignment keeps this correct for any index.
                    oof[val_idx, col_pos] = (
                        _df_tr[col].iloc[val_idx].map(d_enc).to_numpy(dtype=float))
            oof_sum += np.where(np.isnan(oof), self.y_mean, oof)

        # Average over repetitions so train and test encodings share one shape.
        self.enc_tr = pd.DataFrame(oof_sum / self.n_iter,
                                   index=_df_tr.index, columns=to_encode)
        self._df_tr = _df_tr
        return self

    def transform(self, X):
        '''
        Return the target encoding of ``X``.

        If ``X`` is exactly the frame seen in ``fit`` the out-of-fold encodings
        are returned; otherwise every feature is encoded with the average of
        all per-fold dictionaries. Values that cannot be encoded are replaced
        by the global target mean.
        '''
        # Compare against the feature columns only: _df_tr also holds the target.
        train_features = self._df_tr.iloc[:, :-1]
        if X.equals(train_features):
            return self.enc_tr.fillna(self.y_mean)

        # Each feature has n_iter * n_folds dictionaries in L_d_encs.
        n_encodings = self.n_iter * self.n_folds
        df_enc = pd.DataFrame(0.0, index=X.index, columns=X.columns)
        for feat, d in tqdm_notebook(self.L_d_encs):
            df_enc[feat] += X[feat].map(d).to_numpy(dtype=float) / n_encodings
        return df_enc.fillna(self.y_mean)

#### разделение данных на 3 части

In [None]:
def train_hold_test_split(features, target, tr_size, ho_size, shuffle, random_state, stratify, use_test):
    '''
    Split the data into train / hold-out (and optionally test) parts.

    With ``use_test`` false a single split is made: ``tr_size`` goes to train,
    the rest to hold-out, and the resulting sizes are printed.
    With ``use_test`` true the data is first split into (train + hold-out) and
    test using ``tr_size``, then (train + hold-out) is split again with
    ``1 - ho_size`` going to train.

    Returns
    -------
    ``(features_tr, features_ho, target_tr, target_ho)`` when ``use_test`` is
    false, otherwise
    ``(features_tr, features_ho, features_te, target_tr, target_ho, target_te)``.
    '''
    if not use_test:
        # two-way split: train and hold-out only
        features_tr, features_ho, target_tr, target_ho = train_test_split(
            features, target,
            train_size=tr_size,
            shuffle=shuffle, random_state=random_state,
            stratify=target if stratify else None)
        message = 'train size = {}, hold size ={}'
        print(message.format(features_tr.shape[0], features_ho.shape[0]))
        return (features_tr, features_ho, target_tr, target_ho)

    # three-way split, step 1: separate the test part
    features_trho, features_te, target_trho, target_te = train_test_split(
        features, target,
        train_size=tr_size,
        shuffle=shuffle, random_state=random_state,
        stratify=target if stratify else None)
    # step 2: divide the remainder into train and hold-out
    features_tr, features_ho, target_tr, target_ho = train_test_split(
        features_trho, target_trho,
        train_size=1 - ho_size,
        shuffle=shuffle, random_state=random_state,
        stratify=target_trho if stratify else None)
    return (features_tr, features_ho, features_te, target_tr, target_ho, target_te)

#### Отбор признаков

In [None]:
class SklearnHelperFeatureSelector(BaseEstimator, TransformerMixin):
    '''
    Greedy forward feature selection driven by cross-validation.

        1. score every feature on its own
        2. add features one by one in order of decreasing individual score,
           keeping a feature only if it improves the mean CV score
        3. re-try the rejected features until a full pass adds nothing

    Parameters
    ----------
    model : estimator evaluated with ``cross_val_score``
    cv : cross-validation splitter / number of folds
    scoring : scoring passed to ``cross_val_score``
    show_progress : print every new best score when truthy

    After ``fit``: ``best_features_`` holds the selected column indices and
    ``best_score_`` the corresponding mean CV score.
    '''
    def __init__(self, model, cv, scoring, show_progress):
        self.model = model
        self.cv = cv
        self.scoring = scoring
        self.show_progress = show_progress

    def _cv_score(self, X, y):
        '''Mean cross-validated score of ``self.model`` on ``X``.'''
        return cross_val_score(self.model, X, y, cv=self.cv,
                               scoring=self.scoring, n_jobs=-1).mean()

    @staticmethod
    def _single_column(X, i):
        '''Column ``i`` of ``X`` as a dense ``(n_rows, 1)`` array.'''
        column = X[:, i]
        # Sparse matrices expose toarray(); dense arrays are used as is.
        if hasattr(column, 'toarray'):
            column = column.toarray()
        return column.reshape(-1, 1)

    def _forward_pass(self, X, y, candidates, selected, best_score):
        '''
        Try to add every candidate to ``selected``.

        Returns the updated ``(selected, best_score, rejected)`` where
        ``rejected`` lists the candidates that did not improve the score.
        '''
        rejected = []
        for i in tqdm_notebook(candidates):
            trial = selected + [i]
            score = self._cv_score(X[:, trial], y)
            if score > best_score:
                best_score, selected = score, trial
                if self.show_progress:
                    print('new best score = {:.10f}'.format(best_score))
            else:
                rejected.append(i)
        return selected, best_score, rejected

    def fit(self, X, y=None):
        '''
        Select the feature subset. ``X`` must support ``X[:, idx]`` indexing
        (the code handles objects with and without ``toarray``).
        Returns ``self``.
        '''
        #assert (isinstance(X, np.ndarray)) or (X.getformat() == 'csc')
        cv_scores = [self._cv_score(self._single_column(X, i), y)
                     for i in tqdm_notebook(range(X.shape[1]))]

        order = np.argsort(cv_scores)[::-1]
        selected = [order[0]]
        # The best single feature is the baseline every addition has to beat.
        best_score = cv_scores[order[0]]
        if np.isnan(best_score):
            # A NaN baseline would reject every candidate (NaN comparisons are False).
            best_score = -np.inf

        selected, best_score, rejected = self._forward_pass(
            X, y, order[1:], selected, best_score)
        # Rejected features may become useful next to later additions.
        while rejected:
            selected, best_score, still_rejected = self._forward_pass(
                X, y, rejected, selected, best_score)
            if still_rejected == rejected:
                break
            rejected = still_rejected

        self.best_features_ = selected
        self.best_score_ = best_score
        # Returning self is required by fit_transform() and Pipeline.
        return self

    def transform(self, _X):
        '''Return the selected columns of ``_X``.'''
        return _X[:, self.best_features_]
    


#### оптимизация гиперпараметров

In [5]:
class SklearnHelperClassifierHPTuner(BaseEstimator, TransformerMixin):
    '''
    Hyper-parameter tuning for a fixed set of classifier types.

    The search strategy is picked from the class name of ``model``:
    LGBMClassifier, XGBClassifier, DecisionTreeClassifier, ExtraTreeClassifier,
    RandomForestClassifier, ExtraTreesClassifier, LogisticRegression,
    LinearSVC and KNeighborsClassifier are supported.

    Parameters
    ----------
    model : classifier to tune (it is cloned, never modified)
    cv : cross-validation splitter / number of folds
    scoring : scoring passed to GridSearchCV / cross_val_score

    After ``fit``: ``best_estimator_`` is the tuned estimator fitted on the
    full data, ``best_score_`` its CV score and ``coef_imp_`` the flattened
    ``coef_`` or ``feature_importances_`` (``None`` if neither exists).
    '''
    # Learning rates tried in increasing order; the scan stops at the first
    # value that does not improve the CV score.
    _LEARNING_RATES = (.005, .006, .007, .008, .009,
                       .01, .02, .03, .04, .05, .06, .07, .08, .09,
                       .1, .2, .3, .4, .5)
    # Parameters of the user's model that are overridden during the cheap grid
    # search and restored afterwards.
    _PRESERVED_PARAMS = ('n_estimators', 'random_state', 'n_jobs')

    def __init__(self, model, cv, scoring):
        self.model = model
        self.cv = cv
        self.scoring = scoring

    def info(self):
        pass

    def _grid_search(self, estimator, param_grid, X, y):
        '''Run a fitted GridSearchCV of ``estimator`` over ``param_grid``.'''
        gs = GridSearchCV(estimator,
                          param_grid=param_grid,
                          cv=self.cv,
                          scoring=self.scoring,
                          n_jobs=-1, verbose=1)
        gs.fit(X, y)
        return gs

    def _preserved_params(self):
        '''Values of ``_PRESERVED_PARAMS`` as configured on the user's model.'''
        init_params = self.model.get_params()
        return {name: init_params[name] for name in self._PRESERVED_PARAMS}

    def _tune_boosting(self, estimator, X, y, tree_grid, sampling_grid):
        '''
        Three-stage tuning shared by LGBMClassifier and XGBClassifier:
        tree-structure grid, then sampling grid, then a learning-rate scan
        with the user's original n_estimators / random_state / n_jobs.
        Sets ``best_estimator_`` (unfitted) and ``best_score_``.
        '''
        preserved = self._preserved_params()
        best_params = {}

        best_params.update(self._grid_search(estimator, tree_grid, X, y).best_params_)
        estimator.set_params(**best_params)

        best_params.update(self._grid_search(estimator, sampling_grid, X, y).best_params_)
        best_params.update(preserved)
        estimator.set_params(**best_params)

        best_score, best_lr = -np.inf, None
        for lr in tqdm_notebook(self._LEARNING_RATES):
            best_params['learning_rate'] = lr
            estimator.set_params(**best_params)
            mean_cv_score = cross_val_score(estimator, X, y,
                                            cv=self.cv, scoring=self.scoring).mean()
            if mean_cv_score > best_score:
                best_score, best_lr = mean_cv_score, lr
            else:
                break
        if best_lr is None:
            # Happens when the very first CV score is NaN.
            raise ValueError('could not evaluate any learning rate: CV score is NaN')

        best_params['learning_rate'] = best_lr
        self.best_estimator_ = estimator.set_params(**best_params)
        self.best_score_ = best_score

    @staticmethod
    def _extract_importances(estimator):
        '''Flattened ``coef_`` or ``feature_importances_``; None if unavailable.'''
        for attr in ('coef_', 'feature_importances_'):
            try:
                return getattr(estimator, attr).flatten()
            except AttributeError:
                continue
        return None

    def fit(self, X, y=None):
        '''
        Tune the model on ``(X, y)`` and fit the best configuration on the
        full data. Returns ``self``.

        Raises ValueError for an unsupported model type.
        '''
        #assert (isinstance(X, np.ndarray)) or (X.getformat() == 'csc')
        model_name = type(self.model).__name__
        estimator = clone(self.model)

        # NOTE(review): SEED is not defined in this block; it is presumably a
        # notebook-level constant - confirm it exists before using the
        # LGBM / XGB branches.
        if model_name == 'LGBMClassifier':
            self._tune_boosting(
                estimator, X, y,
                tree_grid={'n_estimators': [10], 'n_jobs': [-1], 'random_state': [SEED],
                           'max_depth': np.arange(2, 21).tolist(),
                           'num_leaves': [32, 64, 128, 256, 512, 1024],
                           'min_child_samples': [20, 50]},
                sampling_grid={'subsample': np.linspace(.1, 1, 10),
                               'colsample_bytree': np.linspace(.1, 1, 10)})
        elif model_name == 'XGBClassifier':
            self._tune_boosting(
                estimator, X, y,
                tree_grid={'n_estimators': [10], 'n_jobs': [-1], 'random_state': [SEED],
                           'max_depth': np.arange(2, 21).tolist(),
                           'min_child_weight': [20, 50]},
                sampling_grid={'subsample': [.5, .6, .7, .8, .9, 1],
                               'colsample_bytree': [.5, .6, .7, .8, .9, 1]})
        elif model_name in ('DecisionTreeClassifier', 'ExtraTreeClassifier'):
            gs = self._grid_search(
                estimator,
                {'max_depth': np.arange(7, 41), 'min_samples_leaf': [2, 20, 200]},
                X, y)
            self.best_estimator_ = estimator.set_params(**gs.best_params_)
            self.best_score_ = gs.best_score_
        elif model_name in ('RandomForestClassifier', 'ExtraTreesClassifier'):
            preserved = self._preserved_params()
            # Search with a small forest, then restore the user's settings.
            gs = self._grid_search(
                estimator,
                {'max_depth': np.arange(5, 21), 'min_samples_leaf': [2, 20],
                 'n_estimators': [10], 'n_jobs': [-1],
                 'random_state': [preserved['random_state']]},
                X, y)
            best_params = dict(gs.best_params_)
            best_params.update(preserved)
            self.best_estimator_ = estimator.set_params(**best_params)
            self.best_score_ = cross_val_score(self.best_estimator_,
                                               X, y,
                                               cv=self.cv, scoring=self.scoring,
                                               n_jobs=-1).mean()
        elif model_name == 'LogisticRegression':
            gs = self._grid_search(
                estimator,
                {'C': [.001, .002, .003, .004, .005,
                       .01, .02, .03, .04, .05, .06, .07, .08, .09,
                       .1, .2, .3, .4, .5, .6, .7, .8, .9,
                       1, 2, 3, 4, 5]},
                X, y)
            self.best_estimator_ = gs.best_estimator_
            self.best_score_ = gs.best_score_
        elif model_name == 'LinearSVC':
            gs = self._grid_search(
                estimator,
                {'C': [.5, 1, 2, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
                       150, 200, 250, 300, 350, 400, 450, 500]},
                X, y)
            self.best_estimator_ = gs.best_estimator_
            self.best_score_ = gs.best_score_
        elif model_name == 'KNeighborsClassifier':
            gs = self._grid_search(estimator, {'n_neighbors': range(2, 11)}, X, y)
            self.best_estimator_ = gs.best_estimator_
            self.best_score_ = gs.best_score_
        else:
            # Without this, a stale best_estimator_ from an earlier fit would
            # be silently re-fitted (or an AttributeError raised).
            raise ValueError(f'unsupported model type: {model_name}')

        self.best_estimator_.fit(X, y)
        self.coef_imp_ = self._extract_importances(self.best_estimator_)
        return self

    def predict(self, X):
        '''Predict with the tuned estimator.'''
        return self.best_estimator_.predict(X)
    
class SklearnHelperStackingRegressor(BaseEstimator, TransformerMixin):
    '''
    Classic stacking: out-of-fold predictions of the base models become the
    meta-features.

    Parameters
    ----------
    L_base_models : list of base models
    nfolds : number of folds used for every base model
    seed : base random seed; model ``i`` uses ``seed + i``
    path_to_folder : directory where every per-fold fitted model is pickled.
        It is deleted and recreated on each call to ``fit``.
    '''
    def __init__(self, L_base_models, nfolds, seed, path_to_folder):
        # Only store parameters here: sklearn re-runs __init__ in clone(), so
        # touching the file system in the constructor would wipe saved models.
        self.L_base_models = L_base_models
        self.nfolds = nfolds
        self.seed = seed
        self.path_to_folder = path_to_folder

    def _reset_model_folder(self):
        '''Start from an empty model folder so stale pickles are never loaded.'''
        if os.path.isdir(self.path_to_folder):
            shutil.rmtree(self.path_to_folder)
        os.makedirs(self.path_to_folder)

    def fit(self, L_X, y=None):
        '''
        Fit every base model fold by fold and collect out-of-fold predictions.

        ``L_X`` is a list with one feature matrix per base model; matrices and
        ``y`` are indexed positionally (``X[idx]``, ``y[idx]``), so they are
        presumably numpy arrays - TODO confirm against callers.
        Stores the out-of-fold matrix in ``self.Z`` (one column per base
        model) and returns ``self``.
        '''
        self._reset_model_folder()
        L_Z = []
        self.nrows = L_X[0].shape[0]
        self.y_mean = y.mean()
        # classic stacking
        for i, (model, X) in tqdm_notebook(enumerate(zip(self.L_base_models, L_X)),
                                           total=len(self.L_base_models)):
            current_seed = i + self.seed
            kf = KFold(self.nfolds, random_state=current_seed, shuffle=True)

            # out-of-fold predictions of the current model
            Z_tr = np.zeros((y.shape[0], 1))

            for j, (tr_idx, val_idx) in tqdm_notebook(enumerate(kf.split(X, y)),
                                                      total=self.nfolds):
                model.fit(X[tr_idx], y[tr_idx])

                # Persist the fold model together with its model / fold indices.
                filename = os.path.join(self.path_to_folder, f'model_{i+1}_{j+1}.pickle')
                with open(filename, 'wb') as f:
                    pickle.dump((model, i, j), f)

                Z_tr[val_idx, 0] = model.predict(X[val_idx])

            L_Z.append(Z_tr)

        self.Z = np.column_stack(L_Z)
        return self

    def predict(self, L_X):
        '''
        Return the meta-features for ``L_X``.

        NOTE: input with the same number of rows as the training data is
        treated as the training data and gets the stored out-of-fold matrix.
        Otherwise every saved fold model predicts on its own matrix and the
        predictions are averaged per base model.
        '''
        if self.nrows == L_X[0].shape[0]:
            return self.Z

        # NOTE: pickle.load executes arbitrary code; only load files this
        # class wrote itself into path_to_folder.
        preds_by_model = {}
        for file in tqdm_notebook(sorted(os.listdir(self.path_to_folder))):
            if not (file.startswith('model_') and file.endswith('.pickle')):
                continue
            with open(os.path.join(self.path_to_folder, file), 'rb') as f:
                model, i, j = pickle.load(f)
            # Group by the stored model index: directory listing order is
            # arbitrary and must not decide which predictions are averaged.
            preds_by_model.setdefault(i, []).append(model.predict(L_X[i]))

        X_meta = np.column_stack([np.column_stack(preds_by_model[i]).mean(1)
                                  for i in sorted(preds_by_model)])
        return X_meta