In [4]:
def optimize_lgb_clf(X, y, seed, cv, metric, greater_is_better, num_boost_round, early_stopping_rounds, show, **my_scorer):
    """Tune LightGBM classifier hyperparameters with a two-stage grid search.

    Stage 1 scans a coarse grid over ``max_depth``, ``num_leaves`` and
    ``min_child_samples``.  Stage 2 scans the neighbourhood of the stage-1
    winner together with ``learning_rate``, ``subsample`` and
    ``colsample_bytree``.  Every candidate is scored by cross-validation;
    each fold is trained with early stopping on that fold's validation part.

    Args:
        X: Feature matrix (NumPy array or pandas DataFrame).
        y: Target vector (NumPy array or pandas Series).
        seed: Value passed to the classifier as ``random_state``.
        cv: Splitter object exposing ``split(X, y)``.
        metric: Metric to optimise: ``'roc_auc'``, ``'f1'``, ``'accuracy'``,
            ``'precision'``, ``'recall'``, ``'custom'`` (custom scorer fed
            with predicted labels) or ``'custom_prob'`` (custom scorer fed
            with positive-class probabilities).
        greater_is_better: True if larger metric values are better.
        num_boost_round: Maximum number of boosting iterations; used as
            ``n_estimators`` for every candidate.
        early_stopping_rounds: Patience of the early-stopping callback.
            A falsy value disables early stopping.
        show: If true, print the score each time a new best is found.
        **my_scorer: Exactly one keyword argument holding a callable
            ``scorer(y_true, y_pred)``; required only for the ``'custom'``
            and ``'custom_prob'`` metrics.

    Returns:
        Tuple ``(best_metric, best_params)`` for the best candidate found
        across both stages.

    Raises:
        ValueError: If ``metric`` is not supported, if a custom metric is
            requested without exactly one callable scorer, or if no
            candidate produced a comparable (non-NaN) score.
    """
    builtin_metrics = ('roc_auc', 'f1', 'accuracy', 'precision', 'recall')
    custom_metrics = ('custom', 'custom_prob')
    if metric not in builtin_metrics + custom_metrics:
        raise ValueError('unsupported metric: {!r}; expected one of {}'.format(
            metric, builtin_metrics + custom_metrics))

    custom_scorer = None
    if metric in custom_metrics:
        candidates = [fn for fn in my_scorer.values() if callable(fn)]
        if len(candidates) != 1:
            raise ValueError("metric {!r} requires exactly one callable scorer "
                             "passed as a keyword argument".format(metric))
        custom_scorer = candidates[0]

    # Imported only after argument validation so that bad arguments fail fast.
    from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score

    # metric name -> (score function, whether it consumes probabilities)
    scorers = {
        'roc_auc': (roc_auc_score, True),
        'f1': (f1_score, False),
        'accuracy': (accuracy_score, False),
        'precision': (precision_score, False),
        'recall': (recall_score, False),
        'custom': (custom_scorer, False),
        'custom_prob': (custom_scorer, True),
    }
    score_fn, needs_proba = scorers[metric]

    def _rows(data, idx):
        """Select rows by position from a pandas object or an array."""
        return data.iloc[idx] if hasattr(data, 'iloc') else data[idx]

    def _neighbours(center, step, minimum):
        """Return sorted unique values around ``center``, clamped to ``minimum``."""
        return sorted({max(minimum, center + delta) for delta in (-step, 0, step)})

    def _cv_score(parameters):
        """Return the mean validation score of one parameter set over all folds."""
        fold_scores = []
        # Passing y is harmless for unstratified splitters and required by
        # stratified ones, so no try/except fallback is needed.
        for tr_idx, val_idx in cv.split(X, y):
            X_tr, y_tr = _rows(X, tr_idx), _rows(y, tr_idx)
            X_val, y_val = _rows(X, val_idx), _rows(y, val_idx)

            callbacks = None
            if early_stopping_rounds:
                callbacks = [lgb.early_stopping(early_stopping_rounds, verbose=False)]

            # A fresh estimator per fold keeps folds independent.
            estimator = LGBMClassifier(random_state=seed, **parameters)
            estimator.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=callbacks)

            if needs_proba:
                prediction = estimator.predict_proba(X_val)[:, 1]
            else:
                prediction = estimator.predict(X_val)
            fold_scores.append(score_fn(y_val, prediction))
        return np.mean(fold_scores)

    def _fit_grid(grid):
        """Evaluate every combination in ``grid``; return (best score, best params)."""
        best_metric = -np.inf if greater_is_better else np.inf
        best_params = None
        for parameters in tqdm_notebook(ParameterGrid(grid)):
            mean_metric = _cv_score(parameters)
            if greater_is_better:
                improved = mean_metric > best_metric
            else:
                improved = mean_metric < best_metric
            if improved:
                best_metric, best_params = mean_metric, parameters
                if show:
                    print('best score = {}'.format(best_metric))
        if best_params is None:
            raise ValueError('no parameter combination produced a valid score')
        return best_metric, best_params

    # Stage 1: coarse search over the tree-shape parameters.
    grid1 = {
        'max_depth': list(range(3, 11)),
        'num_leaves': [2 ** i for i in range(1, 10)],
        'min_child_samples': [20, 50, 100, 500, 1000],
        'n_estimators': [num_boost_round],
        'learning_rate': [.05],
    }
    best_metric, best_params = _fit_grid(grid1)

    # Stage 2: refine around the stage-1 winner and add sampling parameters.
    # Neighbours are clamped so the grid never contains invalid values
    # (e.g. num_leaves = 2 - 8 = -6).
    grid2 = {
        'max_depth': _neighbours(best_params['max_depth'], 1, 1),
        'num_leaves': _neighbours(best_params['num_leaves'], 8, 2),
        'min_child_samples': _neighbours(best_params['min_child_samples'], 10, 1),
        'n_estimators': [num_boost_round],
        'learning_rate': np.linspace(.01, .06, 10).tolist(),
        'subsample': [.5, .7, 1],
        # LightGBM applies row subsampling only when the bagging frequency
        # is non-zero; without this the 'subsample' values have no effect.
        'subsample_freq': [1],
        'colsample_bytree': [.5, .7, 1],
    }
    best_metric2, best_params2 = _fit_grid(grid2)

    # Stage 2 does not contain the exact stage-1 configuration, so it may
    # come out worse; never return a worse result than one already found.
    if greater_is_better:
        stage2_wins = best_metric2 >= best_metric
    else:
        stage2_wins = best_metric2 <= best_metric
    if stage2_wins:
        return (best_metric2, best_params2)
    return (best_metric, best_params)
    