# LightGBM

算法介绍待添加...

## 调参

In [139]:
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import log_loss, accuracy_score, f1_score, recall_score
import pandas as pd
import numpy as np
import bayes_opt
import matplotlib.pyplot as plt
import lightgbm as lgb
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

SEED = 42

def seed_everything():
    np.random.seed(42)

In [32]:
# 数据准备
data = load_iris()

X = pd.DataFrame(data['data'], columns=data['feature_names'])
y = data['target']

# 随机抽取50个样本作为测试数据

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=50)

In [242]:
class LightgbmOpt(object):
    
    def __init__(params, x, y, cv_fn, score_fn, train_args):
        """
        Args:
          params: lightgbm model params
          x: (DataFrame), features
          y: (ArrayLike), label
          cv_fn: (Function), 一个返回交叉验证索引的函数，如[(train_idx1, valid_idx1), ..., (train_idxn, valid_idxn)]
          train_args: lightGBM train keyargs
        """
        self._basic_params = params
        self._cv_fn = cv_fn
        self._score_fn = score_fn
        self._train_args = train_args
        self.x = x
        self.y = y
        
    def target_func(self, params):
        params = params_normalize(params)
        x, y = self.x, self.y
        cv_index = self._cv_fn(x, y)
        all_valid_idx = []
        if self.params['objective'] == 'regression':
            oof = np.zeros_like(y)
        elif self.params['objective'] == 'multiclass':
            oof = np.zeros([len(y), self.params['num_class']])
        for train_idx, valid_idx in cv_index:
            trainset = lgb.Dataset(x.iloc[train_idx], y[train_idx])
            validset = lgb.Dataset(x.iloc[valid_idx], y[valid_idx])
            estimator = lgb.train(params, train_set=trainset, valid_sets=[trainset, validset],**self._train_args)
            oof[valid_idx] = estimator.predict(x.iloc[valid_idx])
            all_valid_idx.append(valid_idx)
        all_valid_idx = np.vstack(all_valid_idx)
        score = self.score_fn(y[all_valid_idx], oof[all_valid_idx])
        return score

    
def params_normalize(cls, params):
    _int_params = [
    'num_iterations', 'num_leaves', 'max_depth', 'num_leaves', 
    'min_data_in_leaf', 'max_bin']
    for param, value in params.keys():
        if params in _int_params: 
            params[param] = round(value)
    return params


In [213]:
def cv_fn(x, y):
    folds = StratifiedKFold(5)
    cv_idx = []
    for train_idx, test_idx in folds.split(x, y):
        cv_idx.append([train_idx, test_idx])
    return cv_idx


params = {
        'task': 'train',
        'objective': 'multiclass',
        'boosting': 'gbdt',
        'metric': 'multi_logloss',
        'num_class': 3,
        'verbosity': 0,}

lgb_opt = LightgbmOpt(params, X_train, y_train, )

In [214]:
cv_score, oof, pred = cv_predict(X_train, y_train, params, x_test=X_test)

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[32]	training's multi_logloss: 0.0755543	valid_1's multi_logloss: 0.23469
Training until validation scores don't improve for 30 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 0.0255941	valid_1's multi_logloss: 0.0514279
Training until validation scores don't improve for 30 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 0.0136673	valid_1's multi_logloss: 0.025656
Training until validation scores don't improve for 30 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 0.00742261	valid_1's multi_logloss: 0.0458213
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[57]	training's multi_logloss: 0.0158185	valid_1's multi_logloss: 0.121004


In [220]:
res = pd.DataFrame(optimizer.res['all']['params'])
res['value'] = optimizer.res['all']['values']

In [221]:
res

Unnamed: 0,bagging_fraction,feature_fraction,lambda_l1,lambda_l2,max_depth,min_data_in_leaf,min_hessin_in_leaf,value
0,1.0,1.0,0.1,0.2,3.000001,20.0,0.01,-0.096215
1,0.996729,0.993493,0.046953,0.198564,3.058435,13.376306,0.004516,-0.099871
2,0.986037,0.988432,0.025568,0.005984,6.850578,16.172025,0.00569,-0.090332
3,0.982962,0.995673,0.093032,0.00536,4.50428,17.460805,0.000973,-0.0934
4,0.986815,0.892084,0.098328,0.057682,6.865908,14.80674,0.005773,-0.094212
5,0.97549,0.973174,0.006223,0.199875,5.622157,18.883853,0.006428,-0.088443
6,0.986283,0.976353,0.030518,0.184732,6.50281,17.281833,0.005549,-0.090617
7,0.997825,0.525128,0.001115,0.062921,6.078146,19.735966,0.002993,-0.100746
8,0.977844,0.991815,0.008404,0.103261,5.944004,15.543323,0.003109,-0.087909
9,0.999036,0.987769,0.039846,0.004522,4.824187,19.966264,0.007522,-0.089517


In [192]:
params_bounds = {
    'max_depth': (3, 7),
    'min_data_in_leaf': (1, 20),
    'min_hessin_in_leaf': (1e-4, 1e-2),
    'bagging_fraction': (0.5, 1.0),
    'feature_fraction': (0.5, 1.0),
    'lambda_l1': (0., 0.1),
    'lambda_l2': (0., 0.2),
}

In [193]:
optimizer = bayes_opt.BayesianOptimization(f=opt_func, pbounds=params_bounds, random_state=SEED)