In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib as mlp
import matplotlib.pyplot as plt
import seaborn as sns
import time
import re, pip



In [2]:
import os  # 修改环境设置

# 算法/损失/评估指标等
import sklearn
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import KFold, cross_validate

# 优化器
from bayes_opt import BayesianOptimization

import hyperopt
from hyperopt import hp, fmin, tpe, Trials, partial
from hyperopt.early_stop import no_progress_loss

import optuna

In [3]:
# Load the encoded training table; the first CSV column is used as the row index.
data = pd.read_csv('../data/House Price/train_encode.csv', index_col=0)
# Every column except the last is a feature (x); the last column is the target (y).
x, y = data.iloc[:, :-1], data.iloc[:, -1]

# 基于Bayes_Opt实现GP优化

In [9]:
def bayesopt_object(n_estimators, max_depth, max_features, min_impurity_decrease):
    """Objective for bayes_opt: mean 5-fold negative RMSE of a random forest.

    The four arguments are the hyper-parameters being searched; everything
    else on the estimator is held fixed. The data (module-level ``x`` and
    ``y``) is deliberately not an argument, because the optimizer only passes
    the searched hyper-parameters to the objective.

    Returns the mean of the per-fold ``neg_root_mean_squared_error`` scores,
    i.e. a negative loss that the optimizer can maximize directly.
    """
    # The optimizer hands every parameter over as a float, so the
    # integer-valued ones are truncated with int() before use.
    searched = dict(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        max_features=int(max_features),
        min_impurity_decrease=min_impurity_decrease,
    )
    model = RFR(random_state=1412, verbose=False, n_jobs=-1, **searched)

    folds = KFold(n_splits=5, shuffle=True, random_state=1412)
    cv_result = cross_validate(
        model,
        x,
        y,
        scoring='neg_root_mean_squared_error',
        cv=folds,
        verbose=False,
        n_jobs=-1,
        # Surface the reason if a fit fails inside cross-validation.
        error_score='raise',
    )

    # The scorer already yields negative RMSE, so the fold mean is returned as is.
    fold_scores = cv_result['test_score']
    return np.mean(fold_scores)

In [5]:
# Search bounds for the bayes_opt section: one (lower, upper) pair per
# hyper-parameter accepted by bayesopt_object.
param_grid_simple = dict(
    n_estimators=(80, 100),
    max_depth=(10, 25),
    max_features=(10, 20),
    min_impurity_decrease=(0, 1),
)

In [6]:
def param_bayes_opt(init_points, n_iter):
    """Maximize ``bayesopt_object`` over ``param_grid_simple`` with bayes_opt.

    Parameters
    ----------
    init_points : number of initial observations, forwarded to ``maximize``.
    n_iter : number of optimization iterations, forwarded to ``maximize``.

    Returns
    -------
    (params_best, score_best) : the ``'params'`` and ``'target'`` entries of
    the optimizer's best record. Both are also printed.
    """
    # Instantiate the optimizer first, then run the search.
    optimizer = BayesianOptimization(bayesopt_object, param_grid_simple, random_state=1412)
    optimizer.maximize(init_points=init_points, n_iter=n_iter)

    # Pull the best parameters and best score out of the finished optimizer.
    best_record = optimizer.max
    params_best, score_best = best_record['params'], best_record['target']

    print("\n", "\n", "best params: ", params_best, "\n", "\n", "best cvscore: ", score_best)

    return params_best, score_best

In [7]:
def bayes_opt_validation(params_best):
    """Re-score a parameter set from the bayes_opt search with 5-fold CV.

    Parameters
    ----------
    params_best : mapping with the keys ``'n_estimators'``, ``'max_depth'``,
        ``'max_features'`` and ``'min_impurity_decrease'``. The first three
        are cast with ``int()`` before being given to the estimator.

    Returns
    -------
    The mean of the per-fold ``neg_root_mean_squared_error`` scores on the
    module-level ``x`` / ``y`` (a negative value, same sign convention as
    ``bayesopt_object``).

    Raises
    ------
    Whatever exception a failing fit raises: ``error_score='raise'`` is set so
    that a failed fold is reported instead of being folded into the mean as
    NaN, matching the objective used during the search.
    """
    reg = RFR(
        n_estimators=int(params_best['n_estimators']),
        max_depth=int(params_best['max_depth']),
        max_features=int(params_best['max_features']),
        min_impurity_decrease=params_best['min_impurity_decrease'],
        random_state=1412,
        verbose=False,
        n_jobs=-1,
    )

    # Same fold layout (and seed) as the search objective.
    cv = KFold(n_splits=5, shuffle=True, random_state=1412)
    validation_loss = cross_validate(
        reg,
        x,
        y,
        scoring='neg_root_mean_squared_error',
        cv=cv,
        verbose=False,
        n_jobs=-1,
        # Fail loudly rather than silently averaging NaN scores.
        error_score='raise',
    )
    return np.mean(validation_loss['test_score'])

In [10]:
# Time the whole bayes_opt search and report the elapsed wall-clock minutes.
start = time.time()
params_best, score_best = param_bayes_opt(20, 280)  # 20 initial observations, then 280 iterations
print(f'It takes {(time.time() - start) / 60} minutes')
# Re-run cross-validation with the best parameters that the search returned.
validation_score = bayes_opt_validation(params_best)
print("\n", "\n", "validation_score: ", validation_score)

|   iter    |  target   | max_depth | max_fe... | min_im... | n_esti... |
-------------------------------------------------------------------------


| [0m1        [0m | [0m-2.948e+0[0m | [0m23.2     [0m | [0m17.52    [0m | [0m0.06379  [0m | [0m88.79    [0m |
| [95m2        [0m | [95m-2.909e+0[0m | [95m14.8     [0m | [95m17.61    [0m | [95m0.9214   [0m | [95m97.58    [0m |
| [95m3        [0m | [95m-2.9e+04 [0m | [95m15.86    [0m | [95m15.56    [0m | [95m0.2661   [0m | [95m87.98    [0m |
| [95m4        [0m | [95m-2.887e+0[0m | [95m14.05    [0m | [95m16.84    [0m | [95m0.06744  [0m | [95m89.72    [0m |
| [0m5        [0m | [0m-2.887e+0[0m | [0m18.71    [0m | [0m19.17    [0m | [0m0.9315   [0m | [0m83.7     [0m |
| [0m6        [0m | [0m-2.895e+0[0m | [0m17.7     [0m | [0m19.58    [0m | [0m0.7127   [0m | [0m89.18    [0m |
| [0m7        [0m | [0m-2.968e+0[0m | [0m14.21    [0m | [0m12.62    [0m | [0m0.3381   [0m | [0m91.51    [0m |
| [0m8        [0m | [0m-2.91e+04[0m | [0m23.23    [0m | [0m10.89    [0m | [0m0.6078   [0m | [0m95.06    [0m |
| [0m

# 基于HyperOpt实现TPE优化

In [4]:
def hyperopt_objective(params):
    """Objective for hyperopt: mean 5-fold RMSE of a random forest.

    ``params`` is the dictionary sampled from the search space. Searched
    values are looked up by key; everything else on the estimator is a fixed
    value. Returns the mean absolute value of the per-fold
    ``neg_root_mean_squared_error`` scores, i.e. a positive loss.
    """
    # Integer-valued hyper-parameters are cast before being handed to the estimator.
    searched = {
        key: int(params[key])
        for key in ('n_estimators', 'max_depth', 'max_features')
    }
    searched['min_impurity_decrease'] = params['min_impurity_decrease']
    model = RFR(random_state=1412, verbose=False, n_jobs=-1, **searched)

    # Cross-validation on the module-level x / y.
    folds = KFold(n_splits=5, shuffle=True, random_state=1412)
    cv_result = cross_validate(
        model,
        x,
        y,
        scoring='neg_root_mean_squared_error',
        cv=folds,
        verbose=False,
        n_jobs=-1,
        error_score='raise',
    )
    fold_rmse = np.abs(cv_result['test_score'])
    return np.mean(fold_rmse)

In [5]:
# Search space for the hyperopt section. Each entry is a quantized uniform
# distribution given as (label, low, high, step); this rebinds the name
# param_grid_simple that the bayes_opt section also uses.
param_grid_simple = dict(
    n_estimators=hp.quniform('n_estimators', 80, 100, 1),
    max_depth=hp.quniform('max_depth', 10, 25, 1),
    max_features=hp.quniform("max_features", 10, 20, 1),
    min_impurity_decrease=hp.quniform('min_impurity_decrease', 0, 5, 1),
)

In [6]:
def param_hyperopt(max_evals=100, rstate=None):
    """Minimize ``hyperopt_objective`` over ``param_grid_simple`` with TPE.

    Parameters
    ----------
    max_evals : maximum number of evaluations allowed for ``fmin``.
    rstate : optional random state forwarded unchanged to ``fmin`` so a run
        can be made repeatable. ``None`` (the default) leaves hyperopt's own
        default seeding in place, which is what this function did before the
        parameter existed. The accepted type depends on the installed
        hyperopt version -- check the ``fmin`` documentation.

    Returns
    -------
    (params_best, trials) : the best parameter dictionary returned by
    ``fmin`` (also printed) and the ``Trials`` object holding the history.
    """
    # Keeps the record of every iteration.
    trials = Trials()

    # Early stopping: give up after 100 evaluations without improvement.
    early_stop_fn = no_progress_loss(100)

    params_best = fmin(
        hyperopt_objective,
        space=param_grid_simple,
        algo=tpe.suggest,  # surrogate model / suggestion algorithm
        max_evals=max_evals,  # evaluation budget
        verbose=True,
        trials=trials,
        early_stop_fn=early_stop_fn,
        rstate=rstate,
    )

    print('\n', '\n', "best params:", params_best, '\n')
    return params_best, trials

In [7]:
def hyperopt_validation(params):
    """Re-score a parameter set from the hyperopt search with 5-fold CV.

    Parameters
    ----------
    params : mapping with the keys ``'n_estimators'``, ``'max_depth'``,
        ``'max_features'`` and ``'min_impurity_decrease'``. The first three
        are cast with ``int()`` before being given to the estimator.

    Returns
    -------
    The mean absolute value of the per-fold ``neg_root_mean_squared_error``
    scores on the module-level ``x`` / ``y`` (a positive RMSE, same
    convention as ``hyperopt_objective``).

    Raises
    ------
    Whatever exception a failing fit raises: ``error_score='raise'`` is set so
    that a failed fold is reported instead of being folded into the mean as
    NaN, matching the objective used during the search.
    """
    reg = RFR(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        max_features=int(params['max_features']),
        min_impurity_decrease=params['min_impurity_decrease'],
        random_state=1412,
        verbose=False,
        n_jobs=-1,
    )
    # Same fold layout (and seed) as the search objective.
    cv = KFold(n_splits=5, shuffle=True, random_state=1412)
    validation_loss = cross_validate(
        reg,
        x,
        y,
        scoring='neg_root_mean_squared_error',
        cv=cv,
        verbose=False,
        n_jobs=-1,
        # Fail loudly rather than silently averaging NaN scores.
        error_score='raise',
    )
    return np.mean(abs(validation_loss['test_score']))

In [8]:
# First TPE run with a small budget of 30 evaluations.
params_best, trials = param_hyperopt(30)

100%|██████████| 30/30 [00:56<00:00,  1.89s/trial, best loss: 28696.954308211716]

 
 best params: {'max_depth': 12.0, 'max_features': 19.0, 'min_impurity_decrease': 2.0, 'n_estimators': 98.0} 



In [9]:
# Second TPE run with a budget of 100 evaluations; rebinds params_best and trials.
params_best, trials = param_hyperopt(100)

100%|██████████| 100/100 [02:29<00:00,  1.49s/trial, best loss: 28416.043696810833]

 
 best params: {'max_depth': 22.0, 'max_features': 14.0, 'min_impurity_decrease': 0.0, 'n_estimators': 96.0} 



In [10]:
# Third TPE run with a budget of 300 evaluations; rebinds params_best and trials.
# NOTE(review): the recorded output stops at trial 152 of 300, presumably
# because the no_progress_loss(100) early-stop inside param_hyperopt fired.
params_best, trials = param_hyperopt(300)

 51%|█████     | 152/300 [03:29<03:24,  1.38s/trial, best loss: 28398.017337226145]

 
 best params: {'max_depth': 22.0, 'max_features': 14.0, 'min_impurity_decrease': 0.0, 'n_estimators': 83.0} 



In [11]:
# Re-run cross-validation with the parameters from the most recent hyperopt run.
hyperopt_validation(params_best)

28398.017337226145

In [12]:
# Show the raw per-trial records kept by the Trials object (long output).
trials.trials

[{'state': 2,
  'tid': 0,
  'spec': None,
  'result': {'loss': 29419.223485937313, 'status': 'ok'},
  'misc': {'tid': 0,
   'cmd': ('domain_attachment', 'FMinIter_Domain'),
   'workdir': None,
   'idxs': {'max_depth': [0],
    'max_features': [0],
    'min_impurity_decrease': [0],
    'n_estimators': [0]},
   'vals': {'max_depth': [20.0],
    'max_features': [13.0],
    'min_impurity_decrease': [1.0],
    'n_estimators': [86.0]}},
  'exp_key': None,
  'owner': None,
  'version': 0,
  'book_time': datetime.datetime(2023, 8, 5, 14, 40, 34, 182000),
  'refresh_time': datetime.datetime(2023, 8, 5, 14, 40, 35, 361000)},
 {'state': 2,
  'tid': 1,
  'spec': None,
  'result': {'loss': 29415.812458628225, 'status': 'ok'},
  'misc': {'tid': 1,
   'cmd': ('domain_attachment', 'FMinIter_Domain'),
   'workdir': None,
   'idxs': {'max_depth': [1],
    'max_features': [1],
    'min_impurity_decrease': [1],
    'n_estimators': [1]},
   'vals': {'max_depth': [11.0],
    'max_features': [17.0],
    'min

In [13]:
# Show the loss recorded for each trial, in trial order.
trials.losses()

[29419.223485937313,
 29415.812458628225,
 29294.16427342027,
 28911.670792589877,
 28923.60005746285,
 29593.3182144319,
 28876.526916334067,
 29418.01430490084,
 29070.76694628942,
 29411.826839318917,
 29086.55090886464,
 29703.751758255577,
 29721.99055857993,
 28830.978641494305,
 29038.418670638486,
 29544.16440918027,
 29287.719541128074,
 28922.537394973806,
 28703.360702044483,
 29390.074334885907,
 28878.94031928825,
 29050.914925016834,
 28620.359743952227,
 28878.65091675911,
 29014.77004831169,
 29487.87139034178,
 29205.341591011296,
 29298.428777430665,
 29465.527783688427,
 28998.938388193743,
 29146.797967424256,
 29303.809372073923,
 28969.44394209375,
 29409.424052264145,
 29575.651382505534,
 29076.184410879116,
 28965.972285599564,
 29127.866791238117,
 29210.218439446056,
 28826.892339775786,
 29677.76116436494,
 29030.6145005377,
 29243.29229179395,
 28541.893480783514,
 29553.184847428267,
 28566.203723907638,
 29104.323859412514,
 29099.26075123822,
 29865.8215

# 基于Optuna实现多种贝叶斯优化

In [4]:
def optuna_objective(trial):
    """Optuna objective: mean 5-fold RMSE of a random forest for one trial.

    The search space is declared here, through the ``trial.suggest_*`` calls,
    so no separate space object has to be given to the study.

    Parameters
    ----------
    trial : the Optuna trial used to sample this evaluation's
        hyper-parameters.

    Returns
    -------
    The mean absolute value of the per-fold ``neg_root_mean_squared_error``
    scores on the module-level ``x`` / ``y``. This is a positive RMSE, so the
    study using this objective must minimize.
    """
    # Integer search space: (name, lower bound, upper bound, step).
    # `step` is passed by keyword: it is keyword-only in the current Optuna
    # signature, and passing it positionally is deprecated.
    n_estimators = trial.suggest_int("n_estimators", 80, 100, step=1)
    max_depth = trial.suggest_int("max_depth", 10, 25, step=1)
    max_features = trial.suggest_int("max_features", 10, 20, step=1)
    min_impurity_decrease = trial.suggest_int("min_impurity_decrease", 0, 5, step=1)
    # Alternatives left as notes by the author: trial.suggest_categorical for a
    # string-valued max_features, and trial.suggest_float for a continuous
    # min_impurity_decrease in [0, 5].

    # Searched hyper-parameters come from the space above; the remaining
    # estimator settings are fixed values.
    reg = RFR(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        min_impurity_decrease=min_impurity_decrease,
        random_state=1412,
        verbose=False,
        n_jobs=-1,
    )

    # Cross-validation scores are negative RMSE. Optuna can either maximize
    # -RMSE or minimize RMSE; this objective returns RMSE.
    cv = KFold(n_splits=5, shuffle=True, random_state=1412)
    validation_loss = cross_validate(
        reg,
        x,
        y,
        scoring="neg_root_mean_squared_error",
        cv=cv,  # cross-validation splitter
        verbose=False,  # do not print progress
        n_jobs=-1,  # parallel workers
        error_score='raise',
    )
    # Final output: RMSE.
    return np.mean(abs(validation_loss["test_score"]))

In [5]:
def optimizer_optuna(n_trials, algo):
    """Run an Optuna study that minimizes ``optuna_objective``.

    Parameters
    ----------
    n_trials : maximum number of trials (initial observations included).
    algo : ``'TPE'`` builds a ``TPESampler`` (10 start-up trials, 24 EI
        candidates); ``'GP'`` builds a ``SkoptSampler`` with a Gaussian-process
        base estimator, 10 initial points and the EI acquisition function.
        Any non-string value is handed to ``optuna.create_study`` unchanged as
        the sampler.

    Returns
    -------
    (params, values) : ``study.best_trial.params`` and
    ``study.best_trial.values``; both are also printed.

    Raises
    ------
    ValueError
        If ``algo`` is a string other than ``'TPE'`` or ``'GP'``. Such a value
        used to be passed on as the sampler and only failed later.
    """
    # Choose the sampler: TPE or GP.
    if algo == 'TPE':
        sampler = optuna.samplers.TPESampler(n_startup_trials=10, n_ei_candidates=24)
    elif algo == 'GP':
        # Imported lazily so the skopt-backed sampler is only required when
        # the GP option is actually requested.
        from optuna.integration import SkoptSampler

        sampler = SkoptSampler(
            skopt_kwargs={
                'base_estimator': 'GP',  # Gaussian process
                'n_initial_points': 10,  # 10 initial observation points
                'acq_func': 'EI',  # acquisition function: expected improvement
            }
        )
    elif isinstance(algo, str):
        raise ValueError(f"Unknown algo {algo!r}; expected 'TPE' or 'GP'.")
    else:
        # Anything that is not a string is passed through as the sampler.
        sampler = algo

    # Create the study; the objective returns RMSE, so the direction is 'minimize'.
    study = optuna.create_study(sampler=sampler, direction='minimize')

    # The search space is defined inside the objective, so none is passed here.
    study.optimize(
        optuna_objective,  # objective function
        n_trials=n_trials,  # maximum number of trials (initial observations included)
        show_progress_bar=True,  # display a progress bar
    )

    # Best parameters and best loss are read straight from the finished study.
    best = study.best_trial
    print(
        "\n",
        "\n",
        "best params: ",
        best.params,
        "\n",
        "\n",
        "best score: ",
        best.values,
        "\n",
    )

    return best.params, best.values

In [6]:
# Silence the warning whose message matches the text below (raised when the
# sampler proposes a point that has already been evaluated).
import warnings
warnings.filterwarnings('ignore', message='The objective has been evaluated at this point before.')

In [7]:
# Short GP run with 10 trials; the per-trial log lines are printed by default.
best_params, best_score = optimizer_optuna(10, "GP")

[I 2023-08-06 20:17:50,823] A new study created in memory with name: no-name-d1b1a6c9-2ede-421d-8bf2-7439ce164700


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2023-08-06 20:17:57,897] Trial 0 finished with value: 29125.40685413478 and parameters: {'n_estimators': 92, 'max_depth': 13, 'max_features': 13, 'min_impurity_decrease': 4}. Best is trial 0 with value: 29125.40685413478.
[I 2023-08-06 20:18:01,692] Trial 1 finished with value: 29670.426073132032 and parameters: {'n_estimators': 96, 'max_depth': 22, 'max_features': 12, 'min_impurity_decrease': 5}. Best is trial 0 with value: 29125.40685413478.
[I 2023-08-06 20:18:03,335] Trial 2 finished with value: 29035.387047404794 and parameters: {'n_estimators': 85, 'max_depth': 17, 'max_features': 20, 'min_impurity_decrease': 2}. Best is trial 2 with value: 29035.387047404794.
[I 2023-08-06 20:18:04,760] Trial 3 finished with value: 29683.967345799338 and parameters: {'n_estimators': 82, 'max_depth': 19, 'max_features': 12, 'min_impurity_decrease': 3}. Best is trial 2 with value: 29035.387047404794.
[I 2023-08-06 20:18:06,326] Trial 4 finished with value: 29388.331966901158 and parameters: {'n

In [8]:
# Turn off the automatically printed INFO lines so only the progress bar is shown.
optuna.logging.set_verbosity(optuna.logging.ERROR)
# To restore the per-trial logs: optuna.logging.set_verbosity(optuna.logging.INFO)
# TPE run with a budget of 300 trials.
best_params, best_score = optimizer_optuna(300, "TPE")

  0%|          | 0/300 [00:00<?, ?it/s]


 
 best params:  {'n_estimators': 89, 'max_depth': 22, 'max_features': 14, 'min_impurity_decrease': 0} 
 
 best score:  [28346.672687223065] 



In [9]:
# GP run with a budget of 300 trials; rebinds best_params and best_score.
best_params, best_score = optimizer_optuna(300,"GP")

  0%|          | 0/300 [00:00<?, ?it/s]


 
 best params:  {'n_estimators': 100, 'max_depth': 22, 'max_features': 14, 'min_impurity_decrease': 0} 
 
 best score:  [28399.470801233365] 

