In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score


In [49]:
# Load the pre-processed table; the first CSV column is used as the row index.
data = pd.read_csv('./data_processed/data.csv', index_col=0)

# Split the columns into features and target with a single boolean mask.
# 'los' is the regression target; every other column is a feature.
# y is selected with the mask as well, so it stays a one-column DataFrame.
target_mask = data.columns == 'los'
X = data.loc[:, ~target_mask]
y = data.loc[:, target_mask]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [50]:
# Base estimator shared by the hyper-parameter searches below: an XGBoost
# regressor using the squared-error objective; nothing else is set explicitly.
xg_reg = xgb.XGBRegressor(objective="reg:squarederror")

In [52]:
# Exhaustive search space over nine XGBoost hyper-parameters.
# NOTE: the Cartesian product is 5*5*5*4*3*5*4*4*4 = 480,000 candidates, i.e.
# 2,400,000 model fits with cv=5 -- expect a very long run time.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 4, 6, 8, 10],
    'colsample_bytree': [0.3, 0.5, 0.7, 1.0],
    'subsample': [0.5, 0.7, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'min_child_weight': [1, 3, 5, 7],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [1, 0.01, 0.1, 0]
}

# Configure the exhaustive grid search: every candidate is scored by R^2
# using 5-fold cross-validation on the training data.
grid_search = GridSearchCV(estimator=xg_reg,
                           param_grid=param_grid,
                           scoring='r2',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)  # run the fits in parallel on all available CPU cores

# Fit the model (this is the expensive step).
grid_search.fit(X_train, y_train)

# Report the best parameters and the best score.
print("最佳参数：", grid_search.best_params_)
# scoring='r2', so best_score_ is a cross-validated R^2, not a negative MSE.
print("最佳分数（R2）：", grid_search.best_score_)

Fitting 5 folds for each of 480000 candidates, totalling 2400000 fits


In [None]:
# Randomized hyper-parameter search over a reduced space.
# Compared with the exhaustive grid, the following parameters are not tuned
# here (they keep whatever xg_reg was constructed with); the values that were
# previously considered for them are kept for reference:
#     gamma:            [0, 0.1, 0.2, 0.3, 0.4]
#     min_child_weight: [1, 3, 5, 7]
#     reg_alpha:        [0, 0.01, 0.1, 1]
#     reg_lambda:       [1, 0.01, 0.1, 0]

# Candidate values the search samples from.
param_distributions = dict(
    n_estimators=[100, 200, 300, 400, 500],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    max_depth=[3, 4, 6, 8, 10],
    colsample_bytree=[0.3, 0.5, 0.7, 1.0],
    subsample=[0.5, 0.7, 1.0],
)

# Configure RandomizedSearchCV: 100 sampled candidates, each scored by R^2
# with 5-fold cross-validation. random_state makes the sampling repeatable
# and n_jobs=-1 runs the fits on all available CPU cores.
random_search = RandomizedSearchCV(
    estimator=xg_reg,
    param_distributions=param_distributions,
    n_iter=100,
    scoring='r2',
    cv=5,
    verbose=3,
    random_state=42,
    n_jobs=-1,
)

# Fit the model.
random_search.fit(X_train, y_train)

# Report the best parameters and the best cross-validated R^2.
print("最佳参数：", random_search.best_params_)
print("最佳分数（R2）：", random_search.best_score_)