In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.linear_model import LassoCV

# 加载数据

In [2]:
# Read the preprocessed table; the first CSV column becomes the row index.
data = pd.read_csv('./data_processed/data_norm_log.csv', index_col=0)
# Fill missing values with the mean of the column they belong to.
data = data.fillna(data.mean())

# `los` is the regression target; every other column is a feature.
is_target = data.columns == 'los'
X = data.loc[:, ~is_target]
y = data.loc[:, is_target]  # kept as a one-column DataFrame, not a Series

# 特征选择

In [3]:
# # Optional feature selection with an L1-regularised model (currently disabled).
# # NOTE: LassoCV is L1-regularised *linear* regression, not logistic regression;
# # `los` is modelled as a continuous target everywhere else in this notebook.
# # NOTE(review): this fits the selector on the full X, y before the train/test
# # split below, so test rows would influence which features are kept. If this
# # cell is re-enabled, consider fitting the selector on the training split only.
# lasso = LassoCV(cv=5, random_state=42).fit(X, y)

# selector = SelectFromModel(lasso, prefit=True)

# selected_features = selector.get_support(indices=True)

# X.columns[selected_features]

# X = X.iloc[:, selected_features]

# XGBoost

In [4]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Base estimator handed to the hyperparameter search; trained with a squared-error objective.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror')

## Find Parameters

### GridSearchCV

In [6]:
# Exhaustive grid search (disabled).
# NOTE: this grid has 8*6*7*5*4*6*4*5*5 = 4,032,000 combinations, i.e.
# 20,160,000 fits with cv=5, which is why the randomized search is used instead.
# NOTE: GridSearchCV has no `random_state` parameter; the `random_state=42`
# line below would raise a TypeError and must be removed before re-enabling.
# param_grid = {
#     'n_estimators': [50, 100, 200, 300, 400, 500, 700, 1000],
#     'learning_rate': [0.005, 0.01, 0.05, 0.1, 0.2, 0.3],
#     'max_depth': [3, 4, 6, 8, 10, 12, 15],
#     'colsample_bytree': [0.3, 0.5, 0.7, 0.9, 1.0],
#     'subsample': [0.5, 0.7, 0.9, 1.0],
#     'gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5],
#     'min_child_weight': [1, 3, 5, 7],
#     'reg_alpha': [0, 0.01, 0.1, 1, 10],
#     'reg_lambda': [10, 1, 0.01, 0.1, 0]
# }


# # Configure GridSearchCV
# grid_search = GridSearchCV(estimator=xg_reg, 
#                            param_grid=param_grid,
#                            scoring='r2', 
#                            cv=5, 
#                            verbose=3,
#                            random_state=42,
#                            n_jobs=-1)

# # Fit the model
# grid_search.fit(X_train, y_train)

# # Show the best parameters and the best score
# print(f"Best params: {grid_search.best_params_}")
# print(f"Best score: {grid_search.best_score_}")

### Randomized search on hyper parameters

In [7]:
# Candidate values for each hyperparameter; the search samples combinations from these lists.
param_distributions = dict(
    n_estimators=[50, 100, 200, 300, 400, 500, 700, 1000],
    learning_rate=[0.005, 0.01, 0.05, 0.1, 0.2, 0.3],
    max_depth=[3, 4, 6, 8, 10, 12, 15],
    colsample_bytree=[0.3, 0.5, 0.7, 0.9, 1.0],
    subsample=[0.5, 0.7, 0.9, 1.0],
    gamma=[0, 0.1, 0.2, 0.3, 0.4, 0.5],
    min_child_weight=[1, 3, 5, 7],
    reg_alpha=[0, 0.01, 0.1, 1, 10],
    reg_lambda=[10, 1, 0.01, 0.1, 0],
)

# Configure the randomized search: 100 sampled candidates, each scored by
# 3-fold cross-validated R2, running up to 6 jobs in parallel.
random_search = RandomizedSearchCV(
    xg_reg,
    param_distributions,
    n_iter=100,
    scoring='r2',
    cv=3,
    random_state=42,
    n_jobs=6,
    verbose=3,
)

# Run the search on the training split only.
random_search.fit(X_train, y_train)

# Report the best hyperparameters and their mean cross-validated score.
print(f"Best params: {random_search.best_params_}")
print(f"Best score: {random_search.best_score_}")

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best params: {'subsample': 0.9, 'reg_lambda': 10, 'reg_alpha': 10, 'n_estimators': 400, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.05, 'gamma': 0.4, 'colsample_bytree': 0.7}
Best score: 0.0031041243322843157


In [8]:
# RandomizedSearchCV refits the winning configuration on the full training
# split once the search finishes (refit=True is its default), so reuse that
# estimator rather than rebuilding one from best_params_ alone. This keeps
# every setting of the searched estimator (e.g. its objective) and avoids
# training the same model a second time.
final_model = random_search.best_estimator_

# Evaluate on the held-out test split.
y_pred = final_model.predict(X_test)

print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 Score:", r2_score(y_test, y_pred))

RMSE: 0.5672752479627018
R2 Score: 0.009335038397020878


In [9]:
# Score the fitted model on the rows it was trained on, for comparison with the test-set R2.
y_train_pred = final_model.predict(X_train)

r2_score(y_true=y_train, y_pred=y_train_pred)

0.11527966815321622