In [4]:
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')


# Seed shared by the train/test split and by every estimator in the notebook.
rs = 132

# Read the raw table (stock codes are kept as strings) and index it by the
# company short-name column.
dataset = pd.read_csv('./data/dataset.csv', dtype={"股票代码": "object"}).set_index('股票简称')

# Indicator columns used as model inputs.
features = [
    '净资产收益率(%)', '资产报酬率(%)', '营业收入增长率(%)', '成本费用利润率(%)',
    '总资产周转率(%)', '应收账款周转率(%)', '存货周转率(%)', '营业周期',
    '技术人员占比(%)', '研发营收比(%)', '发明专利申请数', '数字化软硬件投入比(%)',
    '数字化战略导向前瞻性', '数字化战略导向持续性', '数字化战略导向广度', '数字化战略导向强度',
    '数字发明专利', '数字国家级奖项', '数字创新论文', '数字创新标准', '数字创新资质',
    '管理层数字职务设立', '两权分离率(%)', '数字人力计划投入', '科技创新基地建设',
    '员工人均营收比(%)', '提供岗位增长率(%)', '员工收入增长率(%)', '社会责任报告质量',
    '供应链合作伙伴', '数字化供应链覆盖度', '客户集中度(%)', '供应商集中度(%)',
]
label_name = 'score'

# Per-company metadata (stock code, industry code, province) as a nested dict.
unit_map = dataset[['股票代码', '行业代码', '所属省份']].to_dict()

# Target vector and float feature matrix.
y: pd.Series = dataset[label_name]
X: pd.DataFrame = dataset[features].astype("float")

# Pre-processing: hold out 20% of the rows, then min-max scale. The scaler is
# fitted on the training rows only and applied to both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=rs
)
scaler = MinMaxScaler().fit(X_train)

X_train_s, X_test_s = (scaler.transform(part) for part in (X_train, X_test))

In [5]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus sign.
plt.rcParams['axes.unicode_minus'] = False

# Training and tuning of conventional machine-learning models.

# 1. Ordinary least squares linear regression, evaluated on the held-out split.
model = LinearRegression(n_jobs=-1).fit(X_train_s, y_train)
y_pred = model.predict(X_test_s)

ols_r2 = r2_score(y_test, y_pred)
ols_mse = mean_squared_error(y_test, y_pred)
ols_mae = mean_absolute_error(y_test, y_pred)
print(f"OLS Regression | r2:{ols_r2} | mean_squared_error:{ols_mse:.2} | mean_absolute_error:{ols_mae:.2}。\n 因子分析的结果实际是对原结果进行线性转化的过程，因此最小二乘法能够较为精确的计算出实际权重。")

OLS Regression | r2:1.0 | mean_squared_error:3.9e-33 | mean_absolute_error:4.8e-17。
 因子分析的结果实际是对原结果进行线性转化的过程，因此最小二乘法能够较为精确的计算出实际权重。


In [None]:
import numpy as np




In [None]:
from sklearn.tree import DecisionTreeRegressor

# General tuning workflow for tree-based models: first estimate a rough range
# for each parameter, then refine with a grid search.
# Decision-tree tuning plan:

# 1. Tune ccp_alpha on its own (grid search) -- kept for reference only.

# model = DecisionTreeRegressor(random_state=rs)
# path = model.cost_complexity_pruning_path(X_train_s, y_train)
# param_grid = {'ccp_alpha': path.ccp_alphas}
# kfold = KFold(n_splits=10, shuffle=True, random_state=1)
# model = GridSearchCV(DecisionTreeRegressor(random_state=rs), param_grid, cv=kfold)
# model.fit(X_train_s, y_train)
# print(model.best_params_)

# 2. Build the candidate grid.
# The pruning path is computed with the same configuration (max_features and
# random_state) as the estimator tuned below, so the ccp_alpha candidates
# belong to the tree that is actually being searched.
model = DecisionTreeRegressor(max_features="sqrt", random_state=rs)
path = model.cost_complexity_pruning_path(X_train_s, y_train)
param_grid = {
    'ccp_alpha': path.ccp_alphas,  # cost-complexity pruning strength
    'max_depth': np.arange(5, 15, 1),  # maximum tree depth; limits over-fitting
    # Minimum number of samples required to split a node. scikit-learn rejects
    # integer values below 2, so the range starts at 2 (a value of 1 makes
    # every fit for that candidate fail).
    'min_samples_split': np.arange(2, 4, 1),
    'min_samples_leaf': np.arange(1, 4, 1),  # minimum samples per leaf; limits over-fitting
}

# 3. Exhaustive grid search with shuffled 10-fold cross-validation.
kfold = KFold(n_splits=10, shuffle=True, random_state=rs)
model = GridSearchCV(DecisionTreeRegressor(max_features="sqrt", random_state=rs), param_grid, cv=kfold, n_jobs=-1)
model.fit(X_train_s, y_train)

# Result recorded by the author from an earlier run:
# {'ccp_alpha': 0.7814179316082759, 'max_depth': 11, 'min_samples_leaf': 1, 'min_samples_split': 2}; test score 0.7849455038096266
print(f"决策树最优超参数：{model.best_params_}\n 综合得分{model.best_estimator_.score(X_test_s, y_test)}")

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Random-forest tuning reference: https://www.jianshu.com/p/f5b45a60289f
param_grid = {
    'max_depth': np.arange(2, 30, 1),  # maximum depth of each tree
    'n_estimators': np.arange(2, 30, 1),  # number of trees in the forest
    'min_samples_split': [2, 3, 4, 5],  # minimum samples required to split a node
    'min_samples_leaf': [1, 2, 3],  # minimum samples required in a leaf
    'max_features': np.arange(2, 30, 1)  # number of features considered per split
}

# The full Cartesian product of the grid above is 28 * 28 * 4 * 3 * 28 = 263,424
# candidates, each fitted 10 times by the CV below -- an exhaustive search does
# not finish in an interactive session. Draw a fixed, seeded sample of
# candidates from the same space instead; raise n_candidates for a finer search.
n_candidates = 300

# min_samples_split / min_samples_leaf are always supplied by the search, so
# they are not set on the base estimator.
model = RandomForestRegressor(verbose=0, random_state=rs)
kfold = KFold(n_splits=10, shuffle=True, random_state=rs)
model = RandomizedSearchCV(model, param_grid, n_iter=n_candidates, cv=kfold, scoring='r2',
                           verbose=0, n_jobs=-1, random_state=rs)
model.fit(X_train_s, y_train)
# Result recorded by the author from an earlier run:
# {'max_depth': 13, 'max_features': 11, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 7}
# test score 0.8483375813154402

print(f"RandomForest 最优超参数：{model.best_params_}\n 综合得分{model.best_estimator_.score(X_test_s, y_test)}")

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x106dc1910>>
Traceback (most recent call last):
  File "/Users/xieheng/projects/envs/digital/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


KeyboardInterrupt: 

In [None]:
from xgboost import XGBRegressor

# 3. XGBoost tuning reference: https://www.cnblogs.com/showmeai/p/16037327.html
# Base settings for the estimator. Any key that also appears in `param_grid`
# is overridden by the search; the rest (objective, gamma, reg_alpha,
# reg_lambda) stay fixed for every candidate.
params = {
   "objective": 'reg:squarederror',
   "n_estimators": 10,
   "max_depth": 9,
   "learning_rate": 0.24,
   "subsample": 0.2,
   "min_child_weight": 1,
   "gamma": 0.1,
   "colsample_bytree": 0.8,
   "reg_alpha": 0,
   "reg_lambda": 0,
   # "scale_pos_weight": 1,
}
param_grid = {
    'n_estimators': np.arange(18, 24, 1),
    'max_depth': np.arange(8, 12, 1),
    'learning_rate': np.arange(0.15, 0.25, 0.01), # [0.05, 0.1, 0.2]
    "min_child_weight": [4, 5, 6, 7],
    # "gamma": [0, 0.1],
    'subsample': np.arange(0.4, 0.7, 0.1),
    # colsample_bytree is a fraction and must lie in (0, 1]; the candidates
    # therefore stop at exactly 1.0 (values above 1 are rejected by XGBoost
    # and would only produce failed fits).
    'colsample_bytree': np.linspace(0.95, 1.0, 6),
}

# Pass the base settings to the estimator (previously `params` was built but
# never used, so gamma / reg_lambda silently fell back to library defaults).
model = XGBRegressor(random_state=rs, **params)
kfold = KFold(n_splits=10, shuffle=True, random_state=rs)
model = GridSearchCV(model, param_grid, cv=kfold, scoring='r2', verbose=0, n_jobs=-1)
model.fit(X_train_s, y_train)
print(f"XGBRegressor 最优超参数：{model.best_params_}\n 综合得分{model.best_estimator_.score(X_test_s, y_test)}")

# Result recorded by the author from an earlier run:
# {'colsample_bytree': 0.9800000000000001, 'gamma': 0.1, 'learning_rate': 0.2, 'max_depth': 11, 'min_child_weight': 5, 'n_estimators': 23, 'subsample': 0.5}   test score 0.8801265955806522

In [None]:
from catboost import CatBoostRegressor

# 4. CatBoost hyper-parameter tuning.
# Candidate values explored by the grid search.
param_grid = dict(
    # iterations=np.arange(100, 401, 50),
    depth=list(range(2, 6)),
    learning_rate=np.arange(0.14, 0.38, 0.01),
    l2_leaf_reg=np.arange(0.2, 1.6, 0.1),
    # bagging_temperature=[0.1, 0.3],
    # border_count=np.arange(35, 190, 1),
)

# Settings held fixed for every candidate; only the keys in `param_grid` vary.
catboost_fixed = dict(
    random_state=rs,
    verbose=0,
    train_dir=None,
    allow_writing_files=False,
    iterations=400,
    bagging_temperature=0.1,
    border_count=182,
)
kfold = KFold(n_splits=10, shuffle=True, random_state=rs)
model = GridSearchCV(
    CatBoostRegressor(**catboost_fixed),
    param_grid,
    cv=kfold,
    scoring='r2',
    verbose=0,
    n_jobs=-1,
)
model.fit(X_train_s, y_train)

best_params = model.best_params_
test_score = model.best_estimator_.score(X_test_s, y_test)
print(f"CatBoost 最优超参数：{best_params}\n 综合得分{test_score}")

# Result recorded by the author from an earlier run:
# {'depth': 2, 'l2_leaf_reg': 0.6000000000000001, 'learning_rate': 0.15000000000000002} test score 0.9445921278676876

In [None]:
from lightgbm import LGBMRegressor

# Base LightGBM settings. Keys that also appear in `param_grid` are overridden
# by the search; the others stay fixed for every candidate.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
# and that is not set here -- confirm whether row bagging was intended.
params = {
    "objective": 'mse',
    'max_depth': 3,
    "n_estimators": 200,
    "learning_rate": 0.1,
    'min_child_samples': 7,
    'reg_alpha': 0,
    'reg_lambda': 0,
    "force_col_wise": True,
    "subsample": 0.8,
    'colsample_bytree': 0.26,
    "num_leaves": 7  # usually chosen within (0, 2^max_depth - 1]; a key parameter with a large effect on model quality
}
param_grid = {
    'max_depth': [2, 3, 4, 5],
    'learning_rate': [0.06, 0.1, 0.12, 0.17, 0.23, 0.24, 0.25, 0.27],
    # "n_estimators": 100,
    'min_child_samples': [5, 6, 7, 8], # around 6
    'colsample_bytree': np.arange(0.25, 0.45, 0.01),  # around 0.25 and 0.45
    "num_leaves": np.arange(3, 8, 1)
}
# Parallelism is handled by GridSearchCV (n_jobs=-1 below), so each individual
# estimator is kept single-threaded. Requesting all cores at both levels makes
# every worker process spawn a full thread pool and oversubscribes the CPU.
model = LGBMRegressor(n_jobs=1, random_state=rs, verbosity=-1, **params)
kfold = KFold(n_splits=10, shuffle=True, random_state=rs)
model = GridSearchCV(model, param_grid, cv=kfold, scoring='r2', verbose=0, n_jobs=-1)
model.fit(X_train_s, y_train)
print(f"LGBMBoost 最优超参数：{model.best_params_}\n 综合得分{model.best_estimator_.score(X_test_s, y_test)}")

In [None]:
from lightgbm import LGBMRegressor
# Fit a single LightGBM regressor with hand-picked settings and report its
# R^2 on the held-out split.
params = dict(
    objective='mse',
    max_depth=3,
    n_estimators=200,
    learning_rate=0.15,
    min_child_samples=7,
    reg_alpha=0,
    reg_lambda=0,
    force_col_wise=True,
    subsample=0.8,
    colsample_bytree=0.32,
    num_leaves=6,
)
model = LGBMRegressor(n_jobs=-1, random_state=rs, verbosity=-1, **params)
model.fit(X_train_s, y_train)
test_r2 = model.score(X_test_s, y_test)
print(test_r2)

In [None]:
from tensorflow import keras
from utils.methods import r2

# A Sequential model is a plain stack of layers in which every layer has
# exactly one input tensor and one output tensor.
n_inputs = X_train_s.shape[1]
model = keras.models.Sequential()
# model.add(keras.Input(shape=(X_train_s.shape[1], )))

# Hidden layer: 30 ReLU units with an L2 weight penalty, followed by light dropout.
model.add(
    keras.layers.Dense(
        units=30,
        activation='relu',
        name="layer1",
        input_shape=(n_inputs, ),
        kernel_regularizer=keras.regularizers.l2(0.02),
    )
)
model.add(keras.layers.Dropout(0.01))

# Output layer: a single linear unit for the regression target.
model.add(keras.layers.Dense(units=1, name="output"))

optimizer = keras.optimizers.Adam(learning_rate=0.02)
model.compile(optimizer=optimizer, loss='mse', metrics=['mse', r2, 'mae'])
# Inspect the model structure:
# utils.plot_model(model, "./assert/feature_importance/bp_model_structure.png", show_shapes=True)
# model.summary()

# Stop once val_loss has not improved for 20 epochs and roll back to the best weights.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

# validation_split: fraction of the training data held out for validation.
# epochs: at most 300 training rounds. batch_size: 30 samples per gradient step.
# shuffle=False: the training data is not reshuffled between epochs.
# verbose: 0 = silent, 1 = progress bar, 2 = one line per epoch.
fit_options = dict(
    validation_split=0.2,
    epochs=300,
    batch_size=30,
    shuffle=False,
    verbose=0,
    callbacks=[early_stopping],
)
hist = model.fit(X_train_s, y_train, **fit_options)

# s = model.evaluate(X_test_s, y_test, verbose=0)
fig, ax = plt.subplots(figsize=(8, 6), dpi=100)

# Training vs. validation loss over the epochs.
# Author's reading of the curve:
#   - The training loss is far below the validation loss, which indicates
#     over-fitting, although overall accuracy is already fairly good.
#   - Late in training (after roughly 150 epochs) the validation MSE falls very
#     slowly. This may mean the model capacity is too large or regularisation
#     is too weak; the regularisation coefficient could be increased.
for history_key, line_fmt, legend_label in (
    ('mse', 'k', 'Train'),
    ('val_mse', 'b', 'Validation'),
):
    ax.plot(hist.history[history_key], line_fmt, label=legend_label)
# plt.axvline(index,linestyle='--', color='k')
ax.set_ylabel('MSE')
ax.set_xlabel('Epoch')
# ax.title('Mean Squared Error')
# model.save("./assert/temp/bp_keras_model.keras")
# pd.DataFrame(hist.history).to_csv('./assert/bp_model_loss.csv')
plt.legend()
plt.show()

In [None]:
# Sensitivity of LightGBM to `num_leaves`, with every other setting held fixed.
params = {
    "objective": 'mse',
    'max_depth': 4,
    "n_estimators": 100,
    "learning_rate": 0.1,
    'min_child_samples': 6,
    'reg_alpha': 0,
    'reg_lambda': 0,
    "force_col_wise": True,
    "subsample": 0.8,
    'colsample_bytree': 0.8,
    # "num_leaves": 7  # usually chosen within (0, 2^max_depth - 1]; a key parameter with a large effect on model quality
}
param_range = np.arange(2, 7, 1)

# Score each candidate with 10-fold cross-validation on the training split.
# Comparing candidates on X_test_s / y_test would leak the held-out data into
# hyper-parameter selection and make the final test score optimistic.
kfold = KFold(n_splits=10, shuffle=True, random_state=rs)
scores = {}
for i in param_range:
    model = LGBMRegressor(n_jobs=-1, random_state=rs, verbosity=-1, num_leaves=i, **params)
    scores[i] = cross_val_score(model, X_train_s, y_train, cv=kfold, scoring='r2').mean()
    # cross_val_score fits clones only; fit here so `model` is left as a fitted
    # estimator after the loop, as it was before.
    model.fit(X_train_s, y_train)

temp_data = pd.Series(scores)
# _ind = min(scores, key=scores.get)
fig, ax = plt.subplots(figsize=[20, 5])
temp_data.plot(ax=ax)
ax.set_xlabel('num_leaves')
ax.set_ylabel('Mean 10-fold CV R^2 (training split)')
plt.show()