In [1]:
import warnings
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')


rs = 132
dataset = pd.read_csv('./data/dataset.csv')
dataset.set_index('股票简称', inplace=True)
features = [
    '净资产收益率(%)', '资产报酬率(%)', 'EBITDA率(%)', '营业利润率(%)', '投入资本回报率(%)', 
    '资产负债率(%)', '权益乘数(%)', '速动比率(%)', '现金流动负债比率(%)', '长期资本负债率(%)',
    '营业收入增长率(%)', '资本保值增值率(%)', '总资产增长率(%)', '资本积累率(%)', '营业利润增长率(%)',
    '总资产周转率', '应收账款周转率', '流动资产周转率', '存货周转率', '现金资产比率(%)', 
    '数字技术应用', '商业模式变革', '智能制造', '现代信息系统',
    '客户集中度(%)', '供应商集中度(%)', '成本费用利润率(%)',
    '研发人员占比(%)', '研发营收比(%)', '发明专利申请数',
    '两权分离率(%)', '独董比例(%)', '董事会规模','股权集中度(%)',
    '员工人均营收比(%)', '提供岗位增长率(%)', '员工收入增长率(%)',
]
label_name = '因子得分'
unit_map = dataset[['股票代码', '行业代码', '所属省份']].to_dict()
# 获取数据集和标签值
y : pd.Series = dataset[label_name]
X : pd.DataFrame = dataset[features].copy(deep=True).astype("float")
# 数据预处理：1.极差标准化；2.数据集划分。
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rs)
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

In [3]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from utils.config import table_translate
from tensorflow import keras
from utils.methods import r2

metrics_matrix = pd.DataFrame(columns=["Model", "R2", "RMSE", "MAE"])
params= {'ccp_alpha': 0.7814179316082759, 'max_depth': 11, 'min_samples_leaf': 1, 'min_samples_split': 2}
model = DecisionTreeRegressor(max_features="sqrt", random_state=rs, **params)
model.fit(X_train_s, y_train)
y_pred = model.predict(X_test_s)
metrics_matrix.loc[metrics_matrix.shape[0]] = ["DecisionTree", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred),mean_absolute_error(y_test, y_pred)]


params = {'max_depth': 13, 'max_features': 11, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 7}
model = RandomForestRegressor(verbose=0, random_state=rs, **params)
model.fit(X_train_s, y_train)
y_pred = model.predict(X_test_s)
metrics_matrix.loc[metrics_matrix.shape[0]] = ["RandomForest", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred),mean_absolute_error(y_test, y_pred)]

params = {"objective": 'reg:squarederror', 'colsample_bytree': 0.92, 'gamma': 0.1, 'learning_rate': 0.2, 'max_depth': 11,  'min_child_weight': 5, 'n_estimators': 21, 'subsample': 0.5, "reg_alpha": 0, "reg_lambda": 0, "scale_pos_weight": 1}
model = XGBRegressor(random_state=rs, **params, n_jobs=-1)
model.fit(X_train_s, y_train)
y_pred = model.predict(X_test_s)
metrics_matrix.loc[metrics_matrix.shape[0]] = ["XGBoost", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred),mean_absolute_error(y_test, y_pred)]
# metrics_matrix.loc[metrics_matrix.shape[0]] = ["XGBoost", 0.897, 83.709, 6.061]

params = {"iterations": 300, "learning_rate": 0.15, "depth": 3, "l2_leaf_reg": 0.6, 'bagging_temperature': 0.1, "border_count": 181}
model = CatBoostRegressor(random_state=rs, verbose=0, train_dir=None, allow_writing_files=False, **params)
model.fit(X_train_s, y_train)
y_pred = model.predict(X_test_s)
metrics_matrix.loc[metrics_matrix.shape[0]] = ["CatBoost", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred),mean_absolute_error(y_test, y_pred)]

params = { "objective": 'mse', 'max_depth': 3, "n_estimators": 200, 'learning_rate': 0.15, 'min_child_samples': 7,  'reg_alpha': 0, 'reg_lambda': 0, "force_col_wise": True, "subsample": 0.8, 'colsample_bytree': 0.32, "num_leaves": 6}
model = LGBMRegressor(n_jobs=-1, random_state=rs, verbosity=-1, **params)
model.fit(X_train_s, y_train)
y_pred = model.predict(X_test_s)
metrics_matrix.loc[metrics_matrix.shape[0]] = ["LGBoost", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred),mean_absolute_error(y_test, y_pred)]

# Sequential 模型适用于普通层堆栈 其中，每层只有一个 input Tensor 和一个 Output Tensor。
model = keras.models.Sequential()
# model.add(keras.Input(shape=(X_train_s.shape[1], )))
model.add(keras.layers.Dense(units=30, activation='relu', name="layer1", input_shape=(X_train_s.shape[1], ),
                               kernel_regularizer=keras.regularizers.l2(0.02)))
model.add(keras.layers.Dropout(0.01))

# 增加输出层
model.add(keras.layers.Dense(units=1, name="output"))
optimizer = keras.optimizers.Adam(learning_rate=0.02)
model.compile(optimizer=optimizer, loss='mse', metrics=['mse', r2, 'mae'])
# 查看模型结构
# utils.plot_model(model, "./assert/feature_importance/bp_model_structure.png", show_shapes=True)
# model.summary()
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
hist = model.fit(X_train_s, y_train, validation_split=0.2, epochs=300, batch_size=30, shuffle=False, verbose=0, callbacks=[early_stopping])
# validation_split是训练集验证集拆分，epochs代表训练300轮，batch_size代表在批量梯度下降时每次选择16个样本，shuffle代表在训练过程中不会将数据反复打乱
# verbose：日志显示，0为不在标准输出流输出日志信息，1为输出进度条记录，2为每个epoch输出一行记录。
s = model.evaluate(X_test_s, y_test, verbose=0)
pd.DataFrame(hist.history).to_csv('./assert/temp/bp_model_loss.csv')
y_pred = model.predict(X_test_s)
metrics_matrix.loc[metrics_matrix.shape[0]] = ["ANN", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred),mean_absolute_error(y_test, y_pred)]
# metrics_matrix.loc[metrics_matrix.shape[0]] = ["ANN", 0.994, 5.215, 1.399]
metrics_matrix = metrics_matrix.round(3)
# table_translate(metrics_matrix, table_name="模型评估表", filename="机器学习建模阶段表格数据")
metrics_matrix 

2025-02-19 20:44:21.291779: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4
2025-02-19 20:44:21.291799: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 24.00 GB
2025-02-19 20:44:21.291804: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 8.00 GB
I0000 00:00:1739969061.291819 1675911 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1739969061.291840 1675911 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-02-19 20:44:21.525120: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


Unnamed: 0,Model,R2,RMSE,MAE
0,DecisionTree,0.786,174.451,9.837
1,RandomForest,0.847,124.47,8.451
2,XGBoost,0.875,101.955,7.024
3,CatBoost,0.956,35.564,4.44
4,LGBoost,0.938,50.493,5.577
5,ANN,0.996,2.948,1.255


In [None]:
import shap
import numpy as np
from types import MethodType
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'simsun' # Times New Roman
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号

# font = {"fontsize":10, "fontfamily": "Songti SC"}  
# font = {"fontsize":10, "fontfamily": "simsun"} 
score = lambda self, X, y: self.evaluate(X, y, verbose=0)[1]

# (1)计算权重特征重要度；
# model = models.load_model("./assert/bp_keras_model.keras")
model.score = MethodType(score, model)
layer_weights = model.layers[0].get_weights()[0]
feature_importance = np.mean(np.abs(layer_weights), axis=1)  # 取绝对值的平均值

# (2)对训练集进行置换重要性分析：计算时间非常长，大概需要4分钟
# result = permutation_importance(model, X_train_s, y_train, n_repeats=200, random_state=42, n_jobs=-1)
# # n_repeats=20：进行50次随机替换
# # Bunch_result:[importance(置换重要度),importance_mean（置换重要度均值）,importance_std（置换重要度标准差）]
# result.importances_mean
# # Bunch_index = Bunch_result.importances_mean.argsort()
# ax.boxplot(Bunch_result.importances[Bunch_index].T, vert=False, labels=nlabels)  #  labels=Xnames

# 检验神经网络模型对 2010 年 164 家上市公司经营绩效的拟合效果，计算神经网络绩效评 价得分与经营绩效实际得分的 Pearson 相关系数，其值为 0.966，均方误差为 0.0013，平均绝 对误差为 0.027，平均相对误差为 3.89%，由此可知 BP 神经网络评价模型对 2010 年民营制造 业上市公司经营绩效的拟合效果非常好。从排名差异来看，由表 4.4 可知，排名前五的上市 公司实际排名和预测排名完全一致，排名后五的上市公司两者排名相差较小；检验排名的次 序相关程度，计算 Spearman 次序相关系数，其值为 0.922，说明预测排名与实际排名基本一 致，因此可以认为 BP 神经网络评价模型对 2010 年 164 家民营制造业上市公司经营绩效的拟 合效果非常好。
shap.initjs()
explainer = shap.DeepExplainer(model, X_train_s)
shap_values = explainer.shap_values(X_test_s)
_shap_values = shap_values.T[0].T

# fig, ax = plt.subplots(figsize=(8, 6), dpi=300)

# bars = ax.barh(features, feature_importance, left=0, height=0.5, color='skyblue')
shap.summary_plot(_shap_values, feature_names=features, sort=False)  # , plot_type="bar", plot_type='bar'

# 设置标题
# plt.title('各类别数据展示')
# ax.set_xlabel('重要度')
# ax.set_ylabel('特征')
# plt.grid(axis='x', alpha=0.5, linestyle='--')

# , transparent=True, bbox_inches='tight'
# plt.show()

In [None]:
"""'
任务：
1.如果计算时间过长，可以先使用 RandomizedSearchCV 进行粗略搜索，再用 GridSearchCV 精调。
2.对于 CatBoost 和 LightGBM，可以通过设置 early_stopping_rounds 参数加快训练。
3.特征重要性分析：
    •XGBoost: model.best_estimator_.feature_importances_
    •LightGBM: model.best_estimator_.booster_.feature_importance()
    •CatBoost: model.best_estimator_.get_feature_importance()


# 检验神经网络模型对 2010 年 164 家上市公司经营绩效的拟合效果，计算神经网络绩效评 价得分与经营绩效实际得分的 Pearson 相关系数，其值为 0.966，均方误差为 0.0013，平均绝 对误差为 0.027，平均相对误差为 3.89%，由此可知 BP 神经网络评价模型对 2010 年民营制造 业上市公司经营绩效的拟合效果非常好。从排名差异来看，由表 4.4 可知，排名前五的上市 公司实际排名和预测排名完全一致，排名后五的上市公司两者排名相差较小；检验排名的次 序相关程度，计算 Spearman 次序相关系数，其值为 0.922，说明预测排名与实际排名基本一 致，因此可以认为 BP 神经网络评价模型对 2010 年 164 家民营制造业上市公司经营绩效的拟 合效果非常好。
"""