In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


# Seed reused by the train/test split and by every model trained afterwards.
rs = 132

# Load the raw table and index its rows by the '股票简称' column.
dataset = pd.read_csv('./data/dataset.csv').set_index('股票简称')

# Columns used as model inputs; the order here fixes the column order of X.
features = [
    '净资产收益率(%)', '资产报酬率(%)', 'EBITDA率(%)', '营业利润率(%)', '投入资本回报率(%)',
    '资产负债率(%)', '权益乘数(%)', '速动比率(%)', '现金流动负债比率(%)', '长期资本负债率(%)',
    '营业收入增长率(%)', '资本保值增值率(%)', '总资产增长率(%)', '资本积累率(%)', '营业利润增长率(%)',
    '总资产周转率', '应收账款周转率', '流动资产周转率', '存货周转率', '现金资产比率(%)',
    '数字技术应用', '商业模式变革', '智能制造', '现代信息系统',
    '客户集中度(%)', '供应商集中度(%)', '成本费用利润率(%)',
    '研发人员占比(%)', '研发营收比(%)', '发明专利申请数',
    '两权分离率(%)', '独董比例(%)', '董事会规模', '股权集中度(%)',
    '员工人均营收比(%)', '提供岗位增长率(%)', '员工收入增长率(%)',
]
# Name of the regression target column.
label_name = '因子得分'

# Per-column lookup ({column: {index: value}}) of the identifying columns,
# keyed by the index set above.
unit_map = dataset[['股票代码', '行业代码', '所属省份']].to_dict()

# Extract the target series and the feature matrix (features cast to float).
y: pd.Series = dataset[label_name]
X: pd.DataFrame = dataset[features].copy(deep=True).astype("float")

# Preprocessing: hold out 20% of the rows as a test set, then min-max scale.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=rs,
)

# The scaler is fitted on the training rows only and then applied to both
# splits, so the test rows do not influence the scaling range.
scaler = MinMaxScaler().fit(X_train)
X_train_s, X_test_s = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor

# Decision tree: fit on the scaled training split, print the test-set score.
params = {
    'ccp_alpha': 0.7814179316082759,
    'max_depth': 11,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
}
model = DecisionTreeRegressor(
    **params,
    max_features="sqrt",
    random_state=rs,
).fit(X_train_s, y_train)
print(model.score(X_test_s, y_test))
# Random forest: fit on the scaled training split, print the test-set score.
params = {
    'max_depth': 13,
    'max_features': 11,
    'min_samples_leaf': 2,
    'min_samples_split': 2,
    'n_estimators': 7,
}
model = RandomForestRegressor(
    **params,
    random_state=rs,
    verbose=0,
).fit(X_train_s, y_train)
print(model.score(X_test_s, y_test))
# XGBoost: fit on the scaled training split, print the test-set score.
params = {
    "objective": 'reg:squarederror',
    'colsample_bytree': 0.92,
    'gamma': 0.1,
    'learning_rate': 0.2,
    'max_depth': 11,
    'min_child_weight': 5,
    'n_estimators': 21,
    'subsample': 0.5,
    # Both regularisation terms are explicitly set to zero.
    "reg_alpha": 0,
    "reg_lambda": 0,
    # NOTE(review): scale_pos_weight is a class-imbalance setting; it looks
    # inert for a regressor -- confirm it can be dropped.
    "scale_pos_weight": 1,
}
model = XGBRegressor(
    n_jobs=-1,
    random_state=rs,
    **params,
).fit(X_train_s, y_train)
print(model.score(X_test_s, y_test))

# CatBoost: fit on the scaled training split, print the test-set score.
params = {
    "iterations": 300,
    # Tuning note: the best value likely lies in 0.01-0.5 or 0.6-0.9,
    # with the former range being more probable.
    "learning_rate": 0.15,
    "depth": 3,
    "l2_leaf_reg": 0.6,
    'bagging_temperature': 0.1,
    # Tuning note: larger values performed better.
    "border_count": 181,
}
model = CatBoostRegressor(
    **params,
    random_state=rs,
    train_dir=None,
    verbose=0,
).fit(X_train_s, y_train)
print(model.score(X_test_s, y_test))


# LightGBM: fit on the scaled training split, print the test-set score.
params = {
    "objective": 'mse',
    'max_depth': 3,
    "n_estimators": 200,
    'learning_rate': 0.15,
    'min_child_samples': 7,
    # Both regularisation terms are explicitly set to zero.
    'reg_alpha': 0,
    'reg_lambda': 0,
    "force_col_wise": True,
    # NOTE(review): LightGBM appears to apply `subsample` only when
    # `subsample_freq` > 0, which is not set here -- confirm whether row
    # bagging was intended before relying on this value.
    "subsample": 0.8,
    'colsample_bytree': 0.32,
    "num_leaves": 6,
}
model = LGBMRegressor(
    **params,
    n_jobs=-1,
    random_state=rs,
    verbosity=-1,
).fit(X_train_s, y_train)
print(model.score(X_test_s, y_test))