In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import lightgbm as lgb

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# 设定递归特征选择后的分子描述符
selected_features = ['BertzCT', 'Chi2v', 'Chi4n', 'Chi4v', 
                     'SMR_VSA3', 'SMR_VSA9', 'SlogP_VSA8', 
                     'TPSA', 'MolMR', 'fr_NH0']


In [3]:
# 读取数据
df = pd.read_csv(r"D:\毕设文章里的代码和数据\毕设文章里的代码和数据\代码\回归\data\allpi3k分子描述符内部数据集.csv",encoding = 'gbk')

In [4]:
# 计算摩根指纹
fingerprints = []
for s in df['smiles']:
    mol = Chem.MolFromSmiles(s)
    if mol is None:
        fingerprints.append(None)
    else:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        arr = np.zeros((2048,), dtype=int)
        DataStructs.ConvertToNumpyArray(fp, arr)
        fingerprints.append(arr)

df['fingerprint'] = fingerprints
df = df[df['fingerprint'].notnull()].reset_index(drop=True)  # 删除无效 SMILES
fingerprint_array = np.array(df['fingerprint'].tolist())  # 转换为 NumPy 数组

# 提取分子描述符
descriptors = df[selected_features]




In [5]:
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# 计算 MACCS 指纹
def compute_maccs(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        maccs = MACCSkeys.GenMACCSKeys(mol)
        return np.array(maccs)
    else:
        return np.zeros((167,))  # MACCS指纹长度为167

# 计算MACCS指纹数组
maccs_fingerprints = np.array([compute_maccs(smiles) for smiles in df['smiles']])

# 标准化分子描述符
scaler = StandardScaler()
X_scaled = scaler.fit_transform(descriptors)

# 合并摩根指纹、MACCS指纹和标准化的描述符
X_combined = np.hstack([X_scaled, fingerprint_array, maccs_fingerprints])
y = df['pIC50'].values

# 划分数据集（70% 训练, 15% 验证, 15% 测试）
# X_train, X_temp, y_train, y_temp = train_test_split(X_combined, y, test_size=0.3, random_state=42)
# X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.15, random_state=42)

In [6]:
# import pickle
# import numpy as np
# import lightgbm as lgb
# import xgboost as xgb
# from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, StackingRegressor
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.neural_network import MLPRegressor
# from sklearn.metrics import r2_score, mean_squared_error

# # --------------------------
# # 假设 X_train, y_train, X_val, y_val, X_test, y_test, scaler, selected_features 已经定义
# # 为了使模型获得更多数据，我们将训练集与验证集合并作为最终的训练数据
# X_train_full = np.concatenate([X_train, X_val])
# y_train_full = np.concatenate([y_train, y_val])

# # --------------------------
# # 定义各个基学习器（基于 scikit-learn 接口）
# # --------------------------
# # LightGBM 模型
# lgb_reg = lgb.LGBMRegressor(
#     n_estimators=500,
#     learning_rate=0.05,
#     num_leaves=31,
#     max_depth=5,
#     reg_alpha=0.1,          # 对应 lambda_l1
#     reg_lambda=0.2,         # 对应 lambda_l2
#     min_child_samples=20,   # 对应 min_data_in_leaf
#     random_state=42
# )

# # XGBoost 模型
# xgb_reg = xgb.XGBRegressor(
#     n_estimators=500,
#     learning_rate=0.05,
#     max_depth=5,
#     reg_alpha=0.1,          # L1 正则化
#     reg_lambda=0.2,         # L2 正则化
#     objective='reg:squarederror',
#     random_state=42
# )

# # 随机森林模型
# rf_model = RandomForestRegressor(
#     n_estimators=100,
#     max_depth=10,
#     random_state=42
# )

# # KNN 模型
# knn_model = KNeighborsRegressor()  # 默认参数，可根据需要调整

# # ExtraTrees 模型（ETR）
# etr_model = ExtraTreesRegressor(
#     n_estimators=100,
#     max_depth=10,
#     random_state=42
# )

# # MLP 模型
# mlp_model = MLPRegressor(
#     hidden_layer_sizes=(100, ),
#     activation='relu',
#     solver='adam',
#     max_iter=500,
#     random_state=42
# )

# # --------------------------
# # 构建 Stacking 集成模型
# # --------------------------
# # 使用 XGBoost 模型作为元学习器进行融合预测
# stacking_reg = StackingRegressor(
#     estimators=[
#         ('lgb', lgb_reg),
#         ('xgb', xgb_reg),
#         ('rf', rf_model),
#         ('knn', knn_model),
#         ('etr', etr_model),
#         ('mlp', mlp_model)
#     ],
#     final_estimator=xgb.XGBRegressor(
#         n_estimators=200,
#         learning_rate=0.1,
#         max_depth=3,
#         objective='reg:squarederror',
#         random_state=42
#     ),
#     cv=5,
#     n_jobs=-1
# )

# # --------------------------
# # 训练 stacking 模型
# # --------------------------
# stacking_reg.fit(X_train_full, y_train_full)

# # --------------------------
# # 模型评估函数
# # --------------------------
# def evaluate_model(name, model, X_train, y_train, X_val, y_val, X_test, y_test):
#     y_train_pred = model.predict(X_train)
#     y_val_pred = model.predict(X_val)
#     y_test_pred = model.predict(X_test)
#     print(f"==== {name} Performance ====")
#     print(f"Train  R²: {r2_score(y_train, y_train_pred):.3f} | Train  MSE: {mean_squared_error(y_train, y_train_pred):.3f}")
#     print(f"Valid  R²: {r2_score(y_val, y_val_pred):.3f} | Valid  MSE: {mean_squared_error(y_val, y_val_pred):.3f}")
#     print(f"Test   R²: {r2_score(y_test, y_test_pred):.3f} | Test   MSE: {mean_squared_error(y_test, y_test_pred):.3f}")
#     print("\n")

# # --------------------------
# # 分别评估各个单模型（基学习器）的表现
# # --------------------------
# lgb_reg.fit(X_train_full, y_train_full)
# evaluate_model("LightGBM", lgb_reg, X_train, y_train, X_val, y_val, X_test, y_test)

# xgb_reg.fit(X_train_full, y_train_full)
# evaluate_model("XGBoost", xgb_reg, X_train, y_train, X_val, y_val, X_test, y_test)

# rf_model.fit(X_train_full, y_train_full)
# evaluate_model("Random Forest", rf_model, X_train, y_train, X_val, y_val, X_test, y_test)

# knn_model.fit(X_train_full, y_train_full)
# evaluate_model("KNN", knn_model, X_train, y_train, X_val, y_val, X_test, y_test)

# etr_model.fit(X_train_full, y_train_full)
# evaluate_model("ExtraTrees", etr_model, X_train, y_train, X_val, y_val, X_test, y_test)

# mlp_model.fit(X_train_full, y_train_full)
# evaluate_model("MLP", mlp_model, X_train, y_train, X_val, y_val, X_test, y_test)

# # --------------------------
# # 评估 Stacking 集成模型
# # --------------------------
# evaluate_model("Stacking Ensemble", stacking_reg, X_train, y_train, X_val, y_val, X_test, y_test)

# # --------------------------
# # 保存模型及预处理对象
# # --------------------------
# model_dict = {
#     'stacking_model3': stacking_reg,
#     'scaler': scaler,
#     'selected_features': selected_features
# }
# model_path = "stacking_ensemble_XGBoost_model.pkl"
# with open(model_path, 'wb') as f:
#     pickle.dump(model_dict, f)
# print(f"Models saved to {model_path}")


In [7]:
# 下面是一个完整示例，展示了如何使用 LightGBM、XGBoost、随机森林、KNN、ExtraTrees（ETR）和 MLP 模型作为基学习器，通过 stacking 集成（以线性回归作为元学习器）来进行回归任务。代码中假设数据集（X_train, y_train, X_val, y_val, X_test, y_test）以及预处理对象（scaler、selected_features）已提前准备好。请根据实际情况调整参数和预处理流程。

# ```python
import pickle
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# --------------------------
# 假设 X_train, y_train, X_val, y_val, X_test, y_test, scaler, selected_features 已经定义
# # 为了使模型获得更多数据，我们将训练集与验证集合并作为最终的训练数据
# X_train_full = np.concatenate([X_train, X_val])
# y_train_full = np.concatenate([y_train, y_val])

# --------------------------
# 定义各个基学习器（基于 scikit-learn 接口）
# --------------------------
# LightGBM 模型
lgb_reg = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=5,
    reg_alpha=0.1,          # 对应 lambda_l1
    reg_lambda=0.2,         # 对应 lambda_l2
    min_child_samples=20,   # 对应 min_data_in_leaf
    random_state=42
)

# XGBoost 模型
xgb_reg = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    reg_alpha=0.1,          # L1 正则化
    reg_lambda=0.2,         # L2 正则化
    objective='reg:squarederror',
    random_state=42
)

# 随机森林模型
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42
)

# KNN 模型
knn_model = KNeighborsRegressor()  # 默认参数，可根据需要调整

# ExtraTrees 模型（ETR）
etr_model = ExtraTreesRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42
)

# MLP 模型
mlp_model = MLPRegressor(
    hidden_layer_sizes=(100, ),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42
)

# --------------------------
# 构建 Stacking 集成模型
# --------------------------
# 使用线性回归作为元学习器进行融合预测
stacking_reg = StackingRegressor(
    estimators=[
        ('lgb', lgb_reg),
        ('xgb', xgb_reg),
        ('rf', rf_model),
        ('knn', knn_model),
        ('etr', etr_model),
        ('mlp', mlp_model)
    ],
    final_estimator=LinearRegression(),
    cv=5,
    n_jobs=-1
)

# --------------------------
# 训练 stacking 模型
# --------------------------
stacking_reg.fit(X_train, y_train)

# --------------------------
# 模型评估函数
# --------------------------
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error  # 新增导入MAE

def evaluate_model(name, model, X_train, y_train, X_test, y_test):  # 注意参数列表需要包含X_val和y_val
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    print(f"\n==== {name} Performance ====")
    # 添加MAE指标，保持对齐格式
    print(f"Train  R²: {r2_score(y_train, y_train_pred):.4f} | "
          f"MSE: {mean_squared_error(y_train, y_train_pred):.4f} | "
          f"MAE: {mean_absolute_error(y_train, y_train_pred):.4f}")
    
    print(f"Test   R²: {r2_score(y_test, y_test_pred):.4f} | "
          f"MSE: {mean_squared_error(y_test, y_test_pred):.4f} | "
          f"MAE: {mean_absolute_error(y_test, y_test_pred):.4f}")
    print("\n")

# --------------------------
# 分别评估各个单模型（基学习器）的表现
# --------------------------
# 为了比较效果，这里分别对各模型进行在合并数据集上的训练和单独评估
lgb_reg.fit(X_train, y_train)
evaluate_model("LightGBM", lgb_reg, X_train, y_train, X_test, y_test)

xgb_reg.fit(X_train, y_train)
evaluate_model("XGBoost", xgb_reg, X_train, y_train,X_test, y_test)

rf_model.fit(X_train, y_train)
evaluate_model("Random Forest", rf_model, X_train, y_train, X_test, y_test)

knn_model.fit(X_train, y_train)
evaluate_model("KNN", knn_model, X_train, y_train,  X_test, y_test)

etr_model.fit(X_train, y_train)
evaluate_model("ExtraTrees", etr_model, X_train, y_train, X_test, y_test)

mlp_model.fit(X_train, y_train)
evaluate_model("MLP", mlp_model, X_train, y_train, X_test, y_test)

# --------------------------
# 评估 Stacking 集成模型
# --------------------------
evaluate_model("Stacking Ensemble", stacking_reg, X_train, y_train, X_test, y_test)

# --------------------------
# 保存模型及预处理对象
# --------------------------
model_dict = {
    'stacking_model2': stacking_reg,
    'scaler': scaler,
    'selected_features': selected_features
}
model_path = "stacking_ensemble_regression_model2.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(model_dict, f)
print(f"Models saved to {model_path}")


# ### 说明

# - **基学习器**：代码中分别定义了 LightGBM、XGBoost、随机森林、KNN、ExtraTrees（ETR）和 MLP 模型。  
# - **Stacking 集成**：使用 `StackingRegressor` 将上述模型的预测结果作为输入特征，通过 5 折交叉验证构造元特征，并采用线性回归作为最终元学习器。  
# - **数据准备**：将训练集与验证集合并，以获得更多训练数据用于模型训练；各模型在单独数据集上的评估帮助您观察各自表现。  
# - **模型保存**：将最终的 stacking 模型以及预处理对象保存到 pickle 文件中，方便后续加载使用。

# 请根据实际情况对模型参数、预处理和评估策略进行适当调整。

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.080620 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6009
[LightGBM] [Info] Number of data points in the train set: 8845, number of used features: 1900
[LightGBM] [Info] Start training from score 6.971937

==== LightGBM Performance ====
Train  R²: 0.7849 | MSE: 0.3246 | MAE: 0.4295
Test   R²: 0.6991 | MSE: 0.4557 | MAE: 0.5075







==== XGBoost Performance ====
Train  R²: 0.8268 | MSE: 0.2613 | MAE: 0.3924
Test   R²: 0.7103 | MSE: 0.4387 | MAE: 0.5012



==== Random Forest Performance ====
Train  R²: 0.6262 | MSE: 0.5641 | MAE: 0.5752
Test   R²: 0.5370 | MSE: 0.7011 | MAE: 0.6426



==== KNN Performance ====
Train  R²: 0.8138 | MSE: 0.2810 | MAE: 0.3780
Test   R²: 0.7253 | MSE: 0.4159 | MAE: 0.4596



==== ExtraTrees Performance ====
Train  R²: 0.5844 | MSE: 0.6272 | MAE: 0.5964
Test   R²: 0.4946 | MSE: 0.7653 | MAE: 0.6681



==== MLP Performance ====
Train  R²: 0.9740 | MSE: 0.0392 | MAE: 0.1349
Test   R²: 0.6084 | MSE: 0.5929 | MAE: 0.5625







==== Stacking Ensemble Performance ====
Train  R²: 0.8839 | MSE: 0.1752 | MAE: 0.3070
Test   R²: 0.7600 | MSE: 0.3635 | MAE: 0.4372


Models saved to stacking_ensemble_regression_model2.pkl


In [8]:
# import pickle
# import numpy as np
# import lightgbm as lgb
# import xgboost as xgb
# from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, StackingRegressor
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.neural_network import MLPRegressor
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# # --------------------------
# # 假设 X_train, y_train, X_val, y_val, X_test, y_test, scaler, selected_features 已经定义
# # 为了使模型获得更多数据，我们将训练集与验证集合并作为最终的训练数据
# X_train_full = np.concatenate([X_train, X_val])
# y_train_full = np.concatenate([y_train, y_val])

# # --------------------------
# # 定义各个基学习器（基于 scikit-learn 接口）
# # --------------------------
# # LightGBM 模型
# lgb_reg = lgb.LGBMRegressor(
#     n_estimators=500,
#     learning_rate=0.05,
#     num_leaves=31,
#     max_depth=5,
#     reg_alpha=0.1,          # 对应 lambda_l1
#     reg_lambda=0.2,         # 对应 lambda_l2
#     min_child_samples=20,   # 对应 min_data_in_leaf
#     random_state=42
# )

# # XGBoost 模型
# xgb_reg = xgb.XGBRegressor(
#     n_estimators=500,
#     learning_rate=0.05,
#     max_depth=5,
#     reg_alpha=0.1,          # L1 正则化
#     reg_lambda=0.2,         # L2 正则化
#     objective='reg:squarederror',
#     random_state=42
# )

# # 随机森林模型
# rf_model = RandomForestRegressor(
#     n_estimators=100,
#     max_depth=10,
#     random_state=42
# )

# # KNN 模型
# knn_model = KNeighborsRegressor()  # 默认参数，可根据需要调整

# # ExtraTrees 模型（ETR）
# etr_model = ExtraTreesRegressor(
#     n_estimators=100,
#     max_depth=10,
#     random_state=42
# )

# # MLP 模型
# mlp_model = MLPRegressor(
#     hidden_layer_sizes=(100, ),
#     activation='relu',
#     solver='adam',
#     max_iter=500,
#     random_state=42
# )

# # --------------------------
# # 构建 Stacking 集成模型
# # --------------------------
# # 使用线性回归作为元学习器进行融合预测
# stacking_reg = StackingRegressor(
#     estimators=[
#         ('lgb', lgb_reg),
#         ('xgb', xgb_reg),
#         ('rf', rf_model),
#         ('knn', knn_model),
#         ('etr', etr_model),
#         ('mlp', mlp_model)
#     ],
#     final_estimator=LinearRegression(),
#     cv=5,
#     n_jobs=-1
# )

# # --------------------------
# # 训练 stacking 模型
# # --------------------------
# stacking_reg.fit(X_train_full, y_train_full)

# # --------------------------
# # 模型评估函数（增加 MAE 指标）
# # --------------------------
# def evaluate_model(name, model, X_train, y_train, X_val, y_val, X_test, y_test):
#     y_train_pred = model.predict(X_train)
#     y_val_pred = model.predict(X_val)
#     y_test_pred = model.predict(X_test)
#     print(f"==== {name} Performance ====")
#     print(f"Train  R²: {r2_score(y_train, y_train_pred):.3f} | Train  MSE: {mean_squared_error(y_train, y_train_pred):.3f} | Train  MAE: {mean_absolute_error(y_train, y_train_pred):.3f}")
#     print(f"Valid  R²: {r2_score(y_val, y_val_pred):.3f} | Valid  MSE: {mean_squared_error(y_val, y_val_pred):.3f} | Valid  MAE: {mean_absolute_error(y_val, y_val_pred):.3f}")
#     print(f"Test   R²: {r2_score(y_test, y_test_pred):.3f} | Test   MSE: {mean_squared_error(y_test, y_test_pred):.3f} | Test   MAE: {mean_absolute_error(y_test, y_test_pred):.3f}")
#     print("\n")
#     return y_train_pred, y_val_pred, y_test_pred  # 返回预测结果，便于后续绘图使用

# # --------------------------
# # 分别评估各个单模型（基学习器）的表现
# # --------------------------
# # 为了比较效果，这里分别对各模型进行在合并数据集上的训练和单独评估
# lgb_reg.fit(X_train_full, y_train_full)
# train_pred_lgb, val_pred_lgb, test_pred_lgb = evaluate_model("LightGBM", lgb_reg, X_train, y_train, X_val, y_val, X_test, y_test)

# xgb_reg.fit(X_train_full, y_train_full)
# train_pred_xgb, val_pred_xgb, test_pred_xgb = evaluate_model("XGBoost", xgb_reg, X_train, y_train, X_val, y_val, X_test, y_test)

# rf_model.fit(X_train_full, y_train_full)
# train_pred_rf, val_pred_rf, test_pred_rf = evaluate_model("Random Forest", rf_model, X_train, y_train, X_val, y_val, X_test, y_test)

# knn_model.fit(X_train_full, y_train_full)
# train_pred_knn, val_pred_knn, test_pred_knn = evaluate_model("KNN", knn_model, X_train, y_train, X_val, y_val, X_test, y_test)

# etr_model.fit(X_train_full, y_train_full)
# train_pred_etr, val_pred_etr, test_pred_etr = evaluate_model("ExtraTrees", etr_model, X_train, y_train, X_val, y_val, X_test, y_test)

# mlp_model.fit(X_train_full, y_train_full)
# train_pred_mlp, val_pred_mlp, test_pred_mlp = evaluate_model("MLP", mlp_model, X_train, y_train, X_val, y_val, X_test, y_test)

# # --------------------------
# # 评估 Stacking 集成模型
# # --------------------------
# train_pred_stack, val_pred_stack, test_pred_stack = evaluate_model("Stacking Ensemble", stacking_reg, X_train, y_train, X_val, y_val, X_test, y_test)

# # --------------------------
# # 保存模型及预处理对象
# # --------------------------
# model_dict = {
#     'stacking_model2': stacking_reg,
#     'scaler': scaler,
#     'selected_features': selected_features
# }
# model_path = "stacking_ensemble_regression_model2.pkl"
# with open(model_path, 'wb') as f:
#     pickle.dump(model_dict, f)
# print(f"Models saved to {model_path}")


In [9]:
# import matplotlib.pyplot as plt
# import pandas as pd

# # 组织绘图数据：假设使用 stacking 模型的预测结果
# # 构造 DataFrame，添加数据集类型列
# data_train = pd.DataFrame({'y_true': y_train, 'y_pred': train_pred_stack})
# data_train['dataset'] = 'Train'
# data_val = pd.DataFrame({'y_true': y_val, 'y_pred': val_pred_stack})
# data_val['dataset'] = 'Validation'
# data_test = pd.DataFrame({'y_true': y_test, 'y_pred': test_pred_stack})
# data_test['dataset'] = 'Test'

# # 合并所有数据
# scatter_data = pd.concat([data_train, data_val, data_test], axis=0).reset_index(drop=True)

# # 保存散点图数据到 CSV 文件
# csv_output_path = "D:\图数据\Stacking-内部散点图数据scatter_plot_data.csv"
# scatter_data.to_csv(csv_output_path, index=False)
# print(f"Scatter plot data saved to {csv_output_path}")

# # 绘制散点图：不同数据集用不同颜色，散点设置为空心
# plt.figure(figsize=(8, 6), dpi=300)  # dpi=300 保证高清

# markers = {'Train': 'o', 'Validation': 's', 'Test': '^'}
# colors = {'Train': 'blue', 'Validation': 'green', 'Test': 'red'}

# for ds in scatter_data['dataset'].unique():
#     ds_data = scatter_data[scatter_data['dataset'] == ds]
#     plt.scatter(ds_data['y_true'], ds_data['y_pred'], 
#                 label=ds, alpha=0.8, marker=markers[ds],
#                 facecolors='none', edgecolors=colors[ds], s=60)

# # 绘制 y=x 参考线
# min_val = scatter_data[['y_true', 'y_pred']].min().min()
# max_val = scatter_data[['y_true', 'y_pred']].max().max()
# plt.plot([min_val, max_val], [min_val, max_val], 'k--', lw=1, label='Ideal')

# plt.xlabel("True Value")
# plt.ylabel("Predicted Value")
# plt.title("Scatter Plot of True vs Predicted Values (Stacking Ensemble)")
# plt.legend()
# plt.tight_layout()

# # 保存高清图像
# image_output_path = "D:\图数据\Stacking-内部散点图scatter_plot_high_res.png"
# plt.savefig(image_output_path, dpi=300)
# print(f"High resolution scatter plot saved to {image_output_path}")

# plt.show()


In [10]:
# # 评估 Stacking 集成模型
# # --------------------------
# evaluate_model("Stacking Ensemble", stacking_reg, X_train, y_train, X_val, y_val, X_test, y_test)

# # --------------------------
# # 保存模型及预处理对象
# # --------------------------
# model_dict = {
#     'stacking_model2': stacking_reg,
#     'scaler': scaler,
#     'selected_features': selected_features
# }
# model_path = "stacking_ensemble_regression_model2.pkl"
# with open(model_path, 'wb') as f:
#     pickle.dump(model_dict, f)
# print(f"Models saved to {model_path}")

In [11]:
import pickle

# --------------------------
# 加载模型及预处理对象
# --------------------------
model_path = "stacking_ensemble_regression_model2.pkl"

# 加载保存的模型字典
with open(model_path, 'rb') as f:
    loaded_model_dict = pickle.load(f)

# 提取模型和预处理对象
loaded_stacking_model2 = loaded_model_dict['stacking_model2']
loaded_scaler = loaded_model_dict['scaler']
loaded_selected_features = loaded_model_dict['selected_features']

print("模型和预处理对象加载成功！")

模型和预处理对象加载成功！


In [12]:
# import numpy as np
# import pandas as pd
# from rdkit import Chem, DataStructs
# from rdkit.Chem import AllChem, MACCSkeys
# from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# import matplotlib.pyplot as plt

# # ----------------------------
# # 外部验证
# external_csv = rexternal_csv = rF:\回归数据\allpi3k分子描述符外部验证集.csv"
# df_ext = pd.read_csv(external_csv, encoding="gbk")

# # 计算外部数据的摩根指纹和MACCS指纹
# fingerprints_ext = []
# maccs_fingerprints_ext = []

# for s in df_ext['smiles']:
#     mol = Chem.MolFromSmiles(s)
#     if mol is None:
#         fingerprints_ext.append(None)
#         maccs_fingerprints_ext.append(None)
#     else:
#         # 计算摩根指纹
#         fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
#         arr_fp = np.zeros((2048,), dtype=int)
#         DataStructs.ConvertToNumpyArray(fp, arr_fp)
#         fingerprints_ext.append(arr_fp)
        
#         # 计算MACCS指纹
#         maccs = MACCSkeys.GenMACCSKeys(mol)
#         arr_maccs = np.array(maccs)
#         maccs_fingerprints_ext.append(arr_maccs)

# # 过滤掉指纹为空的分子
# df_ext['fingerprint'] = fingerprints_ext
# df_ext['maccs_fingerprint'] = maccs_fingerprints_ext
# df_ext = df_ext[df_ext['fingerprint'].notnull() & df_ext['maccs_fingerprint'].notnull()].reset_index(drop=True)

# # 转换为数组
# fingerprint_array_ext = np.array(df_ext['fingerprint'].tolist())
# maccs_fingerprint_array_ext = np.array(df_ext['maccs_fingerprint'].tolist())

# # 提取外部数据的描述符（注意：selected_features和scaler需与训练时保持一致）
# descriptors_ext = df_ext[selected_features]

# # 标准化描述符
# descriptors_ext_scaled = scaler.transform(descriptors_ext)

# # 合并摩根指纹、MACCS指纹和标准化描述符
# X_combined_ext = np.hstack([descriptors_ext_scaled, fingerprint_array_ext, maccs_fingerprint_array_ext])






In [13]:
# # 预测
# y_pred_ext = loaded_stacking_model2.predict(X_combined_ext)
# df_ext['pIC50_pred'] = y_pred_ext

# # 外部验证集性能评估
# y_ext = df_ext['pIC50']
# r2_ext = r2_score(y_ext, y_pred_ext)
# mse_ext = mean_squared_error(y_ext, y_pred_ext)
# mae_ext = mean_absolute_error(y_ext, y_pred_ext)

# print("==== External validation performance ====")
# print(f"R²: {r2_ext:.3f}")
# print(f"MSE: {mse_ext:.3f}")
# print(f"MAE: {mae_ext:.3f}")

# # 保存预测结果
# output_file = r"D:\图数据\Stacking-external_predictions.csv"
# df_ext.to_csv(output_file, index=False)
# print(f"External predictions saved to {output_file}")

In [14]:
# # ----------------------------
# # 绘制散点图，并保存散点图数据和高清图

# # 散点图数据：真实值 vs 预测值
# scatter_data = df_ext[['pIC50', 'pIC50_pred']]
# scatter_csv = r"D:\回归数据\scatter_data.csv"
# scatter_data.to_csv(scatter_csv, index=False)
# print(f"Scatter plot data saved to {scatter_csv}")

# # 绘制散点图
# plt.figure(figsize=(8, 6))
# plt.scatter(y_ext, y_pred_ext, alpha=0.7, edgecolors='k')
# plt.plot([y_ext.min(), y_ext.max()], [y_ext.min(), y_ext.max()], 'r--', lw=2)
# plt.xlabel('True pIC50')
# plt.ylabel('Predicted pIC50')
# plt.title('External Validation: True vs Predicted pIC50')
# plt.grid(True)

# # 保存高清散点图，dpi=300
# scatter_png = r"D:\图数据\Stacking-散点图catter_plot.png"
# plt.savefig(scatter_png, dpi=1000, bbox_inches='tight')
# print(f"Scatter plot image saved to {scatter_png}")

# plt.show()

In [15]:
from rdkit.Chem import MACCSkeys
# 外部验证
external_csv = rexternal_csv = r"D:\毕设文章里的代码和数据\毕设文章里的代码和数据\代码\回归\data\allpi3k分子描述符外部验证集.csv"
df_ext = pd.read_csv(external_csv, encoding="gbk")

# 计算外部数据的摩根指纹和MACCS指纹
fingerprints_ext = []
maccs_fingerprints_ext = []

for s in df_ext['smiles']:
    mol = Chem.MolFromSmiles(s)
    if mol is None:
        fingerprints_ext.append(None)
        maccs_fingerprints_ext.append(None)
    else:
        # 计算摩根指纹
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        arr_fp = np.zeros((2048,), dtype=int)
        DataStructs.ConvertToNumpyArray(fp, arr_fp)
        fingerprints_ext.append(arr_fp)
        
        # 计算MACCS指纹
        maccs = MACCSkeys.GenMACCSKeys(mol)
        arr_maccs = np.array(maccs)
        maccs_fingerprints_ext.append(arr_maccs)

# 过滤掉指纹为空的分子
df_ext['fingerprint'] = fingerprints_ext
df_ext['maccs_fingerprint'] = maccs_fingerprints_ext
df_ext = df_ext[df_ext['fingerprint'].notnull() & df_ext['maccs_fingerprint'].notnull()].reset_index(drop=True)

# 转换为数组
fingerprint_array_ext = np.array(df_ext['fingerprint'].tolist())
maccs_fingerprint_array_ext = np.array(df_ext['maccs_fingerprint'].tolist())

# 提取外部数据的描述符
descriptors_ext = df_ext[selected_features]

# 标准化描述符
descriptors_ext_scaled = scaler.transform(descriptors_ext)

# 合并摩根指纹、MACCS指纹和标准化描述符
X_combined_ext = np.hstack([descriptors_ext_scaled, fingerprint_array_ext, maccs_fingerprint_array_ext])

# 预测
y_pred_ext = loaded_stacking_model2.predict(X_combined_ext)
df_ext['pIC50_pred'] = y_pred_ext

# 外部验证集性能评估
y_ext = df_ext['pIC50']
r2_ext = r2_score(y_ext, y_pred_ext)
mse_ext = mean_squared_error(y_ext, y_pred_ext)
mae_ext = mean_absolute_error(y_ext, y_pred_ext)  # 新增MAE计算

print("==== External validation performance ====")
print(f"R²:  {r2_ext:.4f}")
print(f"MSE: {mse_ext:.4f}")
print(f"MAE: {mae_ext:.4f}")  # 新增MAE输出

# # 保存预测结果
# output_file = "external_predictions.csv"
# df_ext.to_csv(output_file, index=False)
# print(f"External predictions saved to {output_file}")




==== External validation performance ====
R²:  0.7112
MSE: 0.4502
MAE: 0.4759


In [2]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys, Descriptors
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import lightgbm as lgb

# ============================
# 天然产物预测

# 载入天然产物数据
natural_product_excel = r"C:\Users\dell\Desktop\CHENBEL_fenlei8.xlsx"
df_np = pd.read_excel(natural_product_excel)

# 计算天然产物数据的摩根指纹和MACCS指纹ce
fingerprints_np = []
maccs_fingerprints_np = []
valid_indices = []

for i, s in enumerate(df_np['SMILES']):
    # 检查SMILES是否为字符串且不为缺失值
    if not isinstance(s, str) or pd.isnull(s):
        fingerprints_np.append(None)
        maccs_fingerprints_np.append(None)
        continue

    mol = Chem.MolFromSmiles(s)
    if mol is None:
        fingerprints_np.append(None)
        maccs_fingerprints_np.append(None)
    else:
        # 计算摩根指纹
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        arr_fp = np.zeros((2048,), dtype=int)
        DataStructs.ConvertToNumpyArray(fp, arr_fp)
        fingerprints_np.append(arr_fp)

        # 计算MACCS指纹
        maccs = MACCSkeys.GenMACCSKeys(mol)
        arr_maccs = np.array(maccs)
        maccs_fingerprints_np.append(arr_maccs)
        valid_indices.append(i)

# 将指纹添加到DataFrame中
df_np['fingerprint'] = fingerprints_np
df_np['maccs_fingerprint'] = maccs_fingerprints_np

# 过滤掉指纹为空的分子
df_np = df_np[df_np['fingerprint'].notnull() & df_np['maccs_fingerprint'].notnull()].reset_index(drop=True)

# 转换为数组
fingerprint_array_np = np.array(df_np['fingerprint'].tolist())
maccs_fingerprint_array_np = np.array(df_np['maccs_fingerprint'].tolist())

# 提取天然产物数据的描述符
selected_features = ['BertzCT', 'Chi2v', 'Chi4n', 'Chi4v', 'SMR_VSA3', 'SMR_VSA9', 'SlogP_VSA8', 'TPSA', 'MolMR', 'fr_NH0']

def calculate_descriptors(mol):
    """计算分子描述符"""
    if mol is None:
        return None
    descriptors = {}
    for desc_name in selected_features:
        try:
            # 调用描述符函数
            desc_value = getattr(Descriptors, desc_name)(mol)
            descriptors[desc_name] = desc_value
        except AttributeError:
            descriptors[desc_name] = np.nan
    return descriptors

# 计算并存储描述符
descriptors_list = []
for s in df_np['SMILES']:
    mol = Chem.MolFromSmiles(s)
    descriptors = calculate_descriptors(mol)
    if descriptors is not None:
        descriptors_list.append(descriptors)
    else:
        descriptors_list.append({feature: np.nan for feature in selected_features})

# 将描述符转换为DataFrame，并合并到原数据中
descriptors_df = pd.DataFrame(descriptors_list)
df_np = pd.concat([df_np, descriptors_df], axis=1)

# 检查描述符中是否存在缺失值
print("描述符缺失值统计：")
print(df_np[selected_features].isnull().sum())

# 对描述符进行缺失值填充（使用均值填充）
imputer = SimpleImputer(strategy='mean')
descriptors_np_imputed = imputer.fit_transform(df_np[selected_features])

# 标准化描述符
scaler = StandardScaler()
descriptors_np_scaled = scaler.fit_transform(descriptors_np_imputed)

# 合并摩根指纹、MACCS指纹和标准化描述符
X_combined_np = np.hstack([descriptors_np_scaled, fingerprint_array_np, maccs_fingerprint_array_np])

# 载入已经训练好的stacking模型（请确保变量loaded_stacking_model已定义）
# 示例：loaded_stacking_model = lgb.Booster(model_file='path_to_model.txt')
# 此处假设loaded_stacking_model已经加载完毕
# 注意：请确保使用的模型与输入特征匹配
# 如示例：
# loaded_stacking_model = lgb.Booster(model_file=r"F:\models\stacking_model.txt")

# 使用加载好的模型进行预测
y_pred_np = loaded_stacking_model2.predict(X_combined_np)
df_np['pIC50_pred'] = y_pred_np

# 只保留需要输出的列
#df_output = df_np[['英文名称', 'SMILES', 'Predicted Label','pIC50_pred']]
df_output = df_np[['new_id', 'SMILES', 'pIC50_pred']]

# 保存天然产物预测结果
output_file_np = r"C:\Users\dell\Desktop\CHENBEL_fenlei8_pIC50.xlsx"
df_output.to_excel(output_file_np, index=False)
print(f"Natural product predictions saved to {output_file_np}")


描述符缺失值统计：
BertzCT       0
Chi2v         0
Chi4n         0
Chi4v         0
SMR_VSA3      0
SMR_VSA9      0
SlogP_VSA8    0
TPSA          0
MolMR         0
fr_NH0        0
dtype: int64




NameError: name 'loaded_stacking_model2' is not defined