In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import pickle
# Seed NumPy's global random number generator.
np.random.seed(30)

# Read the Excel file; the code below uses its 'smiles' and 'pIC50' columns.
df = pd.read_excel('LCK_ac.xlsx')

# Convert each SMILES string to an RDKit molecule object.
# MolFromSmiles returns None (it does not raise) for a string it cannot parse,
# and it rejects non-string input such as NaN from an empty spreadsheet cell,
# so non-strings are mapped to None here as well.
ms = [Chem.MolFromSmiles(x) if isinstance(x, str) else None for x in df['smiles']]

# Drop rows whose SMILES could not be parsed. Filtering `df` itself (rather
# than only the molecule list) keeps the fingerprints and the pIC50 labels
# aligned row-for-row.
parsed_ok = np.array([m is not None for m in ms], dtype=bool)
if not parsed_ok.all():
    print(f"Dropping {int((~parsed_ok).sum())} row(s) with missing or unparsable SMILES")
    df = df.loc[parsed_ok].reset_index(drop=True)
    ms = [m for m in ms if m is not None]

# New column holding the hashed Atom Pair fingerprint (bit vector) per molecule.
df['AP_FP'] = [AllChem.GetHashedAtomPairFingerprintAsBitVect(x) for x in ms]

# Save fingerprints
with open('ap_fingerprints.pkl', 'wb') as f:
    pickle.dump(df['AP_FP'].values, f)

# Save labels
with open('labels.pkl', 'wb') as f:
    pickle.dump(df['pIC50'].values, f)

# Load fingerprints back. These files were written just above; pickle.load
# must never be pointed at files from an untrusted source.
with open('ap_fingerprints.pkl', 'rb') as f:
    ap_fingerprints = pickle.load(f)

# Load labels
with open('labels.pkl', 'rb') as f:
    labels = pickle.load(f)

# Build the feature array from the fingerprint objects.
# NOTE(review): presumably each bit vector expands into one row of 0/1 values
# (n_molecules x n_bits) - confirm the resulting shape and dtype.
ap_fingerprints = np.array(list(ap_fingerprints))

# Split to separate out the training and test set (10% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(ap_fingerprints, labels, test_size=0.1, random_state=42)


In [None]:
###LightGBM
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression

# Stacked ensemble whose meta-learner is a LightGBM regressor.

# Explicit seed for every estimator that accepts random_state, so the fit does
# not depend on how much of NumPy's global random stream earlier cells have
# consumed (i.e. on cell execution order). Same value as the np.random.seed
# call in the data-preparation cell.
RANDOM_STATE = 30

# Define the base models.
# NOTE(review): tree_method='gpu_hist' needs a CUDA-enabled XGBoost build; newer
# XGBoost releases reportedly deprecate it in favour of tree_method='hist' with
# device='cuda' - confirm against the installed version.
base_learners = [
    ('rf', RandomForestRegressor(random_state=RANDOM_STATE)),
    ('xgb', XGBRegressor(tree_method='gpu_hist',
                         subsample=1,
                         n_estimators=500,
                         min_child_weight=5,
                         max_depth=10,
                         learning_rate=0.01,
                         reg_lambda=0.1,
                         colsample_bytree=0.5,
                         reg_alpha=0,
                         random_state=RANDOM_STATE)),
    ('knn', KNeighborsRegressor(metric='manhattan',
                                n_neighbors=5,
                                weights='distance')),
    ('lgbm', LGBMRegressor(random_state=RANDOM_STATE))
]

# Define the meta-model that combines the base learners' predictions.
meta_learner = LGBMRegressor(random_state=RANDOM_STATE)

# Define the stacking model.
stacking_model = StackingRegressor(estimators=base_learners,
                                   final_estimator=meta_learner)

# Fit on the training split.
stacking_model.fit(X_train, y_train)

# Predict on both splits. The train-set predictions are made on the same data
# the model was fitted on, so the train metrics below are optimistic.
stacking_train_predictions = stacking_model.predict(X_train)
stacking_predictions = stacking_model.predict(X_test)

# Calculate and print R^2 score
stacking_train_r2 = r2_score(y_train, stacking_train_predictions)
stacking_test_r2 = r2_score(y_test, stacking_predictions)
print("Train R^2 score: ", stacking_train_r2)
print("Test R^2 score: ", stacking_test_r2)

# Calculate and print RMSE
stacking_rmse_train = np.sqrt(mean_squared_error(y_train, stacking_train_predictions))
stacking_rmse_test = np.sqrt(mean_squared_error(y_test, stacking_predictions))
print(f"Train RMSE: {stacking_rmse_train}")
print(f"Test RMSE: {stacking_rmse_test}")


In [5]:
# Serialize the current `stacking_model` object to disk with pickle.
with open("LG_stacking_model.pkl", "wb") as model_file:
    pickle.dump(stacking_model, model_file)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.gridspec import GridSpec

# Predicted-vs-true joint plot with marginal distributions for the current
# `stacking_model` predictions (train in blue, test in red).

# Use seaborn's default theme.
sns.set_theme()

# Create a new 10x10 inch figure rendered at 1200 dpi.
# NOTE(review): the original comment said 300 dpi while the code used 1200;
# the code value is kept - confirm which resolution is intended.
fig = plt.figure(figsize=(10, 10), dpi=1200)

# Create the layout grid.
gs = GridSpec(4, 4)

# Create the sub-plots. The marginal axes share their data axis with the joint
# axes so the distributions line up with the scatter points beneath/beside them.
ax_joint = fig.add_subplot(gs[1:4, 0:3])
ax_marg_x = fig.add_subplot(gs[0, 0:3], sharex=ax_joint)
ax_marg_y = fig.add_subplot(gs[1:4, 3], sharey=ax_joint)

# Scatter of true values (x) against predictions (y).
ax_joint.scatter(y_train, stacking_train_predictions, color='blue', s=20, alpha=0.5, label='Train')
ax_joint.scatter(y_test, stacking_predictions, color='red', s=20, alpha=0.5, label='Test')

# Top marginal: density histogram plus KDE curve of the true values.
# histplot + kdeplot replace the deprecated sns.distplot.
sns.histplot(x=y_train, bins=25, stat="density", color="deepskyblue", alpha=0.6, ax=ax_marg_x)
sns.kdeplot(x=y_train, color="blue", linewidth=2, label="KDE", ax=ax_marg_x)
sns.histplot(x=y_test, bins=25, stat="density", color="salmon", alpha=0.6, ax=ax_marg_x)
sns.kdeplot(x=y_test, color="red", linewidth=2, ax=ax_marg_x)

# Right marginal: the same for the predictions, drawn along the y axis.
sns.histplot(y=stacking_train_predictions, bins=25, stat="density", color="deepskyblue", alpha=0.6, ax=ax_marg_y)
sns.kdeplot(y=stacking_train_predictions, color="blue", linewidth=2, ax=ax_marg_y)
sns.histplot(y=stacking_predictions, bins=25, stat="density", color="salmon", alpha=0.6, ax=ax_marg_y)
sns.kdeplot(y=stacking_predictions, color="red", linewidth=2, ax=ax_marg_y)

# Hide the background and axes of the marginal plots.
ax_marg_x.axis('off')
ax_marg_y.axis('off')

# Add a diagonal line representing perfect prediction, spanning the range of
# the true values.
ax_joint.plot([min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
              [min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
              color='gray', linestyle='--')

# Add grid lines.
ax_joint.grid(True, linestyle='--', alpha=0.6)

# Add the legend.
ax_joint.legend()

# Add axis labels.
ax_joint.set_xlabel('True Values', fontsize=14)
ax_joint.set_ylabel('Predictions', fontsize=14)

# Title (disabled).
#ax_joint.set_title('Stacking Model Predictions vs True Values', fontsize=16)

# Adjust margins and spacing.
plt.tight_layout()

# Show the figure.
plt.show()


In [None]:
###RF
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression

# Stacked ensemble whose meta-learner is a random forest regressor.

# Explicit seed for every estimator that accepts random_state, so the fit does
# not depend on how much of NumPy's global random stream earlier cells have
# consumed (i.e. on cell execution order). Same value as the np.random.seed
# call in the data-preparation cell.
RANDOM_STATE = 30

# Define the base models.
# NOTE(review): tree_method='gpu_hist' needs a CUDA-enabled XGBoost build; newer
# XGBoost releases reportedly deprecate it in favour of tree_method='hist' with
# device='cuda' - confirm against the installed version.
base_learners = [
    ('rf', RandomForestRegressor(random_state=RANDOM_STATE)),
    ('xgb', XGBRegressor(tree_method='gpu_hist',
                         subsample=1,
                         n_estimators=500,
                         min_child_weight=5,
                         max_depth=10,
                         learning_rate=0.01,
                         reg_lambda=0.1,
                         colsample_bytree=0.5,
                         reg_alpha=0,
                         random_state=RANDOM_STATE)),
    ('knn', KNeighborsRegressor(metric='manhattan',
                                n_neighbors=5,
                                weights='distance')),
    ('lgbm', LGBMRegressor(random_state=RANDOM_STATE))
]

# Define the meta-model that combines the base learners' predictions.
meta_learner = RandomForestRegressor(random_state=RANDOM_STATE)

# Define the stacking model.
stacking_model = StackingRegressor(estimators=base_learners,
                                   final_estimator=meta_learner)

# Fit on the training split.
stacking_model.fit(X_train, y_train)

# Predict on both splits. The train-set predictions are made on the same data
# the model was fitted on, so the train metrics below are optimistic.
stacking_train_predictions = stacking_model.predict(X_train)
stacking_predictions = stacking_model.predict(X_test)

# Calculate and print R^2 score
stacking_train_r2 = r2_score(y_train, stacking_train_predictions)
stacking_test_r2 = r2_score(y_test, stacking_predictions)
print("Train R^2 score: ", stacking_train_r2)
print("Test R^2 score: ", stacking_test_r2)

# Calculate and print RMSE
stacking_rmse_train = np.sqrt(mean_squared_error(y_train, stacking_train_predictions))
stacking_rmse_test = np.sqrt(mean_squared_error(y_test, stacking_predictions))
print(f"Train RMSE: {stacking_rmse_train}")
print(f"Test RMSE: {stacking_rmse_test}")


In [8]:
# Serialize the current `stacking_model` object to disk with pickle.
with open("RF_stacking_model.pkl", "wb") as model_file:
    pickle.dump(stacking_model, model_file)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.gridspec import GridSpec

# Predicted-vs-true joint plot with marginal distributions for the current
# `stacking_model` predictions (train in blue, test in red).

# Use seaborn's default theme.
sns.set_theme()

# Create a new 10x10 inch figure rendered at 1200 dpi.
# NOTE(review): the original comment said 300 dpi while the code used 1200;
# the code value is kept - confirm which resolution is intended.
fig = plt.figure(figsize=(10, 10), dpi=1200)

# Create the layout grid.
gs = GridSpec(4, 4)

# Create the sub-plots. The marginal axes share their data axis with the joint
# axes so the distributions line up with the scatter points beneath/beside them.
ax_joint = fig.add_subplot(gs[1:4, 0:3])
ax_marg_x = fig.add_subplot(gs[0, 0:3], sharex=ax_joint)
ax_marg_y = fig.add_subplot(gs[1:4, 3], sharey=ax_joint)

# Scatter of true values (x) against predictions (y).
ax_joint.scatter(y_train, stacking_train_predictions, color='blue', s=20, alpha=0.5, label='Train')
ax_joint.scatter(y_test, stacking_predictions, color='red', s=20, alpha=0.5, label='Test')

# Top marginal: density histogram plus KDE curve of the true values.
# histplot + kdeplot replace the deprecated sns.distplot.
sns.histplot(x=y_train, bins=25, stat="density", color="deepskyblue", alpha=0.6, ax=ax_marg_x)
sns.kdeplot(x=y_train, color="blue", linewidth=2, label="KDE", ax=ax_marg_x)
sns.histplot(x=y_test, bins=25, stat="density", color="salmon", alpha=0.6, ax=ax_marg_x)
sns.kdeplot(x=y_test, color="red", linewidth=2, ax=ax_marg_x)

# Right marginal: the same for the predictions, drawn along the y axis.
sns.histplot(y=stacking_train_predictions, bins=25, stat="density", color="deepskyblue", alpha=0.6, ax=ax_marg_y)
sns.kdeplot(y=stacking_train_predictions, color="blue", linewidth=2, ax=ax_marg_y)
sns.histplot(y=stacking_predictions, bins=25, stat="density", color="salmon", alpha=0.6, ax=ax_marg_y)
sns.kdeplot(y=stacking_predictions, color="red", linewidth=2, ax=ax_marg_y)

# Hide the background and axes of the marginal plots.
ax_marg_x.axis('off')
ax_marg_y.axis('off')

# Add a diagonal line representing perfect prediction, spanning the range of
# the true values.
ax_joint.plot([min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
              [min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
              color='gray', linestyle='--')

# Add grid lines.
ax_joint.grid(True, linestyle='--', alpha=0.6)

# Add the legend.
ax_joint.legend()

# Add axis labels.
ax_joint.set_xlabel('True Values', fontsize=14)
ax_joint.set_ylabel('Predictions', fontsize=14)

# Title (disabled).
#ax_joint.set_title('Stacking Model Predictions vs True Values', fontsize=16)

# Adjust margins and spacing.
plt.tight_layout()

# Show the figure.
plt.show()


In [10]:
###SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
# Stacked ensemble whose meta-learner is a support vector regressor.

# Explicit seed for every estimator that accepts random_state, so the fit does
# not depend on how much of NumPy's global random stream earlier cells have
# consumed (i.e. on cell execution order). Same value as the np.random.seed
# call in the data-preparation cell.
RANDOM_STATE = 30

# Define the base models.
# NOTE(review): tree_method='gpu_hist' needs a CUDA-enabled XGBoost build; newer
# XGBoost releases reportedly deprecate it in favour of tree_method='hist' with
# device='cuda' - confirm against the installed version.
base_learners = [
    ('rf', RandomForestRegressor(random_state=RANDOM_STATE)),
    ('xgb', XGBRegressor(tree_method='gpu_hist',
                         subsample=1,
                         n_estimators=500,
                         min_child_weight=5,
                         max_depth=10,
                         learning_rate=0.01,
                         reg_lambda=0.1,
                         colsample_bytree=0.5,
                         reg_alpha=0,
                         random_state=RANDOM_STATE)),
    ('knn', KNeighborsRegressor(metric='manhattan',
                                n_neighbors=5,
                                weights='distance')),
    ('lgbm', LGBMRegressor(random_state=RANDOM_STATE))
]

# Define the meta-model that combines the base learners' predictions.
# SVR takes no random_state, so only the base learners are seeded.
meta_learner = SVR()

# Define the stacking model.
stacking_model = StackingRegressor(estimators=base_learners,
                                   final_estimator=meta_learner)

# Fit on the training split.
stacking_model.fit(X_train, y_train)

# Predict on both splits. The train-set predictions are made on the same data
# the model was fitted on, so the train metrics below are optimistic.
stacking_train_predictions = stacking_model.predict(X_train)
stacking_predictions = stacking_model.predict(X_test)

# Calculate and print R^2 score
stacking_train_r2 = r2_score(y_train, stacking_train_predictions)
stacking_test_r2 = r2_score(y_test, stacking_predictions)
print("Train R^2 score: ", stacking_train_r2)
print("Test R^2 score: ", stacking_test_r2)

# Calculate and print RMSE
stacking_rmse_train = np.sqrt(mean_squared_error(y_train, stacking_train_predictions))
stacking_rmse_test = np.sqrt(mean_squared_error(y_test, stacking_predictions))
print(f"Train RMSE: {stacking_rmse_train}")
print(f"Test RMSE: {stacking_rmse_test}")


Train R^2 score:  0.985408442323822
Test R^2 score:  0.7254579980907926
Train RMSE: 0.18018858020028933
Test RMSE: 0.7384167518938417


In [11]:
# Serialize the current `stacking_model` object to disk with pickle.
with open("SVR_stacking_model.pkl", "wb") as model_file:
    pickle.dump(stacking_model, model_file)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.gridspec import GridSpec

# Predicted-vs-true joint plot with marginal distributions for the current
# `stacking_model` predictions (train in blue, test in red).

# Use seaborn's default theme.
sns.set_theme()

# Create a new 10x10 inch figure rendered at 1200 dpi.
# NOTE(review): the original comment said 300 dpi while the code used 1200;
# the code value is kept - confirm which resolution is intended.
fig = plt.figure(figsize=(10, 10), dpi=1200)

# Create the layout grid.
gs = GridSpec(4, 4)

# Create the sub-plots. The marginal axes share their data axis with the joint
# axes so the distributions line up with the scatter points beneath/beside them.
ax_joint = fig.add_subplot(gs[1:4, 0:3])
ax_marg_x = fig.add_subplot(gs[0, 0:3], sharex=ax_joint)
ax_marg_y = fig.add_subplot(gs[1:4, 3], sharey=ax_joint)

# Scatter of true values (x) against predictions (y).
ax_joint.scatter(y_train, stacking_train_predictions, color='blue', s=20, alpha=0.5, label='Train')
ax_joint.scatter(y_test, stacking_predictions, color='red', s=20, alpha=0.5, label='Test')

# Top marginal: density histogram plus KDE curve of the true values.
# histplot + kdeplot replace the deprecated sns.distplot.
sns.histplot(x=y_train, bins=25, stat="density", color="deepskyblue", alpha=0.6, ax=ax_marg_x)
sns.kdeplot(x=y_train, color="blue", linewidth=2, label="KDE", ax=ax_marg_x)
sns.histplot(x=y_test, bins=25, stat="density", color="salmon", alpha=0.6, ax=ax_marg_x)
sns.kdeplot(x=y_test, color="red", linewidth=2, ax=ax_marg_x)

# Right marginal: the same for the predictions, drawn along the y axis.
sns.histplot(y=stacking_train_predictions, bins=25, stat="density", color="deepskyblue", alpha=0.6, ax=ax_marg_y)
sns.kdeplot(y=stacking_train_predictions, color="blue", linewidth=2, ax=ax_marg_y)
sns.histplot(y=stacking_predictions, bins=25, stat="density", color="salmon", alpha=0.6, ax=ax_marg_y)
sns.kdeplot(y=stacking_predictions, color="red", linewidth=2, ax=ax_marg_y)

# Hide the background and axes of the marginal plots.
ax_marg_x.axis('off')
ax_marg_y.axis('off')

# Add a diagonal line representing perfect prediction, spanning the range of
# the true values.
ax_joint.plot([min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
              [min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
              color='gray', linestyle='--')

# Add grid lines.
ax_joint.grid(True, linestyle='--', alpha=0.6)

# Add the legend.
ax_joint.legend()

# Add axis labels.
ax_joint.set_xlabel('True Values', fontsize=14)
ax_joint.set_ylabel('Predictions', fontsize=14)

# Title (disabled).
#ax_joint.set_title('Stacking Model Predictions vs True Values', fontsize=16)

# Adjust margins and spacing.
plt.tight_layout()

# Show the figure.
plt.show()


In [None]:
###MLP
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
# Stacked ensemble whose meta-learner is a multi-layer perceptron regressor.

# Explicit seed for every estimator that accepts random_state, so the fit does
# not depend on how much of NumPy's global random stream earlier cells have
# consumed (i.e. on cell execution order). Same value as the np.random.seed
# call in the data-preparation cell.
RANDOM_STATE = 30

# Define the base models.
# NOTE(review): tree_method='gpu_hist' needs a CUDA-enabled XGBoost build; newer
# XGBoost releases reportedly deprecate it in favour of tree_method='hist' with
# device='cuda' - confirm against the installed version.
base_learners = [
    ('rf', RandomForestRegressor(random_state=RANDOM_STATE)),
    ('xgb', XGBRegressor(tree_method='gpu_hist',
                         subsample=1,
                         n_estimators=500,
                         min_child_weight=5,
                         max_depth=10,
                         learning_rate=0.01,
                         reg_lambda=0.1,
                         colsample_bytree=0.5,
                         reg_alpha=0,
                         random_state=RANDOM_STATE)),
    ('knn', KNeighborsRegressor(metric='manhattan',
                                n_neighbors=5,
                                weights='distance')),
    ('lgbm', LGBMRegressor(random_state=RANDOM_STATE))
]

# Define the meta-model that combines the base learners' predictions.
meta_learner = MLPRegressor(random_state=RANDOM_STATE)

# Define the stacking model.
stacking_model = StackingRegressor(estimators=base_learners,
                                   final_estimator=meta_learner)

# Fit on the training split.
stacking_model.fit(X_train, y_train)

# Predict on both splits. The train-set predictions are made on the same data
# the model was fitted on, so the train metrics below are optimistic.
stacking_train_predictions = stacking_model.predict(X_train)
stacking_predictions = stacking_model.predict(X_test)

# Calculate and print R^2 score
stacking_train_r2 = r2_score(y_train, stacking_train_predictions)
stacking_test_r2 = r2_score(y_test, stacking_predictions)
print("Train R^2 score: ", stacking_train_r2)
print("Test R^2 score: ", stacking_test_r2)

# Calculate and print RMSE
stacking_rmse_train = np.sqrt(mean_squared_error(y_train, stacking_train_predictions))
stacking_rmse_test = np.sqrt(mean_squared_error(y_test, stacking_predictions))
print(f"Train RMSE: {stacking_rmse_train}")
print(f"Test RMSE: {stacking_rmse_test}")


In [14]:
# Serialize the current `stacking_model` object to disk with pickle.
with open("MLP_stacking_model.pkl", "wb") as model_file:
    pickle.dump(stacking_model, model_file)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.gridspec import GridSpec

# Predicted-vs-true joint plot with marginal distributions for the current
# `stacking_model` predictions (train in blue, test in red).

# Use seaborn's default theme.
sns.set_theme()

# Create a new 10x10 inch figure rendered at 1200 dpi.
# NOTE(review): the original comment said 300 dpi while the code used 1200;
# the code value is kept - confirm which resolution is intended.
fig = plt.figure(figsize=(10, 10), dpi=1200)

# Create the layout grid.
gs = GridSpec(4, 4)

# Create the sub-plots. The marginal axes share their data axis with the joint
# axes so the distributions line up with the scatter points beneath/beside them.
ax_joint = fig.add_subplot(gs[1:4, 0:3])
ax_marg_x = fig.add_subplot(gs[0, 0:3], sharex=ax_joint)
ax_marg_y = fig.add_subplot(gs[1:4, 3], sharey=ax_joint)

# Scatter of true values (x) against predictions (y).
ax_joint.scatter(y_train, stacking_train_predictions, color='blue', s=20, alpha=0.5, label='Train')
ax_joint.scatter(y_test, stacking_predictions, color='red', s=20, alpha=0.5, label='Test')

# Top marginal: density histogram plus KDE curve of the true values.
# histplot + kdeplot replace the deprecated sns.distplot.
sns.histplot(x=y_train, bins=25, stat="density", color="deepskyblue", alpha=0.6, ax=ax_marg_x)
sns.kdeplot(x=y_train, color="blue", linewidth=2, label="KDE", ax=ax_marg_x)
sns.histplot(x=y_test, bins=25, stat="density", color="salmon", alpha=0.6, ax=ax_marg_x)
sns.kdeplot(x=y_test, color="red", linewidth=2, ax=ax_marg_x)

# Right marginal: the same for the predictions, drawn along the y axis.
sns.histplot(y=stacking_train_predictions, bins=25, stat="density", color="deepskyblue", alpha=0.6, ax=ax_marg_y)
sns.kdeplot(y=stacking_train_predictions, color="blue", linewidth=2, ax=ax_marg_y)
sns.histplot(y=stacking_predictions, bins=25, stat="density", color="salmon", alpha=0.6, ax=ax_marg_y)
sns.kdeplot(y=stacking_predictions, color="red", linewidth=2, ax=ax_marg_y)

# Hide the background and axes of the marginal plots.
ax_marg_x.axis('off')
ax_marg_y.axis('off')

# Add a diagonal line representing perfect prediction, spanning the range of
# the true values.
ax_joint.plot([min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
              [min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
              color='gray', linestyle='--')

# Add grid lines.
ax_joint.grid(True, linestyle='--', alpha=0.6)

# Add the legend.
ax_joint.legend()

# Add axis labels.
ax_joint.set_xlabel('True Values', fontsize=14)
ax_joint.set_ylabel('Predictions', fontsize=14)

# Title (disabled).
#ax_joint.set_title('Stacking Model Predictions vs True Values', fontsize=16)

# Adjust margins and spacing.
plt.tight_layout()

# Show the figure.
plt.show()


In [23]:
###XGB
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
# Stacked ensemble whose meta-learner is an XGBoost regressor configured with
# the same hyper-parameters as the XGBoost base learner.

# Explicit seed for every estimator that accepts random_state, so the fit does
# not depend on how much of NumPy's global random stream earlier cells have
# consumed (i.e. on cell execution order). Same value as the np.random.seed
# call in the data-preparation cell.
RANDOM_STATE = 30

# Define the base models.
# NOTE(review): tree_method='gpu_hist' needs a CUDA-enabled XGBoost build; newer
# XGBoost releases reportedly deprecate it in favour of tree_method='hist' with
# device='cuda' - confirm against the installed version.
base_learners = [
    ('rf', RandomForestRegressor(random_state=RANDOM_STATE)),
    ('xgb', XGBRegressor(tree_method='gpu_hist',
                         subsample=1,
                         n_estimators=500,
                         min_child_weight=5,
                         max_depth=10,
                         learning_rate=0.01,
                         reg_lambda=0.1,
                         colsample_bytree=0.5,
                         reg_alpha=0,
                         random_state=RANDOM_STATE)),
    ('knn', KNeighborsRegressor(metric='manhattan',
                                n_neighbors=5,
                                weights='distance')),
    ('lgbm', LGBMRegressor(random_state=RANDOM_STATE))
]

# Define the meta-model that combines the base learners' predictions.
meta_learner = XGBRegressor(tree_method='gpu_hist',
                            subsample=1,
                            n_estimators=500,
                            min_child_weight=5,
                            max_depth=10,
                            learning_rate=0.01,
                            reg_lambda=0.1,
                            colsample_bytree=0.5,
                            reg_alpha=0,
                            random_state=RANDOM_STATE)

# Define the stacking model.
stacking_model = StackingRegressor(estimators=base_learners,
                                   final_estimator=meta_learner)

# Fit on the training split.
stacking_model.fit(X_train, y_train)

# Predict on both splits. The train-set predictions are made on the same data
# the model was fitted on, so the train metrics below are optimistic.
stacking_train_predictions = stacking_model.predict(X_train)
stacking_predictions = stacking_model.predict(X_test)

# Calculate and print R^2 score
stacking_train_r2 = r2_score(y_train, stacking_train_predictions)
stacking_test_r2 = r2_score(y_test, stacking_predictions)
print("Train R^2 score: ", stacking_train_r2)
print("Test R^2 score: ", stacking_test_r2)

# Calculate and print RMSE
stacking_rmse_train = np.sqrt(mean_squared_error(y_train, stacking_train_predictions))
stacking_rmse_test = np.sqrt(mean_squared_error(y_test, stacking_predictions))
print(f"Train RMSE: {stacking_rmse_train}")
print(f"Test RMSE: {stacking_rmse_test}")


Train R^2 score:  0.9520149472978143
Test R^2 score:  0.6914181204509753
Train RMSE: 0.32676040319168265
Test RMSE: 0.782856843757578


In [24]:
# Serialize the current `stacking_model` object to disk with pickle.
with open("XGB_stacking_model.pkl", "wb") as model_file:
    pickle.dump(stacking_model, model_file)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.gridspec import GridSpec

# Predicted-vs-true joint plot with marginal distributions for the current
# `stacking_model` predictions (train in blue, test in red).

# Use seaborn's default theme.
sns.set_theme()

# Create a new 10x10 inch figure rendered at 1200 dpi.
# NOTE(review): the original comment said 300 dpi while the code used 1200;
# the code value is kept - confirm which resolution is intended.
fig = plt.figure(figsize=(10, 10), dpi=1200)

# Create the layout grid.
gs = GridSpec(4, 4)

# Create the sub-plots. The marginal axes share their data axis with the joint
# axes so the distributions line up with the scatter points beneath/beside them.
ax_joint = fig.add_subplot(gs[1:4, 0:3])
ax_marg_x = fig.add_subplot(gs[0, 0:3], sharex=ax_joint)
ax_marg_y = fig.add_subplot(gs[1:4, 3], sharey=ax_joint)

# Scatter of true values (x) against predictions (y).
ax_joint.scatter(y_train, stacking_train_predictions, color='blue', s=20, alpha=0.5, label='Train')
ax_joint.scatter(y_test, stacking_predictions, color='red', s=20, alpha=0.5, label='Test')

# Top marginal: density histogram plus KDE curve of the true values.
# histplot + kdeplot replace the deprecated sns.distplot.
sns.histplot(x=y_train, bins=25, stat="density", color="deepskyblue", alpha=0.6, ax=ax_marg_x)
sns.kdeplot(x=y_train, color="blue", linewidth=2, label="KDE", ax=ax_marg_x)
sns.histplot(x=y_test, bins=25, stat="density", color="salmon", alpha=0.6, ax=ax_marg_x)
sns.kdeplot(x=y_test, color="red", linewidth=2, ax=ax_marg_x)

# Right marginal: the same for the predictions, drawn along the y axis.
sns.histplot(y=stacking_train_predictions, bins=25, stat="density", color="deepskyblue", alpha=0.6, ax=ax_marg_y)
sns.kdeplot(y=stacking_train_predictions, color="blue", linewidth=2, ax=ax_marg_y)
sns.histplot(y=stacking_predictions, bins=25, stat="density", color="salmon", alpha=0.6, ax=ax_marg_y)
sns.kdeplot(y=stacking_predictions, color="red", linewidth=2, ax=ax_marg_y)

# Hide the background and axes of the marginal plots.
ax_marg_x.axis('off')
ax_marg_y.axis('off')

# Add a diagonal line representing perfect prediction, spanning the range of
# the true values.
ax_joint.plot([min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
              [min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
              color='gray', linestyle='--')

# Add grid lines.
ax_joint.grid(True, linestyle='--', alpha=0.6)

# Add the legend.
ax_joint.legend()

# Add axis labels.
ax_joint.set_xlabel('True Values', fontsize=14)
ax_joint.set_ylabel('Predictions', fontsize=14)

# Title (disabled).
#ax_joint.set_title('Stacking Model Predictions vs True Values', fontsize=16)

# Adjust margins and spacing.
plt.tight_layout()

# Show the figure.
plt.show()


In [26]:
###KNN
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
# Stacked ensemble whose meta-learner is a k-nearest-neighbours regressor
# configured like the KNN base learner.

# Explicit seed for every estimator that accepts random_state, so the fit does
# not depend on how much of NumPy's global random stream earlier cells have
# consumed (i.e. on cell execution order). Same value as the np.random.seed
# call in the data-preparation cell.
RANDOM_STATE = 30

# Define the base models.
# NOTE(review): tree_method='gpu_hist' needs a CUDA-enabled XGBoost build; newer
# XGBoost releases reportedly deprecate it in favour of tree_method='hist' with
# device='cuda' - confirm against the installed version.
base_learners = [
    ('rf', RandomForestRegressor(random_state=RANDOM_STATE)),
    ('xgb', XGBRegressor(tree_method='gpu_hist',
                         subsample=1,
                         n_estimators=500,
                         min_child_weight=5,
                         max_depth=10,
                         learning_rate=0.01,
                         reg_lambda=0.1,
                         colsample_bytree=0.5,
                         reg_alpha=0,
                         random_state=RANDOM_STATE)),
    ('knn', KNeighborsRegressor(metric='manhattan',
                                n_neighbors=5,
                                weights='distance')),
    ('lgbm', LGBMRegressor(random_state=RANDOM_STATE))
]

# Define the meta-model that combines the base learners' predictions.
# KNeighborsRegressor takes no random_state, so only the base learners are seeded.
meta_learner = KNeighborsRegressor(metric='manhattan',
                                   n_neighbors=5,
                                   weights='distance')

# Define the stacking model.
stacking_model = StackingRegressor(estimators=base_learners,
                                   final_estimator=meta_learner)

# Fit on the training split.
stacking_model.fit(X_train, y_train)

# Predict on both splits. The train-set predictions are made on the same data
# the model was fitted on, so the train metrics below are optimistic.
stacking_train_predictions = stacking_model.predict(X_train)
stacking_predictions = stacking_model.predict(X_test)

# Calculate and print R^2 score
stacking_train_r2 = r2_score(y_train, stacking_train_predictions)
stacking_test_r2 = r2_score(y_test, stacking_predictions)
print("Train R^2 score: ", stacking_train_r2)
print("Test R^2 score: ", stacking_test_r2)

# Calculate and print RMSE
stacking_rmse_train = np.sqrt(mean_squared_error(y_train, stacking_train_predictions))
stacking_rmse_test = np.sqrt(mean_squared_error(y_test, stacking_predictions))
print(f"Train RMSE: {stacking_rmse_train}")
print(f"Test RMSE: {stacking_rmse_test}")


Train R^2 score:  0.9291367693626342
Test R^2 score:  0.6829905543921403
Train RMSE: 0.39708813370641666
Test RMSE: 0.7934749924729837


In [27]:
# Serialize the current `stacking_model` object to disk with pickle.
with open("KNN_stacking_model.pkl", "wb") as model_file:
    pickle.dump(stacking_model, model_file)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.gridspec import GridSpec

# Predicted-vs-true joint plot with marginal distributions for the current
# `stacking_model` predictions (train in blue, test in red).

# Use seaborn's default theme.
sns.set_theme()

# Create a new 10x10 inch figure rendered at 1200 dpi.
# NOTE(review): the original comment said 300 dpi while the code used 1200;
# the code value is kept - confirm which resolution is intended.
fig = plt.figure(figsize=(10, 10), dpi=1200)

# Create the layout grid.
gs = GridSpec(4, 4)

# Create the sub-plots. The marginal axes share their data axis with the joint
# axes so the distributions line up with the scatter points beneath/beside them.
ax_joint = fig.add_subplot(gs[1:4, 0:3])
ax_marg_x = fig.add_subplot(gs[0, 0:3], sharex=ax_joint)
ax_marg_y = fig.add_subplot(gs[1:4, 3], sharey=ax_joint)

# Scatter of true values (x) against predictions (y).
ax_joint.scatter(y_train, stacking_train_predictions, color='blue', s=20, alpha=0.5, label='Train')
ax_joint.scatter(y_test, stacking_predictions, color='red', s=20, alpha=0.5, label='Test')

# Top marginal: density histogram plus KDE curve of the true values.
# histplot + kdeplot replace the deprecated sns.distplot.
sns.histplot(x=y_train, bins=25, stat="density", color="deepskyblue", alpha=0.6, ax=ax_marg_x)
sns.kdeplot(x=y_train, color="blue", linewidth=2, label="KDE", ax=ax_marg_x)
sns.histplot(x=y_test, bins=25, stat="density", color="salmon", alpha=0.6, ax=ax_marg_x)
sns.kdeplot(x=y_test, color="red", linewidth=2, ax=ax_marg_x)

# Right marginal: the same for the predictions, drawn along the y axis.
sns.histplot(y=stacking_train_predictions, bins=25, stat="density", color="deepskyblue", alpha=0.6, ax=ax_marg_y)
sns.kdeplot(y=stacking_train_predictions, color="blue", linewidth=2, ax=ax_marg_y)
sns.histplot(y=stacking_predictions, bins=25, stat="density", color="salmon", alpha=0.6, ax=ax_marg_y)
sns.kdeplot(y=stacking_predictions, color="red", linewidth=2, ax=ax_marg_y)

# Hide the background and axes of the marginal plots.
ax_marg_x.axis('off')
ax_marg_y.axis('off')

# Add a diagonal line representing perfect prediction, spanning the range of
# the true values.
ax_joint.plot([min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
              [min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())],
              color='gray', linestyle='--')

# Add grid lines.
ax_joint.grid(True, linestyle='--', alpha=0.6)

# Add the legend.
ax_joint.legend()

# Add axis labels.
ax_joint.set_xlabel('True Values', fontsize=14)
ax_joint.set_ylabel('Predictions', fontsize=14)

# Title (disabled).
#ax_joint.set_title('Stacking Model Predictions vs True Values', fontsize=16)

# Adjust margins and spacing.
plt.tight_layout()

# Show the figure.
plt.show()


In [None]:
###KNN
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
# Stacked ensemble whose meta-learner is a k-nearest-neighbours regressor
# configured like the KNN base learner.
# The original cell contained this entire pipeline pasted twice back-to-back,
# which fitted and reported the same stack two times; it now runs once.

# Explicit seed for every estimator that accepts random_state, so the fit does
# not depend on how much of NumPy's global random stream earlier cells have
# consumed (i.e. on cell execution order). Same value as the np.random.seed
# call in the data-preparation cell.
RANDOM_STATE = 30

# Define the base models.
# NOTE(review): tree_method='gpu_hist' needs a CUDA-enabled XGBoost build; newer
# XGBoost releases reportedly deprecate it in favour of tree_method='hist' with
# device='cuda' - confirm against the installed version.
base_learners = [
    ('rf', RandomForestRegressor(random_state=RANDOM_STATE)),
    ('xgb', XGBRegressor(tree_method='gpu_hist',
                         subsample=1,
                         n_estimators=500,
                         min_child_weight=5,
                         max_depth=10,
                         learning_rate=0.01,
                         reg_lambda=0.1,
                         colsample_bytree=0.5,
                         reg_alpha=0,
                         random_state=RANDOM_STATE)),
    ('knn', KNeighborsRegressor(metric='manhattan',
                                n_neighbors=5,
                                weights='distance')),
    ('lgbm', LGBMRegressor(random_state=RANDOM_STATE))
]

# Define the meta-model that combines the base learners' predictions.
# KNeighborsRegressor takes no random_state, so only the base learners are seeded.
meta_learner = KNeighborsRegressor(metric='manhattan',
                                   n_neighbors=5,
                                   weights='distance')

# Define the stacking model.
stacking_model = StackingRegressor(estimators=base_learners,
                                   final_estimator=meta_learner)

# Fit on the training split.
stacking_model.fit(X_train, y_train)

# Predict on both splits. The train-set predictions are made on the same data
# the model was fitted on, so the train metrics below are optimistic.
stacking_train_predictions = stacking_model.predict(X_train)
stacking_predictions = stacking_model.predict(X_test)

# Calculate and print R^2 score
stacking_train_r2 = r2_score(y_train, stacking_train_predictions)
stacking_test_r2 = r2_score(y_test, stacking_predictions)
print("Train R^2 score: ", stacking_train_r2)
print("Test R^2 score: ", stacking_test_r2)

# Calculate and print RMSE
stacking_rmse_train = np.sqrt(mean_squared_error(y_train, stacking_train_predictions))
stacking_rmse_test = np.sqrt(mean_squared_error(y_test, stacking_predictions))
print(f"Train RMSE: {stacking_rmse_train}")
print(f"Test RMSE: {stacking_rmse_test}")
