In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from skopt import gp_minimize
from skopt.space import Integer, Real
from sklearn.preprocessing import MinMaxScaler
import shap

# Output directory: every CSV/figure written by this notebook goes here.
output_dir = "Excel_CatBoost_Results"
os.makedirs(output_dir, exist_ok=True)

# Load feature data
# Columns of the feature CSV that are used as model inputs. The order here is
# the column order of the `features` matrix.
FEATURE_COLUMNS = [
    "C", "H", "O", "N", "F", "S", "System_Size", "a", "b", "c",
    "alpha", "beta", "gamma", "density", "PLD", "LCD", "N2_SA",
    "Probe_Accessible", "Probe_Occupiable", "Rosenbluth_Weight"
]
data = pd.read_csv("PIM_ExpFeatures.csv")

# Normalize descriptors (MinMaxScaler with its default output range).
# NOTE(review): the scaler is fit on all rows before the K-fold split further
# down, so held-out rows contribute to the min/max used for scaling. Consider
# fitting the scaler inside each fold if a scale-sensitive model is ever used.
scaler = MinMaxScaler()
features = scaler.fit_transform(data[FEATURE_COLUMNS].values)

# Load target values
labels = pd.read_csv("PIM_Qst_Labels.csv")["Qst_CO2_298K"].values

# Features and labels come from two separate files and are paired purely by
# row position, so a row-count mismatch would either silently drop labels or
# fail later with an IndexError inside the CV loop. Fail early instead.
if len(features) != len(labels):
    raise ValueError(
        f"Row count mismatch: {len(features)} feature rows in PIM_ExpFeatures.csv "
        f"vs {len(labels)} label rows in PIM_Qst_Labels.csv"
    )

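# Define Bayesian Optimization search space
# NOTE: everything from here through the "Extract best parameters" section is
# commented out and does not run. The hyperparameters hard-coded in the final
# model below fall inside this search space and presumably came from an earlier
# run of this search (TODO confirm).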
#space = [
#    Integer(100, 500, name="iterations"),
#    Integer(3, 10, name="depth"),
#    Real(0.01, 0.3, name="learning_rate"),
#    Real(1, 10, name="l2_leaf_reg")
#]

# Objective function
#def objective(params):
#    iterations, depth, learning_rate, l2_leaf_reg = params
#    r2_scores = []

#    kf = KFold(n_splits=5, shuffle=True, random_state=42)
#    for train_idx, test_idx in kf.split(features):
#        X_train, X_test = features[train_idx], features[test_idx]
#        y_train, y_test = labels[train_idx], labels[test_idx]
#
#        model = CatBoostRegressor(
#            iterations=iterations,
#            depth=depth,
#            learning_rate=learning_rate,
#            l2_leaf_reg=l2_leaf_reg,
#            loss_function="RMSE",
#            verbose=0,
#            random_state=42
#        )
#        model.fit(X_train, y_train)
#        preds = model.predict(X_test)
#        r2_scores.append(r2_score(y_test, preds))

#    return -np.mean(r2_scores)

# Run Bayesian optimization
#result = gp_minimize(
#    objective,
#    space,
#    n_calls=500,
#    n_initial_points=50,
#    random_state=42,
#    verbose=True
#)

# Extract best parameters
#best_iterations, best_depth, best_learning_rate, best_l2 = result.x
#best_params = pd.DataFrame({
#    "iterations": [best_iterations],
#    "depth": [best_depth],
#    "learning_rate": [best_learning_rate],
#    "l2_leaf_reg": [best_l2]
#})
#print(best_params)
#best_params.to_csv(os.path.join(output_dir, "CatBoost_best_hyperparameters.csv"), index=False)

# Evaluate final model with 5-fold CV
# Each fold trains a fresh CatBoost model with fixed hyperparameters and
# records R^2 plus the raw actual/predicted values for both the training and
# the held-out part of the fold.
N_SPLITS = 5

# Loop-invariant hyperparameters, defined once so every fold uses the same set.
cv_model_params = {
    "iterations": 214,
    "depth": 4,
    "learning_rate": 0.15229589,
    "l2_leaf_reg": 1,
    "loss_function": "RMSE",
    "verbose": 0,
    "random_state": 42,
}

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
training_r2, testing_r2 = [], []
train_actuals, train_preds = [], []
test_actuals, test_preds = [], []

for train_idx, test_idx in kf.split(features):
    X_train, X_test = features[train_idx], features[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    model = CatBoostRegressor(**cv_model_params)
    model.fit(X_train, y_train)

    train_p = model.predict(X_train)
    test_p = model.predict(X_test)

    training_r2.append(r2_score(y_train, train_p))
    testing_r2.append(r2_score(y_test, test_p))

    # Pooled across folds: a given sample lands in the training lists once for
    # every fold in which it was not held out, and in the test lists once.
    train_actuals.extend(y_train)
    train_preds.extend(train_p)
    test_actuals.extend(y_test)
    test_preds.extend(test_p)

# Save CV results
# The fold index is derived from the number of collected scores rather than a
# literal range, so changing N_SPLITS cannot cause a length mismatch here.
cv_results = pd.DataFrame({
    "Fold": range(1, len(testing_r2) + 1),
    "Training R^2": training_r2,
    "Testing R^2": testing_r2
})
print(cv_results)
#cv_results.to_csv(os.path.join(output_dir, "CatBoost_cv_results.csv"), index=False)

  from .autonotebook import tqdm as notebook_tqdm


   Fold  Training R^2  Testing R^2
0     1      0.999992     0.944481
1     2      0.999996     0.907295
2     3      0.999995     0.930099
3     4      0.999994     0.896466
4     5      0.999995     0.780388


In [2]:
# Refit CatBoost on the full dataset (no hold-out) with the fixed hyperparameters
final_model_params = {
    "iterations": 214,
    "depth": 4,
    "learning_rate": 0.15229589,
    "l2_leaf_reg": 1,
    "loss_function": "RMSE",
    "verbose": 0,
    "random_state": 42,
}
final_model = CatBoostRegressor(**final_model_params)
final_model.fit(features, labels)

# SHAP analysis: explain the fitted model on the same rows it was trained on
explainer = shap.Explainer(final_model)
shap_values = explainer(features)

# Per-sample SHAP table: one column per feature plus a trailing Sample_Index
# column holding the 0-based row position.
shap_df = pd.DataFrame(shap_values.values, columns=FEATURE_COLUMNS).assign(
    Sample_Index=lambda frame: np.arange(len(frame))
)
shap_values_path = os.path.join(output_dir, "CatBoost_SHAP_values.csv")
shap_df.to_csv(shap_values_path, index=False)

# Feature ranking: mean |SHAP| per feature, largest first
mean_abs_shap = (
    shap_df[FEATURE_COLUMNS]
    .abs()
    .mean()
    .sort_values(ascending=False)
)
mean_abs_shap_df = (
    mean_abs_shap
    .rename_axis("Feature")
    .reset_index(name="Mean_Absolute_SHAP")
)
shap_ranking_csv = os.path.join(output_dir, "CatBoost_SHAP_feature_ranking.csv")
mean_abs_shap_df.to_csv(shap_ranking_csv, index=False)

print("✅ SHAP analysis completed and saved.")

✅ SHAP analysis completed and saved.


In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

# Set global font to Times New Roman and base font size to 14
# (rcParams are process-wide, so this also affects figures made in later cells)
mpl.rcParams['font.family'] = 'Times New Roman'
mpl.rcParams['font.size'] = 14

# Load SHAP feature ranking (the CSV written by the SHAP analysis cell)
shap_ranking_path = os.path.join(output_dir, "CatBoost_SHAP_feature_ranking.csv")
shap_ranking = pd.read_csv(shap_ranking_path)

# Plot: Top 10 SHAP features (the first 10 rows of the ranking file)
fig, ax = plt.subplots(figsize=(8, 6))
barplot = sns.barplot(
    data=shap_ranking.head(10),
    x="Mean_Absolute_SHAP",
    y="Feature",
    # Passing `palette` without `hue` is deprecated in seaborn; assigning the
    # y variable to `hue` with the legend disabled gives the same colouring.
    hue="Feature",
    orient="h",
    palette="viridis",
    legend=False,
    ax=ax,
)

# Adjust axis labels and tick label sizes
barplot.set_title("Top 10 Feature Importances by SHAP (CatBoost)", fontsize=14)
barplot.set_xlabel("Mean Absolute SHAP Value", fontsize=14)
barplot.set_ylabel("Feature", fontsize=14)
barplot.tick_params(axis='x', labelsize=12)
barplot.tick_params(axis='y', labelsize=14)
ax.grid(True, axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()

# Save the figure, then close it so it is not kept open in memory
plot_path = os.path.join(output_dir, "CatBoost_SHAP_feature_importance_plot.png")
fig.savefig(plot_path, dpi=300)
plt.close(fig)

print(f"✅ SHAP feature importance plot saved to: {plot_path}")


✅ SHAP feature importance plot saved to: Excel_CatBoost_Results/CatBoost_SHAP_feature_importance_plot.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  barplot = sns.barplot(


In [4]:
# Parity Plots
# Predicted vs. actual values pooled over all CV folds, training and held-out
# points overlaid, with the y = x line as reference.
plt.rcParams.update({'font.family': 'Times New Roman'})
fig, ax = plt.subplots(figsize=(6, 6))

# Scatter points (legend reports the mean per-fold R^2 for each set)
ax.scatter(train_actuals, train_preds, color='blue', alpha=0.7, edgecolor='k', label=f"Training ($R^2$ = {np.mean(training_r2):.3f})")
ax.scatter(test_actuals, test_preds, color='red', alpha=0.7, edgecolor='k', label=f"Testing ($R^2$ = {np.mean(testing_r2):.3f})")

# Diagonal reference line
# Span both actual and predicted values so the line covers every plotted point,
# including predictions that fall outside the range of the actual values.
plotted_values = np.concatenate([train_actuals, train_preds, test_actuals, test_preds])
min_val = plotted_values.min()
max_val = plotted_values.max()
ax.plot([min_val, max_val], [min_val, max_val], 'k--', linewidth=1)

# Labels and title
ax.set_xlabel("Actual Qst", fontsize=14)
ax.set_ylabel("Predicted Qst", fontsize=14)
ax.set_title("CatBoost Prediction Results", fontsize=14)

# Styling
# tick_params styles tick labels regardless of when the ticks are generated.
ax.tick_params(axis='both', labelsize=12, labelcolor='black')
ax.grid(True, linestyle='--', linewidth=0.5)
ax.legend(loc='upper left', fontsize=12, frameon=False)
ax.set_aspect('equal', adjustable='box')  # equal data scaling on both axes
fig.tight_layout()

# Save as PDF
fig.savefig(os.path.join(output_dir, "CatBoost_combined_parity_plot.pdf"), format='pdf')
plt.close(fig)
