In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

from xgboost import XGBRegressor
from sklearn.linear_model import Lasso
from sklearn.kernel_ridge import KernelRidge

from skopt import gp_minimize
from skopt.space import Real

import os
# NOTE: CPython reads PYTHONHASHSEED once, at interpreter start-up. Assigning it
# here therefore does not alter hash randomisation of the already-running
# kernel; the value is only inherited by processes launched after this line.
os.environ['PYTHONHASHSEED'] = '42'

# --- Output location ---------------------------------------------------------
# Every CSV/PDF artefact written by this notebook goes into this directory.
output_dir = "TEST_Excel_Ensemble_Weight_Opt_Results"
os.makedirs(output_dir, exist_ok=True)

# --- Data --------------------------------------------------------------------
# Descriptor columns selected from the feature table, in model-input order.
FEATURE_COLUMNS = [
    "C", "H", "O", "N", "F", "S",
    "System_Size",
    "a", "b", "c", "alpha", "beta", "gamma",
    "density", "PLD", "LCD", "N2_SA",
    "Probe_Accessible", "Probe_Occupiable", "Rosenbluth_Weight",
]
feature_table = pd.read_csv("PIM_ExpFeatures.csv")
raw_features = feature_table[FEATURE_COLUMNS].values
labels = pd.read_csv("PIM_Qst_Labels.csv")["Qst_CO2_298K"].values

# Min-max scale each descriptor column.
# NOTE(review): the scaler is fitted on all rows, i.e. before the K-fold splits
# used further down, so held-out rows influence the scaling -- confirm intended.
scaler = MinMaxScaler()
features = scaler.fit_transform(raw_features)

# --- Base models ---------------------------------------------------------------
# Hyperparameters are fixed to the values found by an earlier Bayesian search.
XGB_BEST_PARAMS = dict(
    n_estimators=109,
    max_depth=18,
    learning_rate=0.179388327,
    subsample=0.5,
    colsample_bytree=1,
    objective="reg:squarederror",
    n_jobs=1,
    tree_method="exact",
    verbosity=0,
    random_state=42,
    enable_categorical=False,
)
xgb_model = XGBRegressor(**XGB_BEST_PARAMS)

lasso_model = Lasso(alpha=0.000678645)

krr_model = KernelRidge(kernel="rbf", alpha=1.0e-6, gamma=0.000483072)

# --- Search space ----------------------------------------------------------------
# Two free ensemble weights, each sampled from [0, 1]. The space itself does NOT
# restrict w2 to [0, 1 - w1]; the constraint w1 + w2 <= 1 is enforced by the
# penalty inside objective(), and the third weight is implied as 1 - w1 - w2.
space = [Real(0.0, 1.0, name=weight_name) for weight_name in ("w1", "w2")]

# Objective function
# Held-out predictions of the three base models, one tuple per CV fold.
# Filled lazily by the first feasible objective() call. Re-running this cell
# rebinds the list to empty, so changed data or models are picked up again.
_FOLD_PREDICTION_CACHE = []


def _fit_fold_predictions():
    """Fit the three base models once per CV fold and collect test-fold predictions.

    Uses the same 5-fold shuffled split (random_state=42) for every call.

    Returns:
        list of ``(y_test, xgb_pred, lasso_pred, krr_pred)`` tuples, one per fold.
    """
    fold_predictions = []
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for train_idx, test_idx in kf.split(features):
        X_train, X_test = features[train_idx], features[test_idx]
        y_train, y_test = labels[train_idx], labels[test_idx]

        xgb_model.fit(X_train, y_train)
        lasso_model.fit(X_train, y_train)
        krr_model.fit(X_train, y_train)

        fold_predictions.append((
            y_test,
            xgb_model.predict(X_test),
            lasso_model.predict(X_test),
            krr_model.predict(X_test),
        ))
    return fold_predictions


def objective(params):
    """Negative mean 5-fold R^2 of the weighted ensemble (to be minimised).

    Args:
        params: two-element sequence ``(w1, w2)`` -- the XGBoost and Lasso
            weights. The KRR weight is implied as ``w3 = 1 - w1 - w2``.

    Returns:
        ``1e6`` when ``w1 + w2 > 1`` (infeasible: w3 would be negative),
        otherwise ``-mean(R^2)`` over the held-out folds.
    """
    w1, w2 = params
    if w1 + w2 > 1.0:
        # NOTE(review): this penalty is many orders of magnitude above feasible
        # objective values and may dominate the GP surrogate's fit. It is kept
        # unchanged here because result.x is consumed as (w1, w2) downstream;
        # consider a smaller penalty or a true reparameterisation.
        return 1e6  # Penalize infeasible region

    w3 = 1.0 - w1 - w2

    # The base-model fits depend only on the data split, never on the weights,
    # so they are computed once and re-blended on every call instead of being
    # refitted 5 x 3 times per evaluation. This assumes each refit is
    # deterministic (XGBoost is seeded with random_state=42).
    if not _FOLD_PREDICTION_CACHE:
        _FOLD_PREDICTION_CACHE.extend(_fit_fold_predictions())

    r2_scores = [
        r2_score(y_test, w1 * xgb_pred + w2 * lasso_pred + w3 * krr_pred)
        for y_test, xgb_pred, lasso_pred, krr_pred in _FOLD_PREDICTION_CACHE
    ]

    return -np.mean(r2_scores)  # Minimize negative R²

# Gaussian-process Bayesian optimisation over the two free weights (w1, w2).
gp_settings = dict(
    n_calls=1500,
    n_initial_points=250,
    random_state=42,
    verbose=True,
)
result = gp_minimize(func=objective, dimensions=space, **gp_settings)

# result.x holds the best (w1, w2) found; the third weight closes the sum to 1.
best_w1 = result.x[0]
best_w2 = result.x[1]
best_w3 = 1.0 - best_w1 - best_w2

# Persist the weights as a single-row CSV, one column per base model.
best_weights = dict(zip(("XGBoost", "Lasso", "RBF_KRR"), (best_w1, best_w2, best_w3)))
weights_csv_path = os.path.join(output_dir, "ensemble_best_weights.csv")
pd.DataFrame([best_weights]).to_csv(weights_csv_path, index=False)

# Re-run the same 5-fold split with the optimised weights and record per-fold
# R^2 plus the pooled actual/predicted values used by the parity plot.
def _ensemble_predict(X):
    """Blend the three fitted base models with the optimised weights."""
    return (
        best_w1 * xgb_model.predict(X) +
        best_w2 * lasso_model.predict(X) +
        best_w3 * krr_model.predict(X)
    )


train_r2s, test_r2s = [], []
all_train_actuals, all_train_preds = [], []
all_test_actuals, all_test_preds = [], []

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, test_idx in kf.split(features):
    X_train, y_train = features[train_idx], labels[train_idx]
    X_test, y_test = features[test_idx], labels[test_idx]

    # Refit every base model on this fold's training portion.
    for base_model in (xgb_model, lasso_model, krr_model):
        base_model.fit(X_train, y_train)

    train_preds = _ensemble_predict(X_train)
    test_preds = _ensemble_predict(X_test)

    # Per-fold scores.
    train_r2s.append(r2_score(y_train, train_preds))
    test_r2s.append(r2_score(y_test, test_preds))

    # Pooled values; each row appears in several folds' training portions.
    all_train_actuals.extend(y_train)
    all_train_preds.extend(train_preds)
    all_test_actuals.extend(y_test)
    all_test_preds.extend(test_preds)

# One row per fold, numbered from 1.
cv_df = pd.DataFrame({
    "Fold": range(1, len(test_r2s) + 1),
    "Training R^2": train_r2s,
    "Testing R^2": test_r2s,
})
cv_results_path = os.path.join(output_dir, "ensemble_cv_results.csv")
cv_df.to_csv(cv_results_path, index=False)

# Parity plots
#for label, actual, pred, r2, fname in zip(
#    ["Training", "Testing"],
#    [all_train_actuals, all_test_actuals],
#    [all_train_preds, all_test_preds],
#    [np.mean(train_r2s), np.mean(test_r2s)],
#    ["ensemble_training_parity_plot.png", "ensemble_testing_parity_plot.png"]
#):
#    plt.figure()
#    plt.scatter(actual, pred, alpha=0.7)
#    plt.plot([min(actual), max(actual)], [min(actual), max(actual)], 'k--')
#    plt.xlabel("Actual")
#    plt.ylabel("Predicted")
#    plt.title(f"{label} Parity Plot ($R^2 = {r2:.3f}$)")
#    plt.grid()
#    plt.savefig(os.path.join(output_dir, fname))
#    plt.close()

#print(f"✅ Ensemble optimization results saved in {output_dir}")


Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.0838
Function value obtained: -0.9292
Current minimum: -0.9292
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.0001
Function value obtained: 1000000.0000
Current minimum: -0.9292
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.0818
Function value obtained: -0.9545
Current minimum: -0.9545
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.0835
Function value obtained: -0.9524
Current minimum: -0.9545
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.0840
Function value obtained: -0.9606
Current minimum: -0.9606
Iteration No: 6 star

In [None]:
# Combined parity plot: pooled training and held-out predictions on one set of axes.
plt.figure(figsize=(6, 6))
# Set the font before any axes/text are created so all labels pick it up.
plt.rcParams.update({'font.family': 'Times New Roman'})

# Legend entries report the mean per-fold R^2 of each subset.
mean_train_r2 = np.mean(train_r2s)
mean_test_r2 = np.mean(test_r2s)
point_style = dict(alpha=0.7, edgecolor='k')

plt.scatter(
    all_train_actuals, all_train_preds,
    color='blue', label=f"Training ($R^2$ = {mean_train_r2:.3f})", **point_style
)
plt.scatter(
    all_test_actuals, all_test_preds,
    color='red', label=f"Testing ($R^2$ = {mean_test_r2:.3f})", **point_style
)

# y = x reference line spanning the full range of actual values.
min_val = min(map(min, (all_train_actuals, all_test_actuals)))
max_val = max(map(max, (all_train_actuals, all_test_actuals)))
diagonal = [min_val, max_val]
plt.plot(diagonal, diagonal, 'k--', linewidth=1)

# Axis labels and title.
plt.xlabel("Actual Qst", fontsize=14)
plt.ylabel("Predicted Qst", fontsize=14)
plt.title("Weighted Ensemble Prediction Results", fontsize=14)

# Tick, grid and legend styling; equal aspect keeps the diagonal at 45 degrees.
for style_ticks in (plt.xticks, plt.yticks):
    style_ticks(fontsize=12, color='black')
plt.grid(True, linestyle='--', linewidth=0.5)
plt.legend(loc='upper left', fontsize=12, frameon=False)
plt.gca().set_aspect('equal', adjustable='box')
plt.tight_layout()

# Write the figure as a PDF and release it.
parity_pdf_path = os.path.join(output_dir, "Ensemble_combined_parity_plot.pdf")
plt.savefig(parity_pdf_path, format='pdf')
plt.close()
