In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from skopt import gp_minimize
from skopt.space import Real, Integer
from sklearn.preprocessing import MinMaxScaler

# Create an output directory to store results
output_dir = "Excel_Polynomial_Results"
os.makedirs(output_dir, exist_ok=True)

# Define the feature column labels for PIMs descriptors
FEATURE_COLUMNS = [
    "C", "H", "O", "N", "F", "S", "System_Size", "a", "b", "c",
    "alpha", "beta", "gamma", "density", "PLD", "LCD", "N2_SA",
    "Probe_Accessible", "Probe_Occupiable", "Rosenbluth_Weight"
]

# Load and preprocess the PIMs descriptors
data = pd.read_csv("PIM_ExpFeatures.csv")
features = data[FEATURE_COLUMNS].values

# Normalize PIMs descriptors
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

# Load the Qst labels
labels_data = pd.read_csv("PIM_Qst_Labels.csv")
labels = labels_data["Qst_CO2_298K"].values  # Extract Qst labels

# Define hyperparameter search space for Bayesian Optimization (Polynomial Kernel)
space = [
    Real(1e-6, 1e2, "log-uniform", name="alpha"),  # Regularization parameter
    Integer(1, 5, name="degree"),  # Polynomial degree
    Real(1e-6, 1e2, "log-uniform", name="coef0")  # Coefficient term in polynomial kernel
]

# Objective function for Bayesian Optimization
def objective(params):
    alpha, degree, coef0 = params
    testing_r2_scores = []

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for train_idx, test_idx in kf.split(features):
        X_train, X_test = features[train_idx], features[test_idx]
        y_train, y_test = labels[train_idx], labels[test_idx]

        model = KernelRidge(kernel="poly", alpha=alpha, degree=degree, coef0=coef0)
        model.fit(X_train, y_train)
        test_predictions = model.predict(X_test)
        testing_r2_scores.append(r2_score(y_test, test_predictions))

    return -np.mean(testing_r2_scores)  # Negative because skopt minimizes

# Run Bayesian Optimization
result = gp_minimize(
    objective,
    space,
    n_calls=500,
    n_initial_points=50,
    random_state=42,
    verbose=True  # Display progress
)

# Extract the best hyperparameters
best_alpha, best_degree, best_coef0 = result.x

# Save best hyperparameters to a CSV file
best_params = pd.DataFrame({"Alpha": [best_alpha], "Degree": [best_degree], "Coef0": [best_coef0]})
#best_params.to_csv(os.path.join(output_dir, "Excel_Polynomial_best_hyperparameters.csv"), index=False)

# Evaluate the best model
kf = KFold(n_splits=5, shuffle=True, random_state=42)
training_r2_scores = []
testing_r2_scores = []
all_train_actuals, all_train_preds = [], []
all_test_actuals, all_test_preds = [], []

for train_idx, test_idx in kf.split(features):
    X_train, X_test = features[train_idx], features[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    model = KernelRidge(kernel="poly", alpha=best_alpha, degree=best_degree, coef0=best_coef0)
    model.fit(X_train, y_train)

    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    all_train_actuals.extend(y_train)
    all_train_preds.extend(train_predictions)
    all_test_actuals.extend(y_test)
    all_test_preds.extend(test_predictions)

    training_r2_scores.append(r2_score(y_train, train_predictions))
    testing_r2_scores.append(r2_score(y_test, test_predictions))

# Save cross-validation R² scores
#cv_results = pd.DataFrame({
#    "Fold": range(1, len(training_r2_scores) + 1),
#    "Training R^2": training_r2_scores,
#    "Testing R^2": testing_r2_scores
#})
#cv_results.to_csv(os.path.join(output_dir, "Excel_Polynomial_cv_results.csv"), index=False)

# Compute average R²
average_training_r2 = np.mean(training_r2_scores)
average_testing_r2 = np.mean(testing_r2_scores)

# Generate parity plots
#for dataset, actuals, preds, avg_r2, filename in zip(
#    ["Training", "Testing"],
#    [all_train_actuals, all_test_actuals],
#    [all_train_preds, all_test_preds],
#    [average_training_r2, average_testing_r2],
#    ["Excel_Polynomial_training_parity_plot.png", "Excel_Polynomial_testing_parity_plot.png"]
#):
#    plt.figure()
#    plt.scatter(actuals, preds, alpha=0.7, label=f"{dataset} Data")
#    plt.plot([min(actuals), max(actuals)], [min(actuals), max(actuals)], 'k--', label="Perfect Prediction")
#    plt.xlabel("Actual Values")
#    plt.ylabel("Predicted Values")
#    plt.title(f"{dataset} Parity Plot ($R^2 = {avg_r2:.3f}$)")
#    plt.legend()
#    plt.grid()
#    plt.savefig(os.path.join(output_dir, filename))
#    plt.close()

#print(f"Results saved in {output_dir}")


Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.0058
Function value obtained: -0.3601
Current minimum: -0.3601
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.0026
Function value obtained: 75.2956
Current minimum: -0.3601
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.0019
Function value obtained: 5.4614
Current minimum: -0.3601
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.0021
Function value obtained: -0.7222
Current minimum: -0.7222
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.0020
Function value obtained: -0.0175
Current minimum: -0.7222
Iteration No: 6 started. E

In [2]:
# Final parity plot with both training and testing results
plt.figure(figsize=(6, 6))
plt.rcParams.update({'font.family': 'Times New Roman'})

# Scatter points
plt.scatter(all_train_actuals, all_train_preds, color='blue', alpha=0.7, edgecolor='k', label=f"Training ($R^2$ = {np.mean(training_r2_scores):.3f})")
plt.scatter(all_test_actuals, all_test_preds, color='red', alpha=0.7, edgecolor='k', label=f"Testing ($R^2$ = {np.mean(testing_r2_scores):.3f})")

# Diagonal reference line
min_val = min(min(all_train_actuals), min(all_test_actuals))
max_val = max(max(all_train_actuals), max(all_test_actuals))
plt.plot([min_val, max_val], [min_val, max_val], 'k--', linewidth=1)

# Labels and title
plt.xlabel("Actual Qst", fontsize=14)
plt.ylabel("Predicted Qst", fontsize=14)
plt.title("Polynomial KRR Prediction Results", fontsize=14)

# Styling
plt.xticks(fontsize=12, color='black')
plt.yticks(fontsize=12, color='black')
plt.grid(True, linestyle='--', linewidth=0.5)
plt.legend(loc='upper left', fontsize=12, frameon=False)
plt.gca().set_aspect('equal', adjustable='box')
plt.tight_layout()

# Save as PDF
plt.savefig(os.path.join(output_dir, "Poly_combined_parity_plot.pdf"), format='pdf')
plt.close()
