In [None]:
!pip install catboost

In [None]:
!pip install optuna

In [None]:
import pickle
import random
import numpy as np
random.seed(2025)
np.random.seed(2025)
import pandas as pd
import glob
import urllib.request
import optuna
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error

In [None]:
# Download the pickled community-detection results from GitHub into the
# working directory, then load them into `community_detection`.
url = "https://raw.githubusercontent.com/potentialreviewer/Optimal-SNA/main/data/Community_Detection.pkl"
file_name = "Community_Detection.pkl"

urllib.request.urlretrieve(url, file_name)

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only run this cell if the repository above is trusted.
# Open via `file_name` so the download target and the file that is read
# cannot drift apart if one of them is edited later.
with open(file_name, "rb") as f:
    community_detection = pickle.load(f)

In [None]:
# Permute the rows with a fixed seed and renumber the index from 0.
# NOTE(review): this rebinds `community_detection`, so re-running the cell
# shuffles the already-shuffled frame; restart from the load cell to reproduce.
community_detection = shuffle(community_detection, random_state=2025).reset_index(drop=True)

In [None]:
# Discretise the continuous target into 5 ordinal bins (strategy='uniform')
# so that the regression target can be used for stratified splitting.
kbd = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
community_detection['Bin'] = kbd.fit_transform(community_detection[['Modularity']]).astype(int).ravel()

# Model inputs; 'Bin' is deliberately left out (it is derived from the target).
features = ['Dataset', 'AKE Method', 'Zeta', 'Edge Measure', 'Algorithm', 'RI', 'Isolated Nodes', 'Edge Count']

X = community_detection[features]
y = community_detection['Modularity']
strata = community_detection['Bin']

# Hold out 20% of the rows as a test set, stratified on the target bins.
X_train, X_test, y_train, y_test, strata_train, strata_test = train_test_split(
    X, y, strata, test_size=0.2, stratify=strata, random_state=2025)

# Materialise the 5 stratified CV folds once so every Optuna trial is scored
# on exactly the same train/validation index pairs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2025)
folds = list(skf.split(X_train, strata_train))

# Columns handed to CatBoost as categorical features.
cat_features = ['Dataset', 'AKE Method', 'Edge Measure', 'Algorithm']

In [None]:
def objective(trial):
    """Score one Optuna trial with cross-validated CatBoost regression.

    Samples a hyperparameter set from `trial`, fits one CatBoostRegressor per
    entry in the notebook-level `folds` (using `X_train`, `y_train` and
    `cat_features`), and measures the RMSE on each validation fold. The
    validation fold is also passed to `fit` as `eval_set`.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        Mean of the per-fold RMSE values plus their standard deviation
        (population std, numpy default), so unstable configurations are
        penalised.
    """
    # The suggest_* calls are kept in their original order: with a seeded
    # sampler the order in which parameters are requested can affect the draws.
    learning_rate = trial.suggest_float("learning_rate", 0.003, 0.3, log=True)
    depth = trial.suggest_int("depth", 4, 10)
    l2_leaf_reg = trial.suggest_float("l2_leaf_reg", 0.3, 30, log=True)
    od_pval = trial.suggest_float("od_pval", 1e-10, 1e-2, log=True)
    random_strength = trial.suggest_float("random_strength", 0.1, 10.0, log=True)
    bootstrap_type = trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"])

    params = dict(
        iterations=1000,
        learning_rate=learning_rate,
        depth=depth,
        l2_leaf_reg=l2_leaf_reg,
        loss_function="RMSE",
        border_count=254,
        od_pval=od_pval,
        od_wait=50,
        od_type="IncToDec",
        random_seed=2025,
        use_best_model=True,
        verbose=0,
        random_strength=random_strength,
        eval_metric="RMSE",
        bootstrap_type=bootstrap_type,
    )

    # The sampling-control parameter depends on the bootstrap type:
    # Bayesian uses a bagging temperature, the other two a subsample rate.
    if bootstrap_type == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0.1, 10, log=True)
    elif bootstrap_type in ("Bernoulli", "MVS"):
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1.0, log=True)

    fold_scores = []
    for train_rows, valid_rows in folds:
        X_fit, y_fit = X_train.iloc[train_rows], y_train.iloc[train_rows]
        X_held, y_held = X_train.iloc[valid_rows], y_train.iloc[valid_rows]

        regressor = CatBoostRegressor(**params)
        regressor.fit(X_fit, y_fit,
                      eval_set=(X_held, y_held),
                      cat_features=cat_features)

        fold_mse = mean_squared_error(y_held, regressor.predict(X_held))
        fold_scores.append(np.sqrt(fold_mse))

    fold_scores = np.asarray(fold_scores)
    return fold_scores.mean() + fold_scores.std()

In [None]:
# Minimise objective() with a seeded TPE sampler. The search is capped at
# 200 trials and a timeout of 10000 (seconds, per the Optuna API).
study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=2025))
study.optimize(objective, n_trials=200, timeout=10000)

In [None]:
# Report the best trial found by the study.
# The study value is whatever objective() returns, i.e. the mean fold RMSE
# plus its standard deviation, so it is labelled as such rather than as a
# plain RMSE (which would understate what the number contains).
print("Best trial:")
print(f"  Objective (mean CV RMSE + std): {study.best_trial.value}")
print("  Parameters:")
for key, value in study.best_trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Combine the tuned hyperparameters with the fixed settings that objective()
# also used; only `verbose` differs (progress is logged every 100 iterations).
best_params = {
    **study.best_trial.params,
    "iterations": 1000,
    "loss_function": "RMSE",
    "border_count": 254,
    "od_wait": 50,
    "od_type": "IncToDec",
    "random_seed": 2025,
    "use_best_model": True,
    "verbose": 100,
    "eval_metric": "RMSE",
}

# Carve a stratified validation set (25% of the training rows) out of the
# training data; it is passed to fit() as the eval_set.
(X_model_train, X_model_val,
 y_model_train, y_model_val,
 strata_model_train, strata_model_val) = train_test_split(
    X_train, y_train, strata_train,
    test_size=0.25, stratify=strata_train, random_state=2025,
)

final_model = CatBoostRegressor(**best_params)
final_model.fit(
    X_model_train, y_model_train,
    eval_set=(X_model_val, y_model_val),
    cat_features=cat_features,
)

# Per-iteration metric history, used for the learning-curve plot.
eval_result = final_model.evals_result_

In [None]:
# Learning curve: RMSE per boosting iteration on the validation and training data.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(eval_result['validation']['RMSE'], label='Validation RMSE', color='red')
ax.plot(eval_result['learn']['RMSE'], label='Train RMSE', color='blue', linestyle='--')
ax.set(xlabel='Iteration', ylabel='RMSE', title='Learning Curve (Train v. Validation)')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Score the final model on the held-out test set: predict, compute the MSE,
# then take the square root to obtain the RMSE.
test_predicts = final_model.predict(X_test)
test_mse = mean_squared_error(y_test, test_predicts)
test_rmse = np.sqrt(test_mse)

In [None]:
# Show the held-out RMSE rounded to four decimal places.
print(f"Test RMSE: {test_rmse:.4f}")

In [None]:
# Build a SHAP tree explainer for the fitted model and compute SHAP values
# for the test-set rows; `shap_values` feeds all of the plots below.
explainer = shap.TreeExplainer(final_model)
shap_values = explainer(X_test)

In [None]:
# Beeswarm summary of the test-set SHAP values, drawn with the "cool" colour setting.
shap.plots.beeswarm(shap_values, color="cool")

In [None]:
# Bar-chart summary of the test-set SHAP values (per-feature importance view).
shap.plots.bar(shap_values)

In [None]:
# SHAP scatter (dependence) plots for three categorical features, drawn side
# by side and tagged (a)-(c).
plot_features = ["Edge Measure", "Algorithm", "AKE Method"]
subplot_labels = ["(a)", "(b)", "(c)"]

fig, axes = plt.subplots(1, 3, figsize=(12, 4))
axes = axes.flatten()

for panel, feature, panel_label in zip(axes, plot_features, subplot_labels):
    # Passing the full Explanation as `color` leaves the choice of colouring
    # feature to SHAP -- presumably the strongest interaction; confirm in the docs.
    shap.plots.scatter(
        shap_values[:, feature],
        color=shap_values,
        show=False,  # defer rendering until all three panels are drawn
        cmap='cool',
        ax=panel
    )
    panel.set_title(f"{panel_label}", fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Per-row SHAP interaction values for the test set, then a global summary:
# the mean of the absolute values along axis 0.
# NOTE(review): axis 0 is presumably the sample axis, giving a
# (n_features, n_features) matrix -- confirm the returned shape.
local_shap_interaction_values = explainer.shap_interaction_values(X_test)
global_shap_interaction_values = np.mean(np.abs(local_shap_interaction_values), axis=0)

In [None]:
# Annotated heatmap of the averaged absolute SHAP interaction values, with
# the feature (column) names on both axes.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    global_shap_interaction_values,
    xticklabels=X_test.columns,
    yticklabels=X_test.columns,
    annot=True,
    fmt=".3f",
    cmap="cool",
    annot_kws={"size": 12},
    cbar_kws={"shrink": 0.8, "aspect": 20},
    ax=ax,
)

# Tilt and enlarge the tick labels on both axes so long feature names stay readable.
plt.setp(ax.get_xticklabels(), rotation=50, ha='right', fontsize=14)
plt.setp(ax.get_yticklabels(), rotation=50, ha='right', fontsize=14)
ax.set_title("Heatmap of Global SHAP Interaction Values", fontsize=18, pad=20)

fig.tight_layout()
plt.show()