In [None]:
!pip install igraph

In [None]:
!pip install optuna

In [3]:
import random
import numpy as np
# Seed the global RNGs of Python's `random` module and of NumPy with the same fixed value.
random.seed(2025)
np.random.seed(2025)
import pandas as pd
import pickle
import glob
import networkx as nx
import igraph as ig
import optuna
# Only emit Optuna log messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [4]:
def leiden(G, seed=2025, resolution=1.0, weight="weight", objective="modularity", beta=0.1, n_iterations=200):
    """Run igraph's Leiden community detection on a NetworkX-style graph.

    The graph is converted to an ``igraph.Graph`` (nodes are sorted and mapped to
    consecutive integer indices), clustered with ``community_leiden`` and the
    result is mapped back to the original node labels.

    Parameters
    ----------
    G : graph exposing ``nodes()`` and ``edges(data=True)``
        Node labels must be mutually sortable.
    seed : int
        Value passed to ``random.seed`` before clustering.
        NOTE(review): this presumes igraph draws from Python's ``random``
        module (its documented default) - confirm if a custom RNG is installed.
    resolution : float
        Forwarded to ``community_leiden`` as ``resolution``.
    weight : str or None
        Name of the edge attribute holding the edge weight. Edges lacking the
        attribute get weight 1 (the NetworkX convention). ``None`` runs the
        algorithm unweighted.
    objective : str
        Forwarded to ``community_leiden`` as ``objective_function``.
    beta : float
        Forwarded to ``community_leiden`` as ``beta``.
    n_iterations : int
        Forwarded to ``community_leiden`` as ``n_iterations``.

    Returns
    -------
    list of set
        One set of original node labels per detected community.
    """
    random.seed(seed)

    nodes = sorted(G.nodes())
    node_to_index = {node: index for index, node in enumerate(nodes)}

    # Build the igraph edge list and the parallel weight list in a single pass
    # so both are guaranteed to be in the same edge order.
    edge_list = []
    edge_weights = None if weight is None else []
    for u, v, data in G.edges(data=True):
        edge_list.append((node_to_index[u], node_to_index[v]))
        if edge_weights is not None:
            # Honour the requested attribute name instead of a hard-coded "weight".
            edge_weights.append(data.get(weight, 1))

    g = ig.Graph(n=len(nodes), edges=edge_list)
    if edge_weights is not None:
        g.es["weight"] = edge_weights

    partition = g.community_leiden(
        objective_function=objective,
        weights=edge_weights,
        resolution=resolution,
        beta=beta,
        n_iterations=n_iterations,
    )

    # Translate igraph vertex indices back to the caller's node labels.
    return [{nodes[v] for v in community} for community in partition]

In [5]:
def tune_algorithm(G, algorithm, seed=2025, n_trials=50):
    """Return the best weighted modularity an algorithm reaches on ``G``.

    Only Leiden has a tunable parameter here (``beta``, searched with Optuna on
    a log scale in [0.1, 10]). CNM, Louvain and FLPA are called with fixed
    arguments, so they are evaluated exactly once instead of ``n_trials``
    identical times; with a fixed ``seed`` they are expected to return the same
    partition on every call, so the reported score is unchanged.

    Parameters
    ----------
    G : networkx graph
        Graph whose edges carry a ``"weight"`` attribute.
    algorithm : str
        One of ``"CNM"``, ``"Louvain"``, ``"Leiden"``, ``"FLPA"``.
    seed : int
        Seed for the stochastic algorithms and for the Optuna sampler.
    n_trials : int
        Number of Optuna trials; only used for ``"Leiden"``.

    Returns
    -------
    tuple (float, dict)
        Best modularity and the parameters that produced it (empty dict for
        algorithms without tunable parameters).

    Raises
    ------
    ValueError
        If ``algorithm`` is not one of the supported names.
    """
    supported = ("CNM", "Louvain", "Leiden", "FLPA")
    if algorithm not in supported:
        # Fail early with a clear message; otherwise every trial would fail
        # silently and reading the best trial would raise an obscure error.
        raise ValueError(f"Unknown algorithm {algorithm!r}; expected one of {supported}")

    def score(communities):
        return nx.algorithms.community.modularity(G, list(communities), weight="weight")

    # Parameter-free algorithms: a single evaluation is sufficient.
    if algorithm == "CNM":
        return score(nx.algorithms.community.greedy_modularity_communities(G, weight="weight")), {}
    if algorithm == "Louvain":
        return score(nx.algorithms.community.louvain_communities(G, seed=seed, weight="weight")), {}
    if algorithm == "FLPA":
        return score(nx.algorithms.community.fast_label_propagation_communities(G, seed=seed, weight="weight")), {}

    def objective(trial):
        beta = trial.suggest_float("beta", 0.1, 10, log=True)
        return score(leiden(G, seed=seed, weight="weight", objective="modularity", beta=beta, n_iterations=200))

    # Seed the sampler so the sequence of suggested beta values is reproducible.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials)
    best_trial = study.best_trial
    return best_trial.value, best_trial.params

In [6]:
def evaluate_graph_dict_optuna(graph_dict, dataset_name, seed=2025, n_trials=50):
    """Score every graph of one dataset with every community-detection algorithm.

    ``graph_dict[dataset_name]`` is walked as
    ``{ake_method: {zeta: {edge_method: graph}}}``. Graphs whose edge method is
    "Euclidean", "Manhattan" or "DMD" are skipped. For each remaining graph,
    ``tune_algorithm`` is called for CNM, Louvain, Leiden and FLPA.

    Parameters
    ----------
    graph_dict : dict
        Nested mapping described above, keyed by dataset name at the top level.
    dataset_name : str
        Key of the dataset to evaluate.
    seed : int
        Forwarded to ``tune_algorithm``.
    n_trials : int
        Forwarded to ``tune_algorithm``.

    Returns
    -------
    pandas.DataFrame
        One row per (AKE method, zeta, edge measure, algorithm) with the best
        modularity (negative values clamped to 0) and the best parameters as a
        list of ``"name=value"`` strings. The columns are always present, even
        when no graph was evaluated.
    """
    columns = ["Dataset", "AKE Method", "Zeta", "Edge Measure", "Algorithm", "Modularity", "Parameters"]
    excluded_methods = {"Euclidean", "Manhattan", "DMD"}
    included_methods = ["CNM", "Louvain", "Leiden", "FLPA"]

    records = []
    for ake_method, graphs_by_zeta in graph_dict[dataset_name].items():
        for zeta, graphs_by_edge_method in graphs_by_zeta.items():
            for edge_method, G in graphs_by_edge_method.items():
                if edge_method in excluded_methods:
                    continue

                for algorithm in included_methods:
                    best_score, best_params = tune_algorithm(G, algorithm, seed=seed, n_trials=n_trials)
                    parameters = [f"{k}={v}" for k, v in best_params.items()]

                    records.append({
                        "Dataset": dataset_name,
                        "AKE Method": ake_method,
                        "Zeta": zeta,
                        "Edge Measure": edge_method,
                        "Algorithm": algorithm,
                        "Modularity": best_score,
                        "Parameters": parameters
                    })

    # Passing the column list keeps the schema stable when `records` is empty,
    # which previously produced a column-less frame and a KeyError below.
    df = pd.DataFrame(records, columns=columns)

    if not df.empty:
        # Negative modularity is reported as 0.
        modularity_values = df["Modularity"].values
        df["Modularity"] = np.where(modularity_values < 0, 0, modularity_values)

    return df

In [7]:
import urllib.request

In [None]:
# Remote folder that hosts the pickled inputs of this notebook.
base_url = "https://raw.githubusercontent.com/potentialreviewer/Optimal-SNA/main/data/"

# Graph-dictionary pickles to fetch, one per dataset.
files = [
    "SemEval-2010_graph_dict.pkl",
    "NUS_graph_dict.pkl",
    "Inspec_graph_dict.pkl",
    "KDD_graph_dict.pkl",
    "WWW_graph_dict.pkl",
    "SemEval-2017_graph_dict.pkl",
    "DUC-2001_graph_dict.pkl",
    "500N-KP-Crowd_graph_dict.pkl",
]

# Save every remote file into the working directory under its own name.
for f in files:
    urllib.request.urlretrieve(url=base_url + f, filename=f)

In [9]:
# Evaluate every downloaded graph dictionary and stack the per-dataset results.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
all_results = []

# glob returns matches in arbitrary (filesystem-dependent) order; sorting keeps
# the row order of results_df identical from run to run.
graph_dict_paths = sorted(glob.glob("*_graph_dict.pkl"))
if not graph_dict_paths:
    raise FileNotFoundError("No '*_graph_dict.pkl' files found in the working directory")

for path in graph_dict_paths:
    # The dataset name is the part of the file name before the fixed suffix.
    dataset_name = path.split("_graph_dict.pkl")[0]
    with open(path, "rb") as f:
        graph_dict = pickle.load(f)

    df = evaluate_graph_dict_optuna(graph_dict, dataset_name, seed=2025, n_trials=50)
    all_results.append(df)

results_df = pd.concat(all_results, ignore_index=True)
# Zeta keys are strings that use '_' as the decimal separator; convert them to floats.
results_df['Zeta'] = results_df['Zeta'].str.replace('_', '.', regex=False).astype(float)

In [None]:
# Percolation-result pickles to fetch, one per dataset.
files2 = [
    "SemEval-2010_percolation_results.pkl",
    # NOTE(review): "percolation" appears twice in this name, unlike every other entry - confirm it matches the remote file.
    "NUS_percolation_percolation_results.pkl",
    "Inspec_percolation_results.pkl",
    "KDD_percolation_results.pkl",
    "WWW_percolation_results.pkl",
    "SemEval-2017_percolation_results.pkl",
    "DUC-2001_percolation_results.pkl",
    "500N-KP-Crowd_percolation_results.pkl",
]

# Save every remote file into the working directory under its own name.
for f2 in files2:
    urllib.request.urlretrieve(url=base_url + f2, filename=f2)

In [10]:
from sklearn.utils import shuffle

In [11]:
# Load and stack all percolation result files.
# NOTE: pd.read_pickle unpickles its input - only read files from a trusted source.
# The matches are sorted because glob order is filesystem-dependent and the
# seeded shuffle below is only reproducible when its input order is fixed.
percolation_paths = sorted(glob.glob('*_percolation_results.pkl'))
percolation_dataset = pd.concat([pd.read_pickle(path) for path in percolation_paths], ignore_index=True)
percolation_dataset = percolation_dataset.drop(columns=['All Traces'])
# Zeta values are strings that use '_' as the decimal separator; convert them to floats.
percolation_dataset['Zeta'] = percolation_dataset['Zeta'].str.replace('_', '.', regex=False).astype(float)
percolation_dataset = shuffle(percolation_dataset, random_state=2025).reset_index(drop=True)

In [12]:
# Columns that identify one graph in both tables.
merge_keys = ["Dataset", "AKE Method", "Zeta", "Edge Measure"]

# Keep only the key columns and the percolation metrics to attach.
percolation_subset = percolation_dataset[merge_keys + ["RI", "Isolated Nodes", "Edge Count"]]

# Left join: every community-detection row is kept, with metrics added where the keys match.
new_results_df = pd.merge(results_df, percolation_subset, how="left", on=merge_keys)

In [13]:
# Bind the final dataset name to the merged frame; this is a second name for the same object, not a copy.
community_detection_dataset = new_results_df

In [None]:
# Show the merged results as the cell output.
community_detection_dataset

In [15]:
# Persist the final dataset as a pickle in the working directory.
output_path = "Community_Detection.pkl"
with open(output_path, "wb") as f:
    pickle.dump(community_detection_dataset, f)