In [1]:
import pandas as pd
import numpy as np
import pickle
import glob
from functools import lru_cache
import math
import urllib.request

In [None]:
# Common URL prefix under which every result pickle is published.
base_url = "https://raw.githubusercontent.com/potentialreviewer/Optimal-SNA/main/data/"

# One "<dataset>_AKE_results.pkl" file per dataset; the loading cell derives the
# dataset label from the part of the name before that suffix.
files = [
    "SemEval-2010_AKE_results.pkl",
    "NUS_AKE_results.pkl",
    "Inspec_AKE_results.pkl",
    "KDD_AKE_results.pkl",
    "WWW_AKE_results.pkl",
    "SemEval-2017_AKE_results.pkl",
    "DUC-2001_AKE_results.pkl",
    "500N-KP-Crowd_AKE_results.pkl",
]

# Save each remote file locally under the same name it has on the server.
for pkl_name in files:
    urllib.request.urlretrieve(url=base_url + pkl_name, filename=pkl_name)

In [2]:
# Load every downloaded "<dataset>_AKE_results.pkl" table and stack them into one
# long frame (AKE_dataset) with a 'Dataset' column identifying the source file.
ake_suffix = '_AKE_results.pkl'

# glob returns matches in arbitrary, filesystem-dependent order; sorting makes the
# row order of AKE_dataset reproducible across machines and re-runs.
ake_paths = sorted(glob.glob('*' + ake_suffix))
if not ake_paths:
    # Without this guard pd.concat fails with the opaque "No objects to concatenate".
    raise FileNotFoundError(
        f"No '*{ake_suffix}' files found in the working directory; "
        "run the download cell first."
    )

dfs = []
for path in ake_paths:
    # Strip only the trailing suffix (str.replace would also remove the same text
    # if it happened to occur earlier in the name).
    dataset_name = path[:-len(ake_suffix)]
    # NOTE(review): read_pickle unpickles the file, which can execute arbitrary
    # code -- only load files from a source you trust.
    df = (
        pd.read_pickle(path)
        .drop(columns=['F', 'pF'])  # only the 'hF' score is carried forward
        .rename(columns={'Model': 'AKE Method', 'hF': 'hF1'})
    )
    df['Dataset'] = dataset_name
    dfs.append(df)

AKE_dataset = pd.concat(dfs, ignore_index=True)

In [None]:
# Local name for the community-detection results; the loading cell opens this same path.
file_name = "Community_Detection.pkl"
# Remote location of that pickle.
url = "https://raw.githubusercontent.com/potentialreviewer/Optimal-SNA/main/data/Community_Detection.pkl"

# Download into the working directory (left as the cell's last expression, as before).
urllib.request.urlretrieve(url=url, filename=file_name)

In [3]:
# Read the community-detection results that the previous cell downloaded.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only unpickle data from a source you trust.
with open("Community_Detection.pkl", mode="rb") as pickle_file:
    community_detection = pickle.load(pickle_file)

In [4]:
# Remove columns that the rest of this analysis does not read.
# errors='ignore' keeps the cell idempotent: it rebinds `community_detection`
# to the trimmed frame, so on a second run the columns are already gone and a
# plain drop() would raise KeyError.
community_detection = community_detection.drop(
    columns=['Parameters', 'Isolated Nodes', 'Edge Count'],
    errors='ignore',
)

# Attach each AKE method's per-dataset score (hF1) to every community-detection row.
# A left join keeps all community-detection rows; rows with no matching
# (Dataset, AKE Method) pair in AKE_dataset get NaN in the AKE columns.
DP = community_detection.merge(AKE_dataset, on=['Dataset', 'AKE Method'], how='left')

In [5]:
# Display the merged table (community-detection rows joined with their hF1 score).
DP

Unnamed: 0,Dataset,AKE Method,Zeta,Edge Measure,Algorithm,Modularity,RI,hF1
0,500N-KP-Crowd,TF,0.25,CF,CNM,0.064823,0.874820,0.363168
1,500N-KP-Crowd,TF,0.25,CF,Louvain,0.055147,0.874820,0.363168
2,500N-KP-Crowd,TF,0.25,CF,Leiden,0.075047,0.874820,0.363168
3,500N-KP-Crowd,TF,0.25,CF,FLPA,0.000000,0.874820,0.363168
4,500N-KP-Crowd,TF,0.25,Dice,CNM,0.189070,0.852951,0.363168
...,...,...,...,...,...,...,...,...
3259,SemEval-2017,LMRANK,0.75,Jaccard,FLPA,0.644908,0.627747,0.289670
3260,SemEval-2017,LMRANK,0.75,Cosine,CNM,0.794301,0.000000,0.289670
3261,SemEval-2017,LMRANK,0.75,Cosine,Louvain,0.794301,0.000000,0.289670
3262,SemEval-2017,LMRANK,0.75,Cosine,Leiden,0.794301,0.000000,0.289670


In [6]:
def compute_objectives(DP):
    """Add an "Objective Function" column equal to hF1 + RI + Modularity.

    The column is written onto the frame that was passed in (the input is
    modified in place), and that same frame object is returned.

    Parameters
    ----------
    DP : pandas.DataFrame
        Must contain the columns "hF1", "RI" and "Modularity".

    Returns
    -------
    pandas.DataFrame
        The same object as ``DP``, now with the "Objective Function" column.
    """
    keyphrase_term = DP["hF1"]
    ri_term = DP["RI"]
    modularity_term = DP["Modularity"]
    # Keep the original left-to-right addition order so floating-point results match.
    DP["Objective Function"] = (keyphrase_term + ri_term) + modularity_term
    return DP

def dp_optimal_policy(DP, top_n=5):
    """Print the best-scoring pipeline configurations for every dataset.

    For each dataset (in order of first appearance in ``DP``) the function
    works backwards through three stages and prints a ranked list:

    * Stage 3 -- for every (AKE Method, Zeta, Edge Measure) triple, find the
      highest Modularity and every Algorithm that reaches it. The triple's RI
      is read from the first row of the group (assumes RI does not vary across
      algorithms within a triple -- TODO confirm).
    * Stage 2 -- for every AKE method, keep the (Zeta, Edge Measure) pairs that
      maximise ``RI + max Modularity``; ties are all kept.
    * Stage 1 -- add the method's hF1 (taken from its first row) to obtain the
      total objective, emitting one candidate per tied algorithm.

    Only each AKE method's own optimum (plus ties) becomes a candidate, so the
    printed ranking compares the per-method winners.

    Parameters
    ----------
    DP : pandas.DataFrame
        Needs the columns "Dataset", "AKE Method", "Zeta", "Edge Measure",
        "Algorithm", "Modularity", "RI" and "hF1".
    top_n : int, default 5
        Number of ranked rows to print per dataset. Rows whose objective equals
        that of the last printed row are printed as well, so more than
        ``top_n`` lines can appear.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    for dataset_name in DP["Dataset"].unique():
        subset = DP[DP["Dataset"] == dataset_name].copy()
        print(f"\n===== {dataset_name} =====")

        # AKE methods in order of first appearance; reused by stages 2 and 1.
        ake_methods = subset["AKE Method"].unique()

        # ----- Stage 3: best algorithm(s) per (AKE, Zeta, Edge) triple -----
        stage3 = {}
        triple_groups = subset.groupby(["AKE Method", "Zeta", "Edge Measure"], sort=False)
        for key, rows in triple_groups:
            top_modularity = rows["Modularity"].max()
            is_winner = rows["Modularity"] == top_modularity
            stage3[key] = {
                "max_Q": top_modularity,
                "best_algorithms": rows.loc[is_winner, "Algorithm"].tolist(),
                "RI": rows["RI"].iloc[0],
            }

        # ----- Stage 2: best (Zeta, Edge) pair(s) per AKE method -----
        stage2 = {}
        for method in ake_methods:
            method_rows = subset[subset["AKE Method"] == method]
            options = []
            for zeta_value in method_rows["Zeta"].unique():
                same_zeta = method_rows["Zeta"] == zeta_value
                for edge_measure in method_rows.loc[same_zeta, "Edge Measure"].unique():
                    info = stage3[(method, zeta_value, edge_measure)]
                    options.append((
                        info["RI"] + info["max_Q"],
                        zeta_value,
                        edge_measure,
                        info["best_algorithms"],
                    ))
            best_f2 = max(option[0] for option in options)
            # Keep every option that ties for the maximum.
            stage2[method] = [option for option in options if option[0] == best_f2]

        # ----- Stage 1: add the AKE score and expand ties into candidate rows -----
        candidates = []
        for method in ake_methods:
            method_hf1 = subset.loc[subset["AKE Method"] == method, "hF1"].iloc[0]
            for f2_score, zeta_value, edge_measure, winners in stage2[method]:
                info = stage3[(method, zeta_value, edge_measure)]
                objective = method_hf1 + f2_score
                candidates.extend(
                    {
                        "AKE": method,
                        "Zeta": zeta_value,
                        "Edge": edge_measure,
                        "Algorithm": algorithm_name,
                        "hF1": method_hf1,
                        "RI": info["RI"],
                        "Q": info["max_Q"],
                        "Objective Function": objective,
                    }
                    for algorithm_name in winners
                )

        # Rank best-first; sorted() is stable, so tied candidates keep insertion order.
        ranked = sorted(candidates, key=lambda rec: rec["Objective Function"], reverse=True)
        previous_score = None
        for position, r in enumerate(ranked):
            # Stop after top_n rows unless this row ties with the one just printed.
            if position >= top_n and r["Objective Function"] != previous_score:
                break
            print(f"AKE: {r['AKE']}, Zeta: {r['Zeta']}, Edge: {r['Edge']}, "
                  f"Algorithm: {r['Algorithm']}, hF1: {r['hF1']:.16f}, "
                  f"RI: {r['RI']:.16f}, Q: {r['Q']:.16f}, "
                  f"Objective Function: {r['Objective Function']:.16f}")
            previous_score = r["Objective Function"]

In [7]:
# Add the per-row "Objective Function" column (hF1 + RI + Modularity) to DP.
DP = compute_objectives(DP)
# Print the ranked per-dataset optimum configurations. dp_optimal_policy
# recomputes the total from hF1, RI and Modularity itself; it does not read
# the "Objective Function" column added above.
dp_optimal_policy(DP, top_n=5)


===== 500N-KP-Crowd =====
AKE: TfIdf, Zeta: 0.25, Edge: Cosine, Algorithm: Leiden, hF1: 0.3365800222744511, RI: 0.9965551724137929, Q: 0.5469087983804564, Objective Function: 1.8800439930687003
AKE: PositionRank, Zeta: 0.25, Edge: Cosine, Algorithm: Leiden, hF1: 0.2800983315961591, RI: 0.9836035398230092, Q: 0.5076657527071793, Objective Function: 1.7713676241263476
AKE: SingleRank, Zeta: 0.25, Edge: Cosine, Algorithm: CNM, hF1: 0.2548865730269983, RI: 0.9775089108910888, Q: 0.5299562422588925, Objective Function: 1.7623517261769797
AKE: SingleRank, Zeta: 0.25, Edge: Cosine, Algorithm: Leiden, hF1: 0.2548865730269983, RI: 0.9775089108910888, Q: 0.5299562422588925, Objective Function: 1.7623517261769797
AKE: LMRANK, Zeta: 0.25, Edge: CF, Algorithm: CNM, hF1: 0.1656400318240977, RI: 0.9496580645161289, Q: 0.5608365688217426, Objective Function: 1.6761346651619691
AKE: LMRANK, Zeta: 0.25, Edge: CF, Algorithm: Leiden, hF1: 0.1656400318240977, RI: 0.9496580645161289, Q: 0.5608365688217426,

In [8]:
# Display DP again: it now carries the "Objective Function" column that
# compute_objectives wrote onto the frame.
DP

Unnamed: 0,Dataset,AKE Method,Zeta,Edge Measure,Algorithm,Modularity,RI,hF1,Objective Function
0,500N-KP-Crowd,TF,0.25,CF,CNM,0.064823,0.874820,0.363168,1.302810
1,500N-KP-Crowd,TF,0.25,CF,Louvain,0.055147,0.874820,0.363168,1.293134
2,500N-KP-Crowd,TF,0.25,CF,Leiden,0.075047,0.874820,0.363168,1.313035
3,500N-KP-Crowd,TF,0.25,CF,FLPA,0.000000,0.874820,0.363168,1.237988
4,500N-KP-Crowd,TF,0.25,Dice,CNM,0.189070,0.852951,0.363168,1.405189
...,...,...,...,...,...,...,...,...,...
3259,SemEval-2017,LMRANK,0.75,Jaccard,FLPA,0.644908,0.627747,0.289670,1.562325
3260,SemEval-2017,LMRANK,0.75,Cosine,CNM,0.794301,0.000000,0.289670,1.083971
3261,SemEval-2017,LMRANK,0.75,Cosine,Louvain,0.794301,0.000000,0.289670,1.083971
3262,SemEval-2017,LMRANK,0.75,Cosine,Leiden,0.794301,0.000000,0.289670,1.083971
