In [1]:
!pip install kneed

Collecting kneed
  Downloading kneed-0.8.5-py3-none-any.whl (10 kB)
Installing collected packages: kneed
Successfully installed kneed-0.8.5


In [None]:
import os
import numpy as np
import networkx as nx
import pandas as pd
import time
from sklearn.cluster import KMeans
from sklearn.svm import SVR
from scipy.stats import kendalltau
from kneed import KneeLocator

# --- Step 1: Load Data ---
def load_graph_from_excel(file_path):
    """Build an undirected graph from the edge list stored in an Excel workbook.

    Reads the "Sheet1" worksheet of ``file_path``, adds one edge per row using
    the "source_column" and "target_column" columns, and then removes all
    self-loop edges. No filtering against the "SIR" sheet is done here.

    Args:
        file_path: Path of the workbook to read.

    Returns:
        networkx.Graph: The graph with self-loop edges removed.
    """
    edge_table = pd.read_excel(file_path, sheet_name="Sheet1")
    sources = edge_table["source_column"]
    targets = edge_table["target_column"]

    graph = nx.Graph()
    graph.add_edges_from(zip(sources, targets))
    graph.remove_edges_from(nx.selfloop_edges(graph))
    return graph

# --- Step 2: Feature Computation ---
def compute_connectivity_vector(G, node, node_to_index):
    """Build the 0/1 connectivity vector of ``node``.

    Args:
        G: Graph exposing ``nodes()`` and ``neighbors(node)``.
        node: Node whose neighbours are marked.
        node_to_index: Mapping from node to its position in the vector.

    Returns:
        numpy.ndarray: Vector of length ``len(G.nodes())`` holding 1 at the
        position of every neighbour found in ``node_to_index`` and 0 elsewhere.
    """
    vector = np.zeros(len(G.nodes()))
    # Neighbours without a known position are ignored.
    positions = [
        node_to_index[nbr]
        for nbr in G.neighbors(node)
        if nbr in node_to_index
    ]
    if positions:
        vector[positions] = 1
    return vector

def compute_degree_vector(G):
    """Collect the degree of every node, in ``G.nodes()`` iteration order.

    Returns:
        numpy.ndarray: One entry per node, read from ``G.degree[node]``.
    """
    degree_of = G.degree
    degrees = []
    for node in G.nodes():
        degrees.append(degree_of[node])
    return np.array(degrees)

def compute_coreness_vector(G):
    """Compute the extended coreness score of every node.

    For each node the score is ``core_number * degree`` of the node itself
    plus the same product summed over all of its neighbours.

    Returns:
        numpy.ndarray: Scores in ``G.nodes()`` iteration order.
    """
    core = nx.core_number(G)
    degree = G.degree
    # The core-number/degree product is needed for a node and again for each
    # of its neighbours, so compute it once per node.
    weight = {v: core[v] * degree[v] for v in G.nodes()}

    scores = []
    for v in G.nodes():
        neighbour_total = sum(weight[u] for u in G.neighbors(v))
        scores.append(weight[v] + neighbour_total)
    return np.array(scores)

def compute_feature_vector(G, alpha1=1.0, alpha2=3.0):
    """Compute one feature vector per node.

    A node's feature vector is its 0/1 connectivity vector multiplied
    element-wise by ``alpha1 * degree + alpha2 * extended_coreness``, so every
    neighbour position carries that neighbour's weighted score and every other
    position is 0.

    Args:
        G: Graph to featurise.
        alpha1: Weight applied to the degree vector.
        alpha2: Weight applied to the extended coreness vector.

    Returns:
        dict: Node -> numpy.ndarray of length ``len(G.nodes())``.
    """
    index_of = {}
    for position, node in enumerate(G.nodes()):
        index_of[node] = position

    # The weighted score vector is the same for every node; build it once.
    node_weights = alpha1 * compute_degree_vector(G) + alpha2 * compute_coreness_vector(G)

    return {
        node: compute_connectivity_vector(G, node, index_of) * node_weights
        for node in G.nodes()
    }

# --- Step 3: Optimal Clusters ---
def optimal_k(features):
    """Choose the number of K-Means clusters from the elbow of the inertia curve.

    K-Means is fitted for every k in ``range(1, min(10, n_samples))`` and the
    elbow of the resulting inertia values is located with ``KneeLocator``.
    When no elbow is found the fallback is 3, clamped to the number of samples
    so the returned value is always usable as ``n_clusters``.

    Args:
        features: Sequence or 2-D array with one feature vector per sample.

    Returns:
        int: Number of clusters, between 1 and ``n_samples``.

    Raises:
        ValueError: If ``features`` is empty.
    """
    n_samples = len(features)
    if n_samples == 0:
        raise ValueError("optimal_k requires at least one feature vector")

    # Never suggest more clusters than there are samples.
    fallback = min(3, n_samples)

    K_range = range(1, min(10, n_samples))
    if len(K_range) < 2:
        # With one or two samples there is at most one candidate k, so there
        # is no curve to search for an elbow.
        return fallback

    distortions = [
        KMeans(n_clusters=k, random_state=42).fit(features).inertia_
        for k in K_range
    ]
    knee = KneeLocator(list(K_range), distortions, curve="convex", direction="decreasing")
    return int(knee.elbow) if knee.elbow else fallback

# --- Step 4: Clustering & Sample Selection ---
def cluster_nodes(feature_vectors):
    """Group nodes with K-Means on their feature vectors.

    The number of clusters comes from ``optimal_k``; K-Means is fitted with
    ``random_state=42``.

    Args:
        feature_vectors: Mapping node -> feature vector.

    Returns:
        tuple: ``(clusters, k)`` where ``clusters`` maps every cluster id in
        ``range(k)`` to the list of its nodes (possibly empty) and ``k`` is the
        number of clusters used.
    """
    node_order = list(feature_vectors.keys())
    feature_matrix = np.array([feature_vectors[node] for node in node_order])

    k = optimal_k(feature_matrix)
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(feature_matrix)

    clusters = {}
    for cluster_id in range(k):
        clusters[cluster_id] = []
    for position, node in enumerate(node_order):
        clusters[model.labels_[position]].append(node)
    return clusters, k

def select_training_samples(clusters, s, beta_values, random_state=None):
    """Draw training nodes from every cluster, without replacement.

    Only nodes present in ``beta_values.index`` are eligible. Each cluster
    contributes up to ``max(1, int(s * total_nodes / len(clusters)))`` nodes.

    Args:
        clusters: Mapping cluster id -> list of nodes.
        s: Sampling ratio applied to the total number of clustered nodes.
        beta_values: pandas Series of target values indexed by node.
        random_state: ``None`` to use the global NumPy random state (the
            previous behaviour), an int seed, or an existing
            ``numpy.random.RandomState`` / ``numpy.random.Generator``.

    Returns:
        tuple: ``(sampled_x, sampled_y)`` - the chosen nodes, kept as their
        original Python objects, and the target values looked up for them.
        NOTE(review): the two lists have equal length only when
        ``beta_values.index`` has no duplicate labels - confirm for the data.
    """
    sampled_x, sampled_y = [], []
    if not clusters:
        # Nothing to sample from; also avoids dividing by len(clusters) == 0.
        return sampled_x, sampled_y

    if random_state is None:
        rng = np.random
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    total_nodes = sum(len(nodes) for nodes in clusters.values())
    sample_size_per_cluster = max(1, int(s * total_nodes / len(clusters)))

    for nodes in clusters.values():
        valid_nodes = [node for node in nodes if node in beta_values.index]
        if not valid_nodes:
            continue
        take = min(sample_size_per_cluster, len(valid_nodes))
        # Sample positions rather than the labels themselves: passing the
        # label list to choice() would coerce it into a NumPy array and change
        # label types (e.g. mixed int/str labels all become strings).
        positions = rng.choice(len(valid_nodes), take, replace=False)
        chosen = [valid_nodes[int(p)] for p in positions]
        sampled_x.extend(chosen)
        sampled_y.extend(beta_values.loc[chosen].values)
    return sampled_x, sampled_y

# --- Step 5: Train SVR Model ---
def train_svr(X_train, y_train):
    """Fit an RBF-kernel support vector regressor and return it.

    Args:
        X_train: Training feature vectors, one row per sample.
        y_train: Target value for each training sample.

    Returns:
        SVR: The fitted model (``kernel="rbf"``, ``gamma="scale"``).
    """
    model = SVR(gamma="scale", kernel="rbf")
    return model.fit(X_train, y_train)

# --- Step 6: Prediction & Evaluation ---
def predict_vitality(G, svr, feature_vectors, alpha=0.25):
    """Rank nodes by predicted score plus a weighted sum over their neighbours.

    The score of a node is ``prediction(node) + alpha * sum(prediction(nbr))``
    over all neighbours ``nbr`` of the node in ``G``.

    Args:
        G: Graph exposing ``nodes()`` and ``neighbors(node)``.
        svr: Fitted model exposing ``predict(rows)``.
        feature_vectors: Mapping node -> feature vector; must cover every node
            of ``G`` (extra entries are ignored).
        alpha: Weight of the neighbour contribution.

    Returns:
        list: ``(node, score)`` pairs sorted by score, highest first.
    """
    nodes = list(G.nodes())

    # Predict every node exactly once, in batches, instead of calling the
    # model again for each neighbour of each node. Batching keeps the
    # temporary row list small compared with one call for the whole graph.
    batch_size = 1024
    predicted = {}
    for start in range(0, len(nodes), batch_size):
        batch = nodes[start:start + batch_size]
        outputs = svr.predict([feature_vectors[node] for node in batch])
        for node, value in zip(batch, outputs):
            predicted[node] = float(value)

    EML = {}
    for node in nodes:
        neighbor_sum = sum(predicted[neighbor] for neighbor in G.neighbors(node))
        EML[node] = predicted[node] + alpha * neighbor_sum
    return sorted(EML.items(), key=lambda x: x[1], reverse=True)

def measure_execution_time(method, G):
    """Run ``method(G)`` and report how long the call took.

    Uses ``time.perf_counter()``, a monotonic high-resolution clock intended
    for measuring durations, so the result is not affected by system clock
    adjustments.

    Args:
        method: Callable taking the graph as its only argument.
        G: Graph passed to ``method``.

    Returns:
        tuple: ``(result, seconds)`` - the return value of ``method(G)`` and
        the elapsed time in seconds as a float.
    """
    start_time = time.perf_counter()
    ranked_nodes = method(G)
    execution_time = time.perf_counter() - start_time
    return ranked_nodes, execution_time

def compute_monotonicity(G, ranked_nodes):
    """Compute the monotonicity of a ranking from its tied scores.

    With ``n`` the number of nodes in ``G`` and ``n_r`` the number of entries
    sharing score ``r``, the result is
    ``(1 - sum(n_r * (n_r - 1)) / (n * (n - 1))) ** 2``: 1.0 when all scores
    are distinct, lower as more entries tie.

    Args:
        G: Graph exposing ``number_of_nodes()``.
        ranked_nodes: Iterable of ``(node, score)`` pairs.

    Returns:
        float: The monotonicity value. For graphs with fewer than two nodes,
        where the formula would divide by zero, 1.0 is returned.
    """
    from collections import Counter

    n = G.number_of_nodes()
    if n < 2:
        # No pair of nodes can tie, and n * (n - 1) would be zero.
        return 1.0

    # One pass over the scores instead of list.count() per unique score.
    tie_sizes = Counter(score for _, score in ranked_nodes).values()
    nr_sum = sum(nr * (nr - 1) for nr in tie_sizes)
    return (1 - nr_sum / (n * (n - 1))) ** 2

def compute_rbo(sigma, R, alpha=0.9):
    """Compute the Rank-Biased Overlap (RBO) of the rankings ``sigma`` and ``R``.

    For every depth ``f`` from 1 to the length of the longer ranking, the
    agreement is the size of the intersection of the two top-``f`` prefixes
    divided by the size of their union. The agreements are weighted by
    ``alpha ** (f - 1)``, summed, and scaled by ``1 - alpha``.

    Args:
        sigma: First ranking (sequence of items).
        R: Second ranking (sequence of items).
        alpha: Weight decay per depth.

    Returns:
        The weighted overlap; 0 when both rankings are empty.
    """
    depth_limit = max(len(sigma), len(R))

    agreements = []
    for depth in range(1, depth_limit + 1):
        left = set(sigma[:depth])
        right = set(R[:depth])
        combined = left | right
        if combined:
            agreements.append(len(left & right) / len(combined))
        else:
            agreements.append(1)

    weighted = (
        alpha ** (depth - 1) * agreement
        for depth, agreement in enumerate(agreements, start=1)
    )
    return (1 - alpha) * sum(weighted)

def compute_spread_impact(G, ranked_nodes, spread_power, f_values):
    """Compute the spread impact (SI) of the top-ranked nodes for several ratios.

    For each ratio ``f`` the first ``int(f * n)`` ranked nodes are taken
    (``n`` is the number of nodes in ``G``), their spread power is summed and
    the sum is divided by ``f * n``.

    Args:
        G: Graph exposing ``nodes()``.
        ranked_nodes: List of ``(node, score)`` pairs, best first.
        spread_power: Mapping node -> spread power; missing nodes count as 0.
        f_values: Iterable of ratios.

    Returns:
        dict: ``'SI <f with two decimals>'`` -> spread impact for that ratio.
    """
    node_count = len(G.nodes())
    ordered_nodes = [entry[0] for entry in ranked_nodes]

    si_scores = {}
    for f in f_values:
        cutoff = int(f * node_count)
        # get() tolerates ranked nodes that have no spread power recorded.
        total = sum(spread_power.get(node, 0) for node in ordered_nodes[:cutoff])
        si_scores[f'SI {f:.2f}'] = total / (f * node_count)
    return si_scores

def compute_mrr(ground_truth, ranked_list, top_ratios):
    """Compute the mean reciprocal rank (MRR) for several top-ratio cut-offs.

    For each ratio the first ``k = max(1, int(ratio * len(ground_truth)))``
    ground-truth nodes are looked up in ``ranked_list``; the reciprocals of
    their 1-based positions are summed and divided by ``k``. Nodes absent from
    ``ranked_list`` contribute nothing.

    Args:
        ground_truth: Nodes in true order, best first.
        ranked_list: Nodes in predicted order, best first.
        top_ratios: Iterable of ratios.

    Returns:
        dict: ``'MRR <ratio with two decimals>'`` -> score (0 when none of the
        top-k ground-truth nodes appears in ``ranked_list``).
    """
    # Build the node -> 1-based position lookup once; setdefault keeps the
    # first occurrence, matching list.index(). This replaces a linear scan of
    # ranked_list for every ground-truth node.
    position = {}
    for rank, node in enumerate(ranked_list, start=1):
        position.setdefault(node, rank)

    mrr_scores = {}
    num_nodes = len(ground_truth)
    for ratio in top_ratios:
        k = max(1, int(ratio * num_nodes))
        reciprocal_ranks = [
            1 / position[node]
            for node in ground_truth[:k]
            if node in position
        ]
        mrr_scores[f'MRR {ratio:.2f}'] = sum(reciprocal_ranks) / k if reciprocal_ranks else 0
    return mrr_scores

def compute_jaccard_similarity(set1, set2):
    """Return the Jaccard similarity |set1 & set2| / |set1 | set2|.

    Returns 0 when both sets are empty.
    """
    combined = set1 | set2
    if not combined:
        return 0
    return len(set1 & set2) / len(combined)

# --- Main Execution for a Single File ---
def main(file_path, beta_cols):
    """Evaluate the EML_SVR ranking on one workbook and save the metrics to Excel.

    Steps:
      1. Load the graph from "Sheet1" and compute feature vectors for it.
      2. Load the "SIR" sheet (indexed by its "Node" column) and restrict the
         graph to nodes listed there.
      3. For every column in ``beta_cols``: sample training nodes per cluster,
         fit an SVR, rank the nodes, and compute monotonicity, Kendall's tau,
         Jaccard, spread impact (SI), RBO and MRR against the SIR ordering.
      4. Write one row per beta column to ``<workbook name>_results.xlsx`` in
         the current working directory.

    Args:
        file_path: Path of the workbook with "Sheet1" and "SIR" sheets.
        beta_cols: Names of the "SIR" columns to evaluate.

    Raises:
        ValueError: If the edge sheet yields an empty graph.
    """
    print(f"\nProcessing dataset: {os.path.basename(file_path)}")
    G = load_graph_from_excel(file_path)
    if G.number_of_nodes() == 0:
        # Fail with a clear message instead of a bare StopIteration from
        # next(iter(G.nodes())) further down.
        raise ValueError(f"No edges found in Sheet1 of {file_path}")
    # Feature vectors are computed on the full graph, before SIR filtering.
    feature_vectors = compute_feature_vector(G)
    sir_data = pd.read_excel(file_path, sheet_name="SIR").set_index("Node")
    # Convert sir_data index to same type as graph nodes
    node_type = type(next(iter(G.nodes())))
    sir_data.index = sir_data.index.astype(node_type)
    # Keep only the graph nodes that have SIR data.
    valid_nodes = set(sir_data.index)
    G = G.subgraph(valid_nodes).copy()

    # Evaluation parameters.
    k_factors = [0.01, 0.03, 0.05, 0.07, 0.085, 0.10]
    f_values = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10]
    top_ratios = k_factors  # For MRR computation.

    dataset_results = []
    dataset_name = os.path.splitext(os.path.basename(file_path))[0]
    num_nodes = len(G.nodes())
    # One label per k_factor, shared by the score dict, the result rows and
    # the column header.
    jaccard_labels = [f'Jaccard k={max(1, int(k_factor * num_nodes))}' for k_factor in k_factors]

    # Clustering depends only on the feature vectors and uses a fixed
    # random_state, so it is computed once (on first use) and reused for
    # every beta column.
    clusters = None

    for beta_col in beta_cols:
        print(f"Processing beta column: {beta_col}")
        beta_values = sir_data[beta_col]

        if clusters is None:
            clusters, _ = cluster_nodes(feature_vectors)
        sampled_x, sampled_y = select_training_samples(clusters, 0.30, beta_values)

        X_train = np.array([feature_vectors[node] for node in sampled_x])
        y_train = np.array(sampled_y)
        if len(X_train) != len(y_train):
            print(f"Warning: X_train has {len(X_train)} samples but y_train has {len(y_train)} samples. Skipping beta {beta_col}.")
            continue

        svr = train_svr(X_train, y_train)
        ranked_nodes, exec_time = measure_execution_time(lambda G: predict_vitality(G, svr, feature_vectors), G)

        monotonicity = compute_monotonicity(G, ranked_nodes)

        # Ground truth ranking (sorted nodes by spread power in descending order).
        true_sir_ranking = beta_values.sort_values(ascending=False).index.tolist()
        predicted_ranking = [node for node, _ in ranked_nodes]

        # Compare the two rankings only on the nodes they share.
        common_nodes = set(true_sir_ranking) & set(predicted_ranking)
        filtered_sigma = [node for node in true_sir_ranking if node in common_nodes]
        filtered_predicted = [node for node in predicted_ranking if node in common_nodes]

        # Kendall's tau between the true rank of each predicted node and the
        # predicted order 0..n-1.
        true_rank_dict = {node: rank for rank, node in enumerate(filtered_sigma)}
        predicted_rank_list = [true_rank_dict[node] for node in filtered_predicted]
        kendall_tau, p_value = kendalltau(predicted_rank_list, list(range(len(predicted_rank_list))))

        jaccard_scores = {}
        for k_factor, label in zip(k_factors, jaccard_labels):
            k_val = max(1, int(k_factor * num_nodes))
            top_sigma = set(filtered_sigma[:k_val])
            top_predicted = set(filtered_predicted[:k_val])
            jaccard_scores[label] = compute_jaccard_similarity(top_sigma, top_predicted)

        spread_power = beta_values.to_dict()
        si_scores = compute_spread_impact(G, ranked_nodes, spread_power, f_values)

        mrr_scores = compute_mrr(filtered_sigma, filtered_predicted, top_ratios)

        # Compute RBO scores using the compute_rbo function.
        rbo_scores = {}
        for f in f_values:
            top = max(1, int(f * len(filtered_sigma)))
            sigma_top = filtered_sigma[:top]
            predicted_top = filtered_predicted[:top]
            rbo_scores[f'RBO {f:.2f}'] = compute_rbo(sigma_top, predicted_top)

        result_row = [
            dataset_name,
            'EML_SVR',
            exec_time,
            monotonicity,
            beta_col,
            kendall_tau,
            p_value
        ]
        result_row += [jaccard_scores[label] for label in jaccard_labels]
        result_row += [si_scores[f'SI {f:.2f}'] for f in f_values]
        result_row += [rbo_scores[f'RBO {f:.2f}'] for f in f_values]
        result_row += [mrr_scores[f'MRR {k_factor:.2f}'] for k_factor in top_ratios]

        dataset_results.append(result_row)

    columns = (['Dataset', 'Method', 'Execution Time', 'Monotonicity', 'Beta', 'Kendall Tau', 'P-Value'] +
               jaccard_labels +
               [f'SI {f:.2f}' for f in f_values] +
               [f'RBO {f:.2f}' for f in f_values] +
               [f'MRR {k_factor:.2f}' for k_factor in top_ratios])

    result_df = pd.DataFrame(dataset_results, columns=columns)
    output_file = f'{dataset_name}_results.xlsx'
    result_df.to_excel(output_file, index=False)
    print(f"Processed {dataset_name}, results saved in {output_file}")

# --- Process All Excel Files in the Dataset Folder ---
def process_dataset_folder(folder_path, beta_cols):
    """Run ``main`` on every ``.xlsx`` workbook found directly in a folder.

    Files are handled in sorted name order so runs are repeatable. Office
    lock files (names starting with ``~$``), which appear next to a workbook
    while it is open, are skipped. A failure in one workbook is reported and
    does not stop the remaining ones.

    Args:
        folder_path: Directory to scan (not recursive).
        beta_cols: Column names forwarded to ``main``.
    """
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith('.xlsx') or file.startswith('~$'):
            continue
        file_path = os.path.join(folder_path, file)
        try:
            main(file_path, beta_cols)
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error processing {file}: {e}")

# --- Run the Program ---
# Folder scanned for .xlsx workbooks; the path is relative, so it is resolved
# against the current working directory.
dataset_folder = 'dataset'
# Columns of each workbook's "SIR" sheet that are evaluated one after another.
beta_columns = ['Beta_0', 'Beta_1', 'Beta_2', 'Beta_3']
# NOTE(review): training samples are drawn with NumPy's global random state and
# no seed is set in the visible code, so results can differ between runs.
process_dataset_folder(dataset_folder, beta_columns)



Processing dataset: facebook_withSIR.xlsx
Processing beta column: Beta_0
