In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict

# Load the GEXF graph that the statistics and centrality cells below operate on.
G = nx.read_gexf("../graphs/double_hits_filtered_upper75_with_theme_no_unknown.gexf")
# NOTE(review): G_sub is not referenced in the cells visible here — confirm it is
# used further down, otherwise this load can be dropped.
G_sub = nx.read_gexf("../graphs/double_hits_filtered_upper75_with_theme_nomisc.gexf")

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
def calculate_centralities(graph):
    """Compute four networkx centrality measures for ``graph``.

    Args:
        graph: A networkx graph.

    Returns:
        A 4-tuple ``(betweenness, closeness, degree, eigenvector)`` holding
        the results of the corresponding ``nx.*_centrality`` calls, in that
        order.
    """
    # Order matters: callers unpack the result positionally.
    measures = (
        nx.betweenness_centrality,
        nx.closeness_centrality,
        nx.degree_centrality,
        nx.eigenvector_centrality,
    )
    return tuple(measure(graph) for measure in measures)

def get_top_nodes(centrality_dict, n=5):
    """Return the ``n`` highest-scoring keys as a comma-separated string.

    Args:
        centrality_dict: Mapping from node label to a comparable score.
        n: Number of top entries to keep (default 5).

    Returns:
        The labels of the ``n`` entries with the largest scores, ordered from
        highest to lowest and joined with ``', '``. Entries with equal scores
        keep their original mapping order. An empty mapping yields ``''``.
    """
    ranked = sorted(centrality_dict, key=centrality_dict.get, reverse=True)
    # str() each label so non-string node ids (e.g. ints) do not make join() raise.
    return ', '.join(str(node) for node in ranked[:n])

def calculate_network_stats(graph):
    """Summarise basic structural statistics of ``graph``.

    Args:
        graph: A networkx graph. ``nx.is_connected`` and
            ``nx.connected_components`` are called on it, so it is expected
            to be undirected.

    Returns:
        A dict with the keys ``'#Nodes'``, ``'#Edges'``, ``'Avg Degree'``,
        ``'Avg Cluster'``, ``'Avg Short Path'`` and ``'Is Connected'``
        (``'Yes'``/``'No'``). When the graph is not connected, the average
        shortest path length is computed on its largest connected component.
        For a graph without nodes the three averages are ``nan`` and
        ``'Is Connected'`` is ``'No'``.
    """
    num_nodes = graph.number_of_nodes()
    num_edges = graph.number_of_edges()

    if num_nodes == 0:
        # The averages are undefined for an empty graph: the degree mean would
        # divide by zero and networkx refuses connectivity queries on the
        # null graph. Report NaN instead of crashing.
        undefined = float('nan')
        return {
            '#Nodes': num_nodes,
            '#Edges': num_edges,
            'Avg Degree': undefined,
            'Avg Cluster': undefined,
            'Avg Short Path': undefined,
            'Is Connected': 'No'
        }

    avg_degree = sum(dict(graph.degree()).values()) / num_nodes

    # Check if the graph is connected
    is_connected = nx.is_connected(graph)

    if is_connected:
        avg_short_path = nx.average_shortest_path_length(graph)
    else:
        # Path lengths between components are undefined, so fall back to the
        # largest connected component.
        largest_cc = max(nx.connected_components(graph), key=len)
        largest_cc_graph = graph.subgraph(largest_cc)
        avg_short_path = nx.average_shortest_path_length(largest_cc_graph)

    return {
        '#Nodes': num_nodes,
        '#Edges': num_edges,
        'Avg Degree': avg_degree,
        'Avg Cluster': nx.average_clustering(graph),
        'Avg Short Path': avg_short_path,
        'Is Connected': 'Yes' if is_connected else 'No'
    }

In [3]:
# Centralities for the whole network (graph G). The four results are unpacked
# in the order returned by calculate_centralities and are read by the
# whole-network summary cell that follows.
betweenness, closeness, degree, eigenvector = calculate_centralities(G)

In [4]:
# Structural statistics of the whole network.
network_stats = calculate_network_stats(G)

# Top-ranked nodes of the whole network, one entry per centrality measure.
top_nodes = {
    label: get_top_nodes(scores)
    for label, scores in (
        ('Betweenness', betweenness),
        ('Closeness', closeness),
        ('Degree', degree),
        ('Eigenvector', eigenvector),
    )
}

# Assemble the whole-network row: label first, then the statistics, the number
# of distinct 'community' attribute values, and finally the top-node strings.
network_data = {'Network': 'Whole Network'}
network_data.update(network_stats)
network_data['#Community'] = len(set(nx.get_node_attributes(G, 'community').values()))
network_data.update(top_nodes)

In [5]:
# Group node ids by the value of their 'community' attribute.
community_data = defaultdict(list)
for node, community in nx.get_node_attributes(G, 'community').items():
    community_data[community].append(node)

# Build one summary row per community. Loop-local results use a `comm_` prefix
# so the whole-network `betweenness`/`closeness`/`degree`/`eigenvector` and
# `top_nodes` defined in earlier cells are not overwritten; otherwise
# re-running the whole-network cell after this one would silently report the
# last community's values.
community_rows = []
for community, nodes in community_data.items():
    subgraph = G.subgraph(nodes)
    comm_betweenness, comm_closeness, comm_degree, comm_eigenvector = calculate_centralities(subgraph)
    stats = calculate_network_stats(subgraph)
    comm_top_nodes = {
        'Betweenness': get_top_nodes(comm_betweenness),
        'Closeness': get_top_nodes(comm_closeness),
        'Degree': get_top_nodes(comm_degree),
        'Eigenvector': get_top_nodes(comm_eigenvector)
    }
    community_rows.append({
        'Network': f'Community {community}',
        **stats,
        '#Community': 1,  # each row describes exactly one community
        **comm_top_nodes
    })

# Combine whole network and community data: whole-network row first.
all_data = [network_data] + community_rows
df = pd.DataFrame(all_data)

In [6]:
# Columns to keep, in presentation order (any other column is dropped).
column_order = ['Network', '#Nodes', '#Edges', 'Avg Degree', 'Avg Cluster', 'Avg Short Path', '#Community',
                'Betweenness', 'Closeness', 'Degree', 'Eigenvector']

# Columns rounded to two decimals before export.
numeric_columns = ['#Nodes', '#Edges', 'Avg Degree', 'Avg Cluster', 'Avg Short Path', '#Community']

# Select and round in one expression. Assigning rounded columns back into the
# frame returned by df[column_order] can trigger pandas' SettingWithCopyWarning;
# round() with a per-column mapping returns a new frame and leaves the columns
# not listed in the mapping untouched.
df = df[column_order].round(dict.fromkeys(numeric_columns, 2))

# Save to CSV
df.to_csv('../data/network_centrality_measures.csv', index=False)

In [7]:
def df_to_markdown(
    df,
    title="Network Centrality Measures",
    description=("This table shows centrality measures for the whole network and each community. "
                 "The top five nodes are presented per centrality measure."),
):
    """Render ``df`` as a markdown document with a heading and an intro line.

    Args:
        df: Object exposing ``to_markdown(index=..., floatfmt=...)``, e.g. a
            pandas DataFrame.
        title: Text of the level-1 heading placed above the table.
        description: Paragraph placed between the heading and the table.

    Returns:
        A string made of ``# <title>``, a blank line, the description, a
        blank line, and the table rendered without its index and with floats
        formatted as ``.2f``. With the default arguments the heading and
        description are the original hard-coded texts.
    """
    table = df.to_markdown(index=False, floatfmt=".2f")
    return f"# {title}\n\n{description}\n\n{table}"

# Convert the summary frame to a markdown document (heading, description,
# table) and print the raw markdown text to the cell output.
markdown_table = df_to_markdown(df)
print(markdown_table)

# Network Centrality Measures

This table shows centrality measures for the whole network and each community. The top five nodes are presented per centrality measure.

| Network                               |   #Nodes |   #Edges |   Avg Degree |   Avg Cluster |   Avg Short Path |   #Community | Betweenness                                                            | Closeness                                                              | Degree                                                                 | Eigenvector                                                            |
|:--------------------------------------|---------:|---------:|-------------:|--------------:|-----------------:|-------------:|:-----------------------------------------------------------------------|:-----------------------------------------------------------------------|:-----------------------------------------------------------------------|:---------------------------------------------------------------