# Social Network Analysis for Data Warehouses

## 3. Network Connectivity Analysis: Betweenness Centrality

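**Betweenness Centrality** measures the fraction of all shortest paths (geodesics) between pairs of other nodes that pass through a given node. A node with high betweenness is potentially a crucial go-between (hence the name) and has brokerage capability. Note that the values computed in this notebook are unnormalized, i.e. they are not rescaled by the number of node pairs.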

### 3.1. Import dependencies

In [None]:
import itertools
import pprint
from typing import Dict, List

import matplotlib.pyplot as plt
import networkx as nx
from networkx.classes.graph import Graph
import pandas as pd
from pandas import Series
import numpy as np

### 3.2. Utility functions

In [None]:
def load_graph_from_csv(file: str) -> Graph:
    """Read a comma-delimited edge list from ``file`` into a directed graph.

    Args:
        file: Path of the edge-list file handed to ``nx.read_edgelist``.

    Returns:
        The ``nx.DiGraph`` built from the edges in the file.
    """
    directed_graph = nx.read_edgelist(file, delimiter=",", create_using=nx.DiGraph)
    return directed_graph

In [None]:
def format_graph_info(graph_id: str, graph: Graph) -> str:
    """Build a multi-line, human-readable summary of ``graph``.

    Args:
        graph_id: Label for the graph; it is upper-cased in the heading line.
        graph: Graph whose basic metrics are reported.

    Returns:
        A heading followed by one indented line per metric (node count,
        edge count, density, average clustering coefficient, transitivity),
        joined with newlines and without a trailing newline.
    """
    summary_lines = [
        f"{graph_id.upper()} GRAPH INFO:",
        f"  Number of nodes: {nx.number_of_nodes(graph)}",
        f"  Number of edges: {nx.number_of_edges(graph)}",
        f"  Density: {nx.density(graph)}",
        f"  Average clustering coefficient: {nx.average_clustering(graph)}",
        f"  Transitivity: {nx.transitivity(graph)}",
    ]
    return "\n".join(summary_lines)

In [None]:
def get_betweenness_centrality(graph: Graph) -> Dict[str, float]:
    """Compute the unnormalized betweenness centrality of every node.

    Args:
        graph: Graph to analyse.

    Returns:
        The mapping returned by ``nx.betweenness_centrality`` when called
        with ``normalized=False``.
    """
    raw_scores = nx.betweenness_centrality(graph, normalized=False)
    return raw_scores

In [None]:
def get_betweenness_centrality_list(bet_cent: Dict[str, float]) -> List[float]:
    """Return the betweenness values of ``bet_cent`` as a list.

    Args:
        bet_cent: Mapping from node to betweenness centrality.

    Returns:
        The mapping's values, in the mapping's iteration order.
    """
    return list(bet_cent.values())

In [None]:
def get_betweenness_centrality_series(bet_cent: Dict[str, float]) -> Series:
    """Wrap the betweenness values of ``bet_cent`` in a pandas ``Series``.

    Args:
        bet_cent: Mapping from node to betweenness centrality.

    Returns:
        A ``Series`` holding only the values (node keys are not kept), with
        the default integer index.
    """
    values = get_betweenness_centrality_list(bet_cent)
    return pd.Series(values)

In [None]:
def draw_betweenness_centrality_based_network(
    graph: Graph, bet_cent: Dict[str, float]
) -> None:
    """Draw ``graph`` with node size and colour driven by betweenness.

    Each node's size (and colour value) is its betweenness scaled so that
    the highest-scoring node gets a size of 3500. Nodes with a betweenness
    of zero are given one tenth of the lowest non-zero betweenness so that
    they remain visible.

    Args:
        graph: Graph to draw.
        bet_cent: Mapping from node to betweenness centrality; it must
            contain an entry for every node of ``graph``.

    Raises:
        KeyError: If a node of ``graph`` has no entry in ``bet_cent``.
    """
    max_node_size = 3500

    # Fix the node order once and look every score up by node, so that the
    # i-th size/colour really belongs to the i-th drawn node. Sorting the
    # scores (as was done previously) detaches them from their nodes.
    node_list = list(graph.nodes)
    scores = [bet_cent[node] for node in node_list]

    positive_scores = [score for score in scores if score > 0]
    if positive_scores:
        scaling_factor = max_node_size / max(positive_scores)
        zero_substitute = min(positive_scores) / 10
        normalized_node_params = [
            (score or zero_substitute) * scaling_factor for score in scores
        ]
    else:
        # Every score is zero (or there are no nodes): there is nothing to
        # scale against, so draw all nodes with one uniform small size
        # instead of failing on an empty search / division by zero.
        normalized_node_params = [max_node_size / 10] * len(scores)

    pos = nx.spring_layout(graph)
    plt.figure(figsize=(12, 8))
    nx.draw_networkx(
        graph,
        pos=pos,
        nodelist=node_list,
        with_labels=False,
        edge_color="dimgray",
        node_color=normalized_node_params,
        node_size=normalized_node_params,
    )
    plt.axis("off")

In [None]:
def plot_betweenness_centrality_descriptive_stats(
    bet_cent: Dict[str, float], color: str
) -> None:
    """Draw a horizontal box plot of the betweenness values.

    Args:
        bet_cent: Mapping from node to betweenness centrality.
        color: Face colour used for the outlier (flier) markers.
    """
    values = get_betweenness_centrality_list(bet_cent)
    outlier_style = {"markerfacecolor": color}

    plt.figure(figsize=(12, 3))
    plt.boxplot(values, vert=False, flierprops=outlier_style)
    plt.xlabel("Betweenness Centrality")

In [None]:
def plot_betweenness_centrality_ranking(
    bet_cent: Dict[str, float], fmt: str, stop_at: float = 0
) -> None:
    """Plot betweenness values against their rank, highest value first.

    Args:
        bet_cent: Mapping from node to betweenness centrality.
        fmt: Matplotlib format string passed to ``plt.plot``.
        stop_at: When truthy, the ranking is cut off after the last value
            equal to ``stop_at``; the default ``0`` plots every value.

    Raises:
        ValueError: If ``stop_at`` is truthy but is not one of the values
            in ``bet_cent``.
    """
    ranked = sorted(get_betweenness_centrality_list(bet_cent), reverse=True)

    if stop_at:
        stop_at_index = ranked.index(stop_at)
        last_index = len(ranked) - 1
        # Advance through the run of values tied with ``stop_at``. The
        # bounds check keeps the look-ahead inside the list when that run
        # extends to the very last element.
        while (
            stop_at_index < last_index
            and ranked[stop_at_index] == ranked[stop_at_index + 1]
        ):
            stop_at_index += 1
        ranked = ranked[: stop_at_index + 1]

    # The leading None occupies x = 0 so the first real value is at rank 1.
    adjusted_list = [None, *ranked]

    plt.figure(figsize=(12, 3))
    plt.plot(adjusted_list, fmt)
    plt.xlabel("Rank")
    plt.ylabel("Betweenness Centrality")

### 3.3. Load the anonymized graphs from CSV and cache their Betweenness Centrality data

In [None]:
anon_data_folder = "../data/anonymized"

# Load each anonymized edge list and print its summary. Every summary except
# the last is followed by a blank line.
small_graph_1 = load_graph_from_csv(
    file=f"{anon_data_folder}/anon-dataset-small_1.csv"
)
print(format_graph_info("small(1)", small_graph_1), end="\n\n")

small_graph_2 = load_graph_from_csv(
    file=f"{anon_data_folder}/anon-dataset-small_2.csv"
)
print(format_graph_info("small(2)", small_graph_2), end="\n\n")

medium_graph = load_graph_from_csv(
    file=f"{anon_data_folder}/anon-dataset-medium.csv"
)
print(format_graph_info("medium", medium_graph), end="\n\n")

large_graph = load_graph_from_csv(
    file=f"{anon_data_folder}/anon-dataset-large.csv"
)
print(format_graph_info("large", large_graph))

In [None]:
# Compute the betweenness centrality of each graph once; the cells below
# reuse these mappings.
small_graph_1_bc = get_betweenness_centrality(graph=small_graph_1)
small_graph_2_bc = get_betweenness_centrality(graph=small_graph_2)
medium_graph_bc = get_betweenness_centrality(graph=medium_graph)
large_graph_bc = get_betweenness_centrality(graph=large_graph)

### 3.4. Draw the networks, plot Betweenness Centrality, etc.

#### 3.4.1. Small network 1

In [None]:
# Draw small network 1 with nodes sized and coloured by betweenness.
draw_betweenness_centrality_based_network(
    graph=small_graph_1, bet_cent=small_graph_1_bc
)

In [None]:
# Descriptive statistics of the betweenness values of small network 1.
# Printed explicitly, like the summaries of the other networks, so the
# output also appears when the code is run outside a notebook.
sg_1_bc_series = get_betweenness_centrality_series(small_graph_1_bc)
print(sg_1_bc_series.describe())

In [None]:
# Box plot of the betweenness values of small network 1.
plot_betweenness_centrality_descriptive_stats(
    bet_cent=small_graph_1_bc, color="blue"
)

In [None]:
# Full betweenness ranking of small network 1.
plot_betweenness_centrality_ranking(bet_cent=small_graph_1_bc, fmt="bo-")

In [None]:
# Ranking of small network 1 restricted to the values at or above the
# 0.97 quantile ("nearest" interpolation yields a value present in the data).
plot_betweenness_centrality_ranking(
    bet_cent=small_graph_1_bc,
    fmt="bo-",
    stop_at=sg_1_bc_series.quantile(q=0.97, interpolation="nearest"),
)

#### 3.4.2. Small network 2

In [None]:
# Draw small network 2 with nodes sized and coloured by betweenness.
draw_betweenness_centrality_based_network(
    graph=small_graph_2, bet_cent=small_graph_2_bc
)

In [None]:
# Descriptive statistics of the betweenness values of small network 2.
sg_2_bc_series = get_betweenness_centrality_series(bet_cent=small_graph_2_bc)
print(sg_2_bc_series.describe())

In [None]:
# Box plot of the betweenness values of small network 2.
plot_betweenness_centrality_descriptive_stats(
    bet_cent=small_graph_2_bc, color="red"
)

In [None]:
# Full betweenness ranking of small network 2.
plot_betweenness_centrality_ranking(bet_cent=small_graph_2_bc, fmt="ro-")

In [None]:
# Ranking of small network 2 restricted to the values at or above the
# 0.97 quantile ("nearest" interpolation yields a value present in the data).
plot_betweenness_centrality_ranking(
    bet_cent=small_graph_2_bc,
    fmt="ro-",
    stop_at=sg_2_bc_series.quantile(q=0.97, interpolation="nearest"),
)

#### 3.4.3. Medium network

In [None]:
# Draw the medium network with nodes sized and coloured by betweenness.
draw_betweenness_centrality_based_network(
    graph=medium_graph, bet_cent=medium_graph_bc
)

In [None]:
# Descriptive statistics of the betweenness values of the medium network.
mg_bc_series = get_betweenness_centrality_series(bet_cent=medium_graph_bc)
print(mg_bc_series.describe())

In [None]:
# Box plot of the betweenness values of the medium network.
plot_betweenness_centrality_descriptive_stats(
    bet_cent=medium_graph_bc, color="green"
)

In [None]:
# Full betweenness ranking of the medium network.
plot_betweenness_centrality_ranking(bet_cent=medium_graph_bc, fmt="go-")

In [None]:
# Ranking of the medium network restricted to the values at or above the
# 0.99 quantile ("nearest" interpolation yields a value present in the data).
plot_betweenness_centrality_ranking(
    bet_cent=medium_graph_bc,
    fmt="go-",
    stop_at=mg_bc_series.quantile(q=0.99, interpolation="nearest"),
)

#### 3.4.4. Large network

In [None]:
# Draw the large network with nodes sized and coloured by betweenness.
draw_betweenness_centrality_based_network(
    graph=large_graph, bet_cent=large_graph_bc
)

In [None]:
# Descriptive statistics of the betweenness values of the large network.
lg_bc_series = get_betweenness_centrality_series(bet_cent=large_graph_bc)
print(lg_bc_series.describe())

In [None]:
# Box plot of the betweenness values of the large network.
plot_betweenness_centrality_descriptive_stats(
    bet_cent=large_graph_bc, color="magenta"
)

In [None]:
# Full betweenness ranking of the large network.
plot_betweenness_centrality_ranking(bet_cent=large_graph_bc, fmt="mo-")

In [None]:
# Ranking of the large network restricted to the values at or above the
# 0.993 quantile ("nearest" interpolation yields a value present in the data).
plot_betweenness_centrality_ranking(
    bet_cent=large_graph_bc,
    fmt="mo-",
    stop_at=lg_bc_series.quantile(q=0.993, interpolation="nearest"),
)