# Social Network Analysis applied to Data Warehouses

## 3. Network Connectivity Analysis: Betweenness Centrality

Some measures depend not only on a node's direct connections but also on its structural position in the network. **Betweenness centrality**, which refers to the number of times a given node is found in the shortest paths among other nodes, is one of them. Nodes with large betweenness are the primary conductors of information, also called network influencers. In other words, if the betweenness is high, the node is potentially a crucial go-between (thus the name) and has a brokerage capability (Zinoviev, 2018). The shortest paths in a graph can eventually materialize as data pipelines in the Data Warehouse, and betweenness centrality brings insights about the assets that have a higher potential to cause such workloads to fail.

### 3.1. Import dependencies

In [None]:
import itertools
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import networkx as nx
from networkx.classes.graph import Graph
import numpy as np
import pandas as pd
from pandas import Series

### 3.2. Utility functions

In [None]:
def load_graph_from_csv(file: str) -> Graph:
    """Read a comma-delimited edge list from ``file`` into a directed graph.

    Args:
        file: Path of the edge-list file handed to ``nx.read_edgelist``.

    Returns:
        The ``nx.DiGraph`` built from the edges found in the file.
    """
    directed_graph = nx.read_edgelist(file, create_using=nx.DiGraph, delimiter=",")
    return directed_graph

In [None]:
def format_graph_info(graph_id: str, graph: Graph) -> str:
    """Build a multi-line, human-readable summary of ``graph``.

    Args:
        graph_id: Label for the graph; it is upper-cased in the header line.
        graph: Graph whose metrics are reported.

    Returns:
        A header line followed by one indented line per metric (node count,
        edge count, density, average clustering coefficient, transitivity),
        joined with newlines and without a trailing newline.
    """
    report_lines = [
        f"{graph_id.upper()} GRAPH INFO:",
        f"  Number of nodes: {nx.number_of_nodes(graph)}",
        f"  Number of edges: {nx.number_of_edges(graph)}",
        f"  Density: {nx.density(graph)}",
        f"  Average clustering coefficient: {nx.average_clustering(graph)}",
        f"  Transitivity: {nx.transitivity(graph)}",
    ]
    return "\n".join(report_lines)

In [None]:
def get_betweenness_centrality(graph: Graph) -> Dict[str, float]:
    """Compute the non-normalized betweenness centrality of ``graph``.

    Args:
        graph: Graph to analyze.

    Returns:
        The result of ``nx.betweenness_centrality`` called with
        ``normalized=False``.
    """
    raw_scores = nx.betweenness_centrality(graph, normalized=False)
    return raw_scores

In [None]:
def get_betweenness_centrality_list(bet_cent: Dict[str, float]) -> List[float]:
    """Return the betweenness values of ``bet_cent`` as a list.

    Args:
        bet_cent: Mapping of node to betweenness centrality.

    Returns:
        The mapping's values, in the mapping's iteration order; the node
        keys are dropped.
    """
    return list(bet_cent.values())

In [None]:
def get_betweenness_centrality_series(bet_cent: Dict[str, float]) -> Series:
    """Wrap the betweenness values of ``bet_cent`` in a pandas ``Series``.

    Args:
        bet_cent: Mapping of node to betweenness centrality.

    Returns:
        A ``Series`` holding only the values; node keys are not used as the
        index, so the series gets the default index.
    """
    betweenness_values = get_betweenness_centrality_list(bet_cent)
    return pd.Series(betweenness_values)

In [None]:
def draw_betweenness_centrality_based_network(
    graph: Graph, bet_cent: Dict[str, float]
) -> None:
    """Draw ``graph`` with node size and color driven by betweenness centrality.

    Each node's betweenness is scaled so that the highest value maps to a
    size of 3500. Nodes with zero betweenness would be invisible, so they are
    given one tenth of the lowest non-zero betweenness before scaling.

    Args:
        graph: Graph to draw.
        bet_cent: Mapping of node to betweenness centrality; it must contain
            every node of ``graph``.

    Raises:
        KeyError: If a node of ``graph`` is missing from ``bet_cent``.
    """
    # draw_networkx applies node_size/node_color positionally, following
    # nodelist. Build the values in that exact order (and pass the order
    # explicitly) so every node gets its own betweenness, not a sorted one.
    nodelist = list(graph.nodes())
    node_bet_cent = [bet_cent[node] for node in nodelist]
    non_zero_bet_cent = [
        betweenness for betweenness in node_bet_cent if betweenness > 0
    ]

    if non_zero_bet_cent:
        zero_substitute = min(non_zero_bet_cent) / 10
        scaling_factor = 3500 / max(non_zero_bet_cent)
        normalized_node_params = [
            (betweenness or zero_substitute) * scaling_factor
            for betweenness in node_bet_cent
        ]
    else:
        # No node lies on any shortest path (or the graph is empty): there is
        # nothing to scale against, so draw every node with the same size
        # (300 is the draw_networkx default node size).
        normalized_node_params = [300.0] * len(nodelist)

    pos = nx.spring_layout(graph)
    plt.figure(figsize=(12, 8))
    nx.draw_networkx(
        graph,
        pos=pos,
        nodelist=nodelist,
        with_labels=False,
        edge_color="dimgray",
        node_color=normalized_node_params,
        node_size=normalized_node_params,
    )
    plt.axis("off")

In [None]:
def plot_betweenness_centrality_descriptive_stats(
    bet_cent: Dict[str, float], color: str
) -> None:
    """Draw a horizontal box plot of the betweenness values in ``bet_cent``.

    Args:
        bet_cent: Mapping of node to betweenness centrality.
        color: Face color used for the outlier (flier) markers.
    """
    outlier_style = {"markerfacecolor": color}
    scores = get_betweenness_centrality_list(bet_cent)

    plt.figure(figsize=(12, 3))
    plt.boxplot(scores, flierprops=outlier_style, vert=False)
    plt.xlabel("Betweenness Centrality")

In [None]:
def plot_betweenness_centrality_ranking(
    bet_cent: Dict[str, float], fmt: str, stop_at: float = 0
) -> None:
    """Plot betweenness values ranked from highest to lowest.

    Args:
        bet_cent: Mapping of node to betweenness centrality.
        fmt: Matplotlib format string passed to ``plt.plot`` (e.g. ``"ro-"``).
        stop_at: When non-zero, the plot is truncated after the last
            occurrence of this value in the ranking, so every node tied at
            ``stop_at`` is still shown. When zero (the default), the full
            ranking is plotted.

    Raises:
        ValueError: If ``stop_at`` is non-zero and does not occur among the
            betweenness values.
    """
    sorted_bet_cent = sorted(get_betweenness_centrality_list(bet_cent), reverse=True)

    if stop_at:
        # The list is sorted, so all values equal to stop_at are contiguous:
        # the first index plus the number of ties marks the end of the slice.
        # Counting avoids peeking past the end of the list when the ties run
        # up to the last element.
        first_index = sorted_bet_cent.index(stop_at)
        ranked_values = sorted_bet_cent[: first_index + sorted_bet_cent.count(stop_at)]
    else:
        ranked_values = sorted_bet_cent

    # A leading None shifts the data by one position so ranks start at 1 on
    # the x axis.
    adjusted_list = [None, *ranked_values]

    plt.figure(figsize=(12, 3))
    plt.plot(adjusted_list, fmt)
    plt.xlabel("Rank")
    plt.ylabel("Betweenness Centrality")

In [None]:
def group_nodes_by_betweenness_centrality(
    bet_cent: Dict[str, float], highest_first: bool = True
) -> Dict[float, List[str]]:
    """Group nodes that share the same betweenness centrality value.

    Args:
        bet_cent: Mapping of node to betweenness centrality.
        highest_first: Order of the returned groups; ``True`` (the default)
            puts the highest betweenness first, ``False`` the lowest.

    Returns:
        A dict keyed by betweenness value, in the requested order, whose
        values are the sorted lists of nodes having that betweenness. An
        empty input yields an empty dict.
    """
    # Single pass over the mapping instead of rescanning it once per unique
    # value; keys stay the plain numbers stored in bet_cent.
    nodes_by_value: Dict[float, List[str]] = {}
    for node, betweenness in bet_cent.items():
        nodes_by_value.setdefault(betweenness, []).append(node)

    return {
        value: sorted(nodes_by_value[value])
        for value in sorted(nodes_by_value, reverse=highest_first)
    }

In [None]:
def get_betweenness_centrality_critical_nodes_for_count(
    bet_cent: Dict[str, float], target_node_count: int
) -> Tuple[Dict[float, List[str]], int]:
    """Select the highest-betweenness groups covering a target number of nodes.

    Groups of nodes sharing the same betweenness are taken from the highest
    value downwards until together they hold at least ``target_node_count``
    nodes. Groups are never split, so the result may hold more nodes than
    requested. If the target exceeds the number of nodes available, every
    group is returned; a target of zero or less returns no group.

    A one-line summary with the number of groups and nodes selected is
    printed as a side effect.

    Args:
        bet_cent: Mapping of node to betweenness centrality.
        target_node_count: Minimum number of nodes the selection should hold.

    Returns:
        A tuple ``(groups, node_count)`` where ``groups`` maps each selected
        betweenness value (highest first) to its sorted node list and
        ``node_count`` is the total number of nodes in those groups.
    """
    highest_bc: Dict[float, List[str]] = {}
    node_count = 0

    # Walking the groups once guarantees termination even when the target is
    # larger than the total number of nodes.
    for value, nodes in group_nodes_by_betweenness_centrality(bet_cent).items():
        if node_count >= target_node_count:
            break
        highest_bc[value] = nodes
        node_count += len(nodes)

    group_count = len(highest_bc)
    print(f"{group_count}-critical-groups node count: {node_count}\n")

    return highest_bc, node_count

### 3.3. Load the anonymized graphs from CSV and cache their Betweenness Centrality data

In [None]:
# Load the four anonymized edge lists and print a summary of each graph right
# after it is loaded; summaries are separated by a blank line.
anon_data_folder = "../data/anonymized"

small_graph_1 = load_graph_from_csv(f"{anon_data_folder}/anon-dataset-small_1.csv")
print(format_graph_info("small(1)", small_graph_1), end="\n\n")

small_graph_2 = load_graph_from_csv(f"{anon_data_folder}/anon-dataset-small_2.csv")
print(format_graph_info("small(2)", small_graph_2), end="\n\n")

medium_graph = load_graph_from_csv(f"{anon_data_folder}/anon-dataset-medium.csv")
print(format_graph_info("medium", medium_graph), end="\n\n")

large_graph = load_graph_from_csv(f"{anon_data_folder}/anon-dataset-large.csv")
print(format_graph_info("large", large_graph))

In [None]:
# Compute the betweenness centrality of each graph once and keep the results
# for the cells below.
small_graph_1_bc, small_graph_2_bc, medium_graph_bc, large_graph_bc = (
    get_betweenness_centrality(loaded_graph)
    for loaded_graph in (small_graph_1, small_graph_2, medium_graph, large_graph)
)

### 3.4. Draw the networks, plot Betweenness Centrality data, etc

#### 3.4.1. Small network 1

In [None]:
# Draw small network 1 with node size and color derived from its cached
# betweenness centrality values.
draw_betweenness_centrality_based_network(small_graph_1, small_graph_1_bc)

In [None]:
# Print pandas descriptive statistics for the betweenness values of small
# network 1.
sg_1_bet_cent_series = get_betweenness_centrality_series(small_graph_1_bc)
print(sg_1_bet_cent_series.describe())

In [None]:
# Box plot of the betweenness values of small network 1; outliers are drawn
# in red.
plot_betweenness_centrality_descriptive_stats(small_graph_1_bc, "red")

As observed in the out-degree analysis, only a few nodes have high betweenness centrality in each of the four datasets, as summarized by the descriptive statistics and box plot above. Again, the research shows no clear threshold that can be used to determine the key assets from the betweenness centrality perspective.

However, ranking the assets according to their betweenness centralities makes it feasible to point out the outliers, which Data Governance folks should pay special attention to.

In [None]:
# Plot every betweenness value of small network 1, ranked from highest to
# lowest.
plot_betweenness_centrality_ranking(small_graph_1_bc, "ro-")

In [None]:
# Collect the highest-betweenness groups of small network 1 until they hold
# at least 10 nodes; the returned node count is discarded.
sg_1_critical_groups, _ = get_betweenness_centrality_critical_nodes_for_count(
    small_graph_1_bc, 10
)
print(f"\n{sg_1_critical_groups}")

In [None]:
# Re-plot the ranking of small network 1, truncated at the last (lowest)
# betweenness value among the critical groups.
plot_betweenness_centrality_ranking(
    small_graph_1_bc, "ro-", stop_at=list(sg_1_critical_groups)[-1]
)

#### 3.4.2. Small network 2

In [None]:
# Draw small network 2 with node size and color derived from its cached
# betweenness centrality values.
draw_betweenness_centrality_based_network(small_graph_2, small_graph_2_bc)

In [None]:
# Print pandas descriptive statistics for the betweenness values of small
# network 2.
sg_2_bet_cent_series = get_betweenness_centrality_series(small_graph_2_bc)
print(sg_2_bet_cent_series.describe())

In [None]:
# Box plot of the betweenness values of small network 2; outliers are drawn
# in blue.
plot_betweenness_centrality_descriptive_stats(small_graph_2_bc, "blue")

In [None]:
# Plot every betweenness value of small network 2, ranked from highest to
# lowest.
plot_betweenness_centrality_ranking(small_graph_2_bc, "bo-")

In [None]:
# Collect the highest-betweenness groups of small network 2 until they hold
# at least 10 nodes; the returned node count is discarded.
sg_2_critical_groups, _ = get_betweenness_centrality_critical_nodes_for_count(
    small_graph_2_bc, 10
)
print(f"\n{sg_2_critical_groups}")

In [None]:
# Re-plot the ranking of small network 2, truncated at the last (lowest)
# betweenness value among the critical groups.
plot_betweenness_centrality_ranking(
    small_graph_2_bc, "bo-", stop_at=list(sg_2_critical_groups)[-1]
)

#### 3.4.3. Medium network

In [None]:
# Draw the medium network with node size and color derived from its cached
# betweenness centrality values.
draw_betweenness_centrality_based_network(medium_graph, medium_graph_bc)

In [None]:
# Print pandas descriptive statistics for the betweenness values of the
# medium network.
mg_bet_cent_series = get_betweenness_centrality_series(medium_graph_bc)
print(mg_bet_cent_series.describe())

In [None]:
# Box plot of the betweenness values of the medium network; outliers are
# drawn in green.
plot_betweenness_centrality_descriptive_stats(medium_graph_bc, "green")

In [None]:
# Plot every betweenness value of the medium network, ranked from highest to
# lowest.
plot_betweenness_centrality_ranking(medium_graph_bc, "go-")

In [None]:
# Collect the highest-betweenness groups of the medium network until they
# hold at least 10 nodes; the returned node count is discarded.
mg_critical_groups, _ = get_betweenness_centrality_critical_nodes_for_count(
    medium_graph_bc, 10
)
print(f"\n{mg_critical_groups}")

In [None]:
# Re-plot the ranking of the medium network, truncated at the last (lowest)
# betweenness value among the critical groups.
plot_betweenness_centrality_ranking(
    medium_graph_bc, "go-", stop_at=list(mg_critical_groups)[-1]
)

#### 3.4.4. Large network

In [None]:
# Draw the large network with node size and color derived from its cached
# betweenness centrality values.
draw_betweenness_centrality_based_network(large_graph, large_graph_bc)

In [None]:
# Print pandas descriptive statistics for the betweenness values of the
# large network.
lg_bet_cent_series = get_betweenness_centrality_series(large_graph_bc)
print(lg_bet_cent_series.describe())

In [None]:
# Box plot of the betweenness values of the large network; outliers are
# drawn in magenta.
plot_betweenness_centrality_descriptive_stats(large_graph_bc, "magenta")

In [None]:
# Plot every betweenness value of the large network, ranked from highest to
# lowest.
plot_betweenness_centrality_ranking(large_graph_bc, "mo-")

In [None]:
# Collect the highest-betweenness groups of the large network until they
# hold at least 10 nodes; the returned node count is discarded.
lg_critical_groups, _ = get_betweenness_centrality_critical_nodes_for_count(
    large_graph_bc, 10
)
print(f"\n{lg_critical_groups}")

In [None]:
# Re-plot the ranking of the large network, truncated at the last (lowest)
# betweenness value among the critical groups.
plot_betweenness_centrality_ranking(
    large_graph_bc, "mo-", stop_at=list(lg_critical_groups)[-1]
)