# Social Network Analysis for Data Warehouses

## 2. Network Connectivity Analysis: Out-Degree

The **degree** of a node is the number of connections it has. In a directed graph, the **out-degree** counts only the edges that leave the node.

### 2.1. Import dependencies

In [None]:
import itertools
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import networkx as nx
from networkx.classes.graph import Graph
import numpy as np

### 2.2. Utility functions

In [None]:
def load_graph_from_csv(file: str) -> Graph:
    """Read a comma-delimited edge list from ``file`` into a directed graph.

    Args:
        file: Path of the edge-list file to read.

    Returns:
        The graph built by ``nx.read_edgelist`` using ``nx.DiGraph``.
    """
    directed_graph = nx.read_edgelist(file, create_using=nx.DiGraph, delimiter=",")
    return directed_graph

In [None]:
def format_graph_info(graph_id: str, graph: Graph) -> str:
    """Build a multi-line, human-readable summary of ``graph``.

    Args:
        graph_id: Label for the graph; it is upper-cased in the heading.
        graph: Graph whose metrics are reported.

    Returns:
        A heading line followed by the node count, edge count, density,
        average clustering coefficient and transitivity, one per line.
    """
    heading = graph_id.upper()
    node_total = nx.number_of_nodes(graph)
    edge_total = nx.number_of_edges(graph)
    density = nx.density(graph)
    avg_clustering = nx.average_clustering(graph)
    transitivity = nx.transitivity(graph)
    return (
        f"{heading} GRAPH INFO:\n"
        f"  Number of nodes: {node_total}\n"
        f"  Number of edges: {edge_total}\n"
        f"  Density: {density}\n"
        f"  Average clustering coefficient: {avg_clustering}\n"
        f"  Transitivity: {transitivity}"
    )

In [None]:
def get_out_degrees(graph: Graph) -> List[int]:
    """Collect the out-degree of every node of ``graph``.

    Args:
        graph: Object exposing an ``out_degree`` iterable of
            ``(node, degree)`` pairs.

    Returns:
        The degrees only, in the order ``graph.out_degree`` yields them.
    """
    out_degrees = []
    for _node, out_degree in graph.out_degree:
        out_degrees.append(out_degree)
    return out_degrees

In [None]:
def draw_out_degree_based_network(graph: Graph) -> None:
    """Draw ``graph`` with every node sized and coloured by its own out-degree.

    The per-node style lists are built in the same order as the ``nodelist``
    handed to ``nx.draw_networkx``. Sorting the degrees beforehand (as an
    earlier version did) detaches each value from the node it belongs to.

    Args:
        graph: Directed graph to draw on a new 12x8 matplotlib figure.
    """
    nodelist = []
    scaled_degrees = []
    for node, degree in graph.out_degree:
        nodelist.append(node)
        # Out-degree 0 is replaced by 0.05 so size/colour are not exactly zero
        # (presumably to keep such nodes visible).
        scaled_degrees.append(degree or 0.05)

    node_color = [degree * 20000 for degree in scaled_degrees]
    node_size = [degree * 100 for degree in scaled_degrees]

    pos = nx.spring_layout(graph)
    plt.figure(figsize=(12, 8))
    nx.draw_networkx(
        graph,
        pos=pos,
        nodelist=nodelist,
        with_labels=False,
        edge_color="dimgray",
        node_color=node_color,
        node_size=node_size,
    )
    plt.axis("off")

In [None]:
def print_out_degree_histogram(
    graph: Graph, color: str, start_from_degree: int = 0
) -> None:
    """Plot how many nodes have each out-degree as a bar chart.

    Args:
        graph: Graph whose out-degrees are counted.
        color: Matplotlib colour used for the bars.
        start_from_degree: Lowest out-degree to include. If this exact value
            does not occur in the graph, the plot starts at the next higher
            out-degree instead of raising ``IndexError``.
    """
    degrees = get_out_degrees(graph)
    unique_degrees, counts = np.unique(degrees, return_counts=True)

    # ``np.unique`` returns sorted values, so a binary search gives the index
    # of the first out-degree >= start_from_degree (0 keeps everything).
    start_from_index = int(np.searchsorted(unique_degrees, start_from_degree))

    plt.figure(figsize=(12, 3))
    plt.bar(unique_degrees[start_from_index:], counts[start_from_index:], color=color)
    plt.xlabel("Out-degree")
    plt.ylabel("# of Nodes")

In [None]:
def print_out_degree_ranking(graph: Graph, fmt: str, stop_at_degree: int = 0) -> None:
    """Plot the out-degrees in descending order against their rank.

    Args:
        graph: Graph whose out-degrees are ranked.
        fmt: Matplotlib format string passed to ``plt.plot``.
        stop_at_degree: Lowest out-degree to keep; nodes with a smaller
            out-degree are left out. ``0`` keeps every node.
    """
    degrees = sorted(get_out_degrees(graph), reverse=True)

    # Keeping everything >= the cut-off equals "slice up to the last
    # occurrence of the cut-off" on a descending list, but cannot run past
    # the end of the list and does not require the cut-off to be present.
    kept_degrees = [degree for degree in degrees if degree >= stop_at_degree]

    # Leading ``None`` placeholder shifts the data so the first rank is
    # plotted at x=1 rather than x=0.
    adjusted_list = [None, *kept_degrees]

    plt.figure(figsize=(12, 3))
    plt.plot(adjusted_list, fmt)
    plt.xlabel("Rank")
    plt.ylabel("Out-degree")

In [None]:
def group_nodes_by_out_degree(
    graph: Graph, highest_first: bool = True
) -> Dict[int, List[str]]:
    """Group the nodes of ``graph`` by their out-degree.

    The nodes are bucketed in a single pass over ``graph.out_degree`` instead
    of rescanning all nodes once per distinct degree.

    Args:
        graph: Object exposing an ``out_degree`` iterable of
            ``(node, degree)`` pairs.
        highest_first: When true (default) the returned dict is ordered from
            the highest out-degree to the lowest; otherwise ascending.

    Returns:
        A dict mapping each out-degree to the sorted list of nodes having it.
    """
    nodes_by_degree: Dict[int, List[str]] = {}
    for node, degree in graph.out_degree:
        nodes_by_degree.setdefault(int(degree), []).append(node)

    return {
        degree: sorted(nodes_by_degree[degree])
        for degree in sorted(nodes_by_degree, reverse=highest_first)
    }

In [None]:
def get_out_degree_critical_nodes_for_count(
    graph: Graph, target_node_count: int = 10
) -> Tuple[Dict[int, List[str]], int]:
    """Select the highest out-degree groups until enough nodes are covered.

    Whole out-degree groups are taken from the highest degree downwards until
    at least ``target_node_count`` nodes are selected or no groups remain, so
    a target larger than the graph terminates instead of looping forever.
    A summary of the selection is printed.

    Args:
        graph: Graph to analyse.
        target_node_count: Minimum number of nodes to select.

    Returns:
        A tuple ``(groups, node_count)``: the selected out-degree -> nodes
        mapping (highest degree first) and the number of nodes it contains.
    """
    degree_groups = group_nodes_by_out_degree(graph)
    degree_sum = sum(get_out_degrees(graph))

    highest_degrees: Dict[int, List[str]] = {}
    node_count = 0
    highest_degree_sum = 0
    for degree, nodes in degree_groups.items():
        if node_count >= target_node_count:
            break
        highest_degrees[degree] = nodes
        node_count += len(nodes)
        highest_degree_sum += degree * len(nodes)

    group_count = len(highest_degrees)
    # A graph without edges has a total out-degree of 0; report 0% rather
    # than dividing by zero.
    groups_degree_ratio = highest_degree_sum / degree_sum * 100 if degree_sum else 0.0
    print(
        f"{group_count}-critical-groups node count: {node_count}\n"
        f"{group_count}-critical-groups/total out-degree ratio:"
        f" {groups_degree_ratio:.0f}% ({highest_degree_sum}/{degree_sum})"
    )

    return highest_degrees, node_count

In [None]:
def get_out_degree_critical_nodes_for_ratio(
    graph: Graph, target_ratio: float = 20.0
) -> Tuple[Dict[int, List[str]], float]:
    """Select the highest out-degree groups until they reach a share of edges.

    Whole out-degree groups are taken from the highest degree downwards until
    their combined out-degree is at least ``target_ratio`` percent of the
    graph's total out-degree, or no groups remain. An unreachable target
    (for example above 100) therefore terminates instead of looping forever.
    A summary of the selection is printed.

    Args:
        graph: Graph to analyse.
        target_ratio: Required share of the total out-degree, in percent.

    Returns:
        A tuple ``(groups, ratio)``: the selected out-degree -> nodes mapping
        (highest degree first) and the percentage of the total out-degree
        those groups account for.
    """
    degree_groups = group_nodes_by_out_degree(graph)
    degree_sum = sum(get_out_degrees(graph))

    highest_degrees: Dict[int, List[str]] = {}
    highest_degree_sum = 0
    groups_degree_ratio = 0.0
    for degree, nodes in degree_groups.items():
        if groups_degree_ratio >= target_ratio:
            break
        highest_degrees[degree] = nodes
        highest_degree_sum += degree * len(nodes)
        # A graph without edges has a total out-degree of 0; keep the ratio
        # at 0% rather than dividing by zero.
        if degree_sum:
            groups_degree_ratio = highest_degree_sum / degree_sum * 100

    group_count = len(highest_degrees)
    node_count = sum(len(nodes) for nodes in highest_degrees.values())

    print(
        f"{group_count}-critical-groups node count: {node_count}\n"
        f"{group_count}-critical-groups/total out-degree ratio:"
        f" {groups_degree_ratio:.0f}% ({highest_degree_sum}/{degree_sum})"
    )

    return highest_degrees, groups_degree_ratio

### 2.3. Load the anonymized graphs from CSV

In [None]:
# Folder holding the anonymized edge-list CSV files, relative to the notebook.
anon_data_folder = "../data/anonymized"

# Load each dataset as a directed graph and print its summary metrics.
small_graph_1 = load_graph_from_csv(f"{anon_data_folder}/anon-dataset-small_1.csv")
print(f'{format_graph_info("small(1)", small_graph_1)}\n')

small_graph_2 = load_graph_from_csv(f"{anon_data_folder}/anon-dataset-small_2.csv")
print(f'{format_graph_info("small(2)", small_graph_2)}\n')

medium_graph = load_graph_from_csv(f"{anon_data_folder}/anon-dataset-medium.csv")
print(f'{format_graph_info("medium", medium_graph)}\n')

# Last summary is printed without the extra trailing blank line.
large_graph = load_graph_from_csv(f"{anon_data_folder}/anon-dataset-large.csv")
print(f'{format_graph_info("large", large_graph)}')

### 2.4. Draw the networks, degree histograms, and rankings

#### 2.4.1. Small network 1

In [None]:
# Draw small network 1 with the out-degree based node styling.
draw_out_degree_based_network(small_graph_1)

In [None]:
# Out-degree histogram of small network 1 (all out-degrees).
print_out_degree_histogram(small_graph_1, "blue")

In [None]:
# Out-degree ranking of small network 1 (all nodes).
print_out_degree_ranking(small_graph_1, "bo-")

In [None]:
# Select the highest out-degree groups of small network 1 using the
# function's default target ratio; the returned ratio is not needed here.
critical_groups_sg1, _ = get_out_degree_critical_nodes_for_ratio(small_graph_1)
print(f"\n{critical_groups_sg1}")

# Groups are ordered highest degree first, so the last key is the lowest
# out-degree in the selection; it is the cut-off for the plots below.
less_critical_out_degree_sg1 = list(critical_groups_sg1)[-1]

In [None]:
# Histogram restricted to out-degrees from the ratio-based cut-off upwards.
print_out_degree_histogram(
    small_graph_1, "blue", start_from_degree=less_critical_out_degree_sg1
)

In [None]:
# Ranking truncated at the ratio-based cut-off out-degree.
print_out_degree_ranking(
    small_graph_1, "bo-", stop_at_degree=less_critical_out_degree_sg1
)

In [None]:
# Informational only: count-based selection with the function's default
# target node count. The result is printed but not used by any later cell.

critical_groups_sg1, _ = get_out_degree_critical_nodes_for_count(small_graph_1)
print(f"\n{critical_groups_sg1}")

#### 2.4.2. Small network 2

In [None]:
# Draw small network 2 with the out-degree based node styling.
draw_out_degree_based_network(small_graph_2)

In [None]:
# Out-degree histogram of small network 2 (all out-degrees).
print_out_degree_histogram(small_graph_2, "red")

In [None]:
# Out-degree ranking of small network 2 (all nodes).
print_out_degree_ranking(small_graph_2, "ro-")

In [None]:
# Select the highest out-degree groups of small network 2 using the
# function's default target ratio; the returned ratio is not needed here.
critical_groups_sg2, _ = get_out_degree_critical_nodes_for_ratio(small_graph_2)
print(f"\n{critical_groups_sg2}")

# Groups are ordered highest degree first, so the last key is the lowest
# out-degree in the selection; it is the cut-off for the plots below.
less_critical_out_degree_sg2 = list(critical_groups_sg2)[-1]

In [None]:
# Histogram restricted to out-degrees from the ratio-based cut-off upwards.
print_out_degree_histogram(
    small_graph_2, "red", start_from_degree=less_critical_out_degree_sg2
)

In [None]:
# Ranking truncated at the ratio-based cut-off out-degree.
print_out_degree_ranking(
    small_graph_2, "ro-", stop_at_degree=less_critical_out_degree_sg2
)

In [None]:
# Informational only: count-based selection with the function's default
# target node count. The result is printed but not used by any later cell.

critical_groups_sg2, _ = get_out_degree_critical_nodes_for_count(small_graph_2)
print(f"\n{critical_groups_sg2}")

#### 2.4.3. Medium network

In [None]:
# Draw the medium network with the out-degree based node styling.
# (``draw_network`` is not defined in this notebook; the drawing helper
# defined in section 2.2 is ``draw_out_degree_based_network``.)
draw_out_degree_based_network(medium_graph)

In [None]:
# Out-degree histogram of the medium network (all out-degrees).
print_out_degree_histogram(medium_graph, "cyan")

In [None]:
# Out-degree ranking of the medium network (all nodes).
print_out_degree_ranking(medium_graph, "co-")

In [None]:
# Select the highest out-degree groups of the medium network using the
# function's default target ratio; the returned ratio is not needed here.
critical_groups_mg, _ = get_out_degree_critical_nodes_for_ratio(medium_graph)
print(f"\n{critical_groups_mg}")

# Groups are ordered highest degree first, so the last key is the lowest
# out-degree in the selection; it is the cut-off for the plots below.
less_critical_out_degree_mg = list(critical_groups_mg)[-1]

In [None]:
# Histogram restricted to out-degrees from the ratio-based cut-off upwards.
print_out_degree_histogram(
    medium_graph, "cyan", start_from_degree=less_critical_out_degree_mg
)

In [None]:
# Ranking truncated at the ratio-based cut-off out-degree.
print_out_degree_ranking(
    medium_graph, "co-", stop_at_degree=less_critical_out_degree_mg
)

In [None]:
# Count-based selection: take the highest out-degree groups until at least
# 15 nodes are covered. This overwrites the ratio-based result above.
critical_groups_mg, _ = get_out_degree_critical_nodes_for_count(medium_graph, 15)
print(f"\n{critical_groups_mg}")

# New cut-off: lowest out-degree among the count-based selection.
less_critical_out_degree_mg = list(critical_groups_mg)[-1]

In [None]:
# Histogram restricted to out-degrees from the count-based cut-off upwards.
print_out_degree_histogram(
    medium_graph, "cyan", start_from_degree=less_critical_out_degree_mg
)

In [None]:
# Ranking truncated at the count-based cut-off out-degree.
print_out_degree_ranking(
    medium_graph, "co-", stop_at_degree=less_critical_out_degree_mg
)

#### 2.4.4. Large network

In [None]:
# Draw the large network with the out-degree based node styling.
# (``draw_network`` is not defined in this notebook; the drawing helper
# defined in section 2.2 is ``draw_out_degree_based_network``.)
draw_out_degree_based_network(large_graph)

In [None]:
# Out-degree histogram of the large network (all out-degrees).
print_out_degree_histogram(large_graph, "magenta")

In [None]:
# Out-degree ranking of the large network (all nodes).
print_out_degree_ranking(large_graph, "mo-")

In [None]:
# Select the highest out-degree groups of the large network using the
# function's default target ratio; the returned ratio is not needed here.
critical_groups_lg, _ = get_out_degree_critical_nodes_for_ratio(large_graph)
print(f"\n{critical_groups_lg}")

# Groups are ordered highest degree first, so the last key is the lowest
# out-degree in the selection; it is the cut-off for the plots below.
less_critical_out_degree_lg = list(critical_groups_lg)[-1]

In [None]:
# Histogram restricted to out-degrees from the ratio-based cut-off upwards.
print_out_degree_histogram(
    large_graph, "magenta", start_from_degree=less_critical_out_degree_lg
)

In [None]:
# Ranking truncated at the ratio-based cut-off out-degree.
print_out_degree_ranking(large_graph, "mo-", stop_at_degree=less_critical_out_degree_lg)

In [None]:
# Count-based selection: take the highest out-degree groups until at least
# 20 nodes are covered. This overwrites the ratio-based result above.
critical_groups_lg, _ = get_out_degree_critical_nodes_for_count(large_graph, 20)
print(f"\n{critical_groups_lg}")

# New cut-off: lowest out-degree among the count-based selection.
less_critical_out_degree_lg = list(critical_groups_lg)[-1]

In [None]:
# Histogram restricted to out-degrees from the count-based cut-off upwards.
print_out_degree_histogram(
    large_graph, "magenta", start_from_degree=less_critical_out_degree_lg
)

In [None]:
# Ranking truncated at the count-based cut-off out-degree.
print_out_degree_ranking(large_graph, "mo-", stop_at_degree=less_critical_out_degree_lg)