# Social Network Analysis for Data Warehouses

## 1. Import dependencies

In [None]:
import itertools
from typing import Dict, List

import matplotlib.pyplot as plt
import networkx as nx
from networkx.classes.graph import Graph
import numpy as np

## 2. Load the graphs with anonymous table names

_This step is intended to run only once, so it is commented out to avoid misuse._

`nx.convert_node_labels_to_integers()`, used to anonymize the table names, is not guaranteed to assign the same integer value to a given node label every time it runs; thus, the results from the one-time run are persisted in the next step and should be pushed to the Git repo to generate consistent/reproducible research results in later notebook runs.

In [None]:
# prepared_data_folder = "../data/prepared"

# small_graph_1 = nx.convert_node_labels_to_integers(
#     nx.read_edgelist(
#         f"{prepared_data_folder}/prepared-dataset-small.csv",
#         delimiter=",", create_using=nx.DiGraph()),
#     first_label=10001)

# small_graph_2 = nx.convert_node_labels_to_integers(
#     nx.read_edgelist(
#         f"{prepared_data_folder}/prepared-dataset-medium_1.csv",
#         delimiter=",", create_using=nx.DiGraph()),
#     first_label=20001)

# medium_graph = nx.convert_node_labels_to_integers(
#     nx.read_edgelist(
#         f"{prepared_data_folder}/prepared-dataset-medium_2.csv",
#         delimiter=",", create_using=nx.DiGraph()),
#     first_label=300001)

# large_graph = nx.convert_node_labels_to_integers(
#     nx.read_edgelist(
#         f"{prepared_data_folder}/prepared-dataset-large.csv",
#         delimiter=",", create_using=nx.DiGraph()),
#     first_label=4000001)

## 3. Store the anonymized graphs as CSV

_This step is intended to run only once, so it is commented out to avoid misuse._

In [None]:
# anon_data_folder = "../data/anonymized"

# nx.write_edgelist(small_graph_1,
#     f"{anon_data_folder}/anon-dataset-small_1.csv",
#     delimiter=",", data=False)

# nx.write_edgelist(small_graph_2,
#     f"{anon_data_folder}/anon-dataset-small_2.csv",
#     delimiter=",", data=False)

# nx.write_edgelist(medium_graph,
#     f"{anon_data_folder}/anon-dataset-medium.csv",
#     delimiter=",", data=False)

# nx.write_edgelist(large_graph,
#     f"{anon_data_folder}/anon-dataset-large.csv",
#     delimiter=",", data=False)

## 4. Global utility functions

In [None]:
def load_graph_from_csv(file: str) -> Graph:
    """Read a comma-separated edge list into a directed graph.

    Args:
        file: Path of the CSV file; each line holds one edge.

    Returns:
        A new ``nx.DiGraph`` built from the edges in ``file``.
    """
    directed_graph = nx.read_edgelist(
        file,
        create_using=nx.DiGraph,
        delimiter=",",
    )
    return directed_graph

In [None]:
def format_graph_info(graph_id: str, graph: Graph) -> str:
    """Build a short, multi-line text summary of a graph.

    Args:
        graph_id: Label for the graph; it is upper-cased in the header line.
        graph: Graph to summarize.

    Returns:
        Four lines (no trailing newline): a header, the node count, the edge
        count, and the average clustering coefficient.
    """
    summary_lines = [
        f"{graph_id.upper()} GRAPH INFO:",
        f"  Number of nodes: {graph.number_of_nodes()}",
        f"  Number of edges: {graph.number_of_edges()}",
        f"  Average clustering coefficient: {nx.average_clustering(graph)}",
    ]
    return "\n".join(summary_lines)

In [None]:
def draw_network(graph: Graph) -> None:
    """Draw a graph with nodes styled by degree and betweenness centrality.

    Node color values grow with the node's degree and node sizes grow with
    its betweenness centrality. Labels and axes are hidden.

    Args:
        graph: Graph to draw on a new 12x8 matplotlib figure.
    """
    layout = nx.spring_layout(graph)
    centrality = nx.betweenness_centrality(
        graph, endpoints=True, normalized=True)

    # Both lists follow the graph's node iteration order, which is the order
    # draw_networkx uses when pairing per-node values with nodes.
    colors = [20000 * degree for _, degree in graph.degree]
    sizes = [score * 1000000 for score in centrality.values()]

    plt.figure(figsize=(12, 8))
    draw_options = {
        "pos": layout,
        "with_labels": False,
        "edge_color": "dimgray",
        "node_color": colors,
        "node_size": sizes,
    }
    nx.draw_networkx(graph, **draw_options)
    plt.axis("off")

## 5. Load the anonymized graphs from CSV

In [None]:
# Folder that holds the anonymized edge-list CSV files.
anon_data_folder = "../data/anonymized"

# Load each graph and print its summary; summaries are separated by a blank
# line, and the last one ends with a single newline.
small_graph_1 = load_graph_from_csv(
    anon_data_folder + "/anon-dataset-small_1.csv")
print(format_graph_info("small(1)", small_graph_1), end="\n\n")

small_graph_2 = load_graph_from_csv(
    anon_data_folder + "/anon-dataset-small_2.csv")
print(format_graph_info("small(2)", small_graph_2), end="\n\n")

medium_graph = load_graph_from_csv(
    anon_data_folder + "/anon-dataset-medium.csv")
print(format_graph_info("medium", medium_graph), end="\n\n")

large_graph = load_graph_from_csv(
    anon_data_folder + "/anon-dataset-large.csv")
print(format_graph_info("large", large_graph))

## Draw the initial networks

In [None]:
# Draw the first small graph.
draw_network(small_graph_1)

In [None]:
# Draw the second small graph.
draw_network(small_graph_2)

## Network Connectivity Analysis: Degree

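A node's degree is the number of connections (edges) it has. Because the graphs here are directed, this section analyzes the out-degree: the number of edges leaving a node.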

### Utility functions

In [None]:
def get_out_degree_list(graph: Graph) -> List[int]:
    """Return the out-degree of every node, in the graph's node order.

    Args:
        graph: Directed graph exposing ``out_degree`` as (node, degree) pairs.

    Returns:
        One out-degree per node; the node labels are dropped.
    """
    return [entry[1] for entry in graph.out_degree]

In [None]:
def print_out_degree_histogram(
    graph: Graph, color: str, start_from_degree: int=0
) -> None:
    """Plot how many nodes have each out-degree as a bar chart.

    Args:
        graph: Directed graph whose out-degrees are counted.
        color: Matplotlib color used for the bars.
        start_from_degree: Smallest out-degree to include. Bars for lower
            out-degrees are left out, which helps to zoom in on the tail of
            the distribution. The default of 0 keeps every bar.
    """
    degree_list = get_out_degree_list(graph)
    unique_degrees, counts = np.unique(degree_list, return_counts=True)

    # Filter on the degree VALUE. Slicing by position would only be correct
    # if every degree from 0 upwards occurred in the graph; with gaps it
    # drops the wrong bars (or all of them).
    keep = unique_degrees >= start_from_degree

    plt.figure(figsize=(12, 3))
    plt.bar(
        unique_degrees[keep],
        counts[keep],
        color=color
    )
    plt.xlabel("Out-degree")
    plt.ylabel("# of Nodes")

In [None]:
def print_out_degree_rank(
    graph: Graph, fmt: str, stop_at_degree: int=0
) -> None:
    """Plot out-degrees sorted from highest to lowest (a rank plot).

    Args:
        graph: Directed graph whose out-degrees are ranked.
        fmt: Matplotlib format string for the line and markers (e.g. "bo-").
        stop_at_degree: Despite its name, this is a number of ranked entries,
            not a degree value: only the first ``stop_at_degree`` entries of
            the sorted list are plotted. 0 (the default) plots all of them.
    """
    ranked_degrees = get_out_degree_list(graph)
    ranked_degrees.sort(reverse=True)
    if stop_at_degree:
        ranked_degrees = ranked_degrees[:stop_at_degree]

    plt.figure(figsize=(12, 3))
    plt.plot(ranked_degrees, fmt)
    plt.xlabel("Rank")
    plt.ylabel("Out-degree")

In [None]:
def group_nodes_by_out_degree(graph: Graph) -> Dict[int, List[str]]:
    """Group node labels by out-degree, highest out-degree first.

    Args:
        graph: Directed graph exposing ``out_degree`` as (node, degree) pairs.

    Returns:
        A dict mapping each out-degree that occurs in the graph to the sorted
        list of nodes having it. Keys are inserted in descending order, so
        iterating the dict yields the highest out-degrees first. An empty
        graph gives an empty dict.
    """
    # Bucket the nodes in a single pass instead of rescanning the whole
    # graph once per distinct degree.
    nodes_by_degree: Dict[int, List[str]] = {}
    for node, degree in graph.out_degree:
        nodes_by_degree.setdefault(int(degree), []).append(node)

    return {
        degree: sorted(nodes_by_degree[degree])
        for degree in sorted(nodes_by_degree, reverse=True)
    }

In [None]:
def get_highest_out_degree_groups(
    graph: Graph, count: int=10
) -> Dict[int, List[str]]:
    """Return the highest out-degree groups and print their share of the total.

    Also prints the sum of out-degrees covered by the returned groups as a
    percentage of the graph's total out-degree.

    Args:
        graph: Directed graph to analyze.
        count: Maximum number of distinct out-degree values to keep.

    Returns:
        The first ``count`` entries of ``group_nodes_by_out_degree(graph)``,
        i.e. the highest out-degrees mapped to their nodes.
    """
    out_degree_groups = group_nodes_by_out_degree(graph)
    highest_degrees = dict(itertools.islice(out_degree_groups.items(), count))

    degree_sum = sum(get_out_degree_list(graph))
    highest_degree_sum = sum(
        degree * len(nodes) for degree, nodes in highest_degrees.items()
    )
    # A graph without edges has a total out-degree of 0; report 0% rather
    # than failing with ZeroDivisionError.
    if degree_sum:
        highest_total_degree_ratio = highest_degree_sum / degree_sum * 100
    else:
        highest_total_degree_ratio = 0.0
    print(f"Highest/total out-degree ratio: {highest_total_degree_ratio:.0f}%"
          f" ({highest_degree_sum}/{degree_sum})")

    return highest_degrees

### Draw the networks, degree histograms, and ranks

In [None]:
# Small graph 1: network drawing.
draw_network(small_graph_1)

In [None]:
# Out-degree histogram over the whole distribution.
print_out_degree_histogram(small_graph_1, "blue")

In [None]:
# Same histogram with start_from_degree=10, leaving out the low end.
print_out_degree_histogram(small_graph_1, "blue", start_from_degree=10)

In [None]:
# Out-degrees of all nodes, sorted from highest to lowest.
print_out_degree_rank(small_graph_1, "bo-")

In [None]:
# Same rank plot, limited to the first 50 entries.
print_out_degree_rank(small_graph_1, "bo-", stop_at_degree=50)

In [None]:
# Highest out-degree groups (default count); the call itself prints the
# highest/total out-degree ratio before the groups are printed here.
highest_out_degree_groups = get_highest_out_degree_groups(small_graph_1)
print(f"\n{highest_out_degree_groups}")

In [None]:
# Small graph 2: network drawing.
draw_network(small_graph_2)

In [None]:
# Out-degree histogram over the whole distribution.
print_out_degree_histogram(small_graph_2, "red")

In [None]:
# Same histogram with start_from_degree=10, leaving out the low end.
print_out_degree_histogram(small_graph_2, "red", start_from_degree=10)

In [None]:
# Out-degrees of all nodes, sorted from highest to lowest.
print_out_degree_rank(small_graph_2, "ro-")

In [None]:
# Same rank plot, limited to the first 50 entries.
print_out_degree_rank(small_graph_2, "ro-", stop_at_degree=50)

In [None]:
# Highest out-degree groups (default count); the call itself prints the
# highest/total out-degree ratio before the groups are printed here.
highest_out_degree_groups = get_highest_out_degree_groups(small_graph_2)
print(f"\n{highest_out_degree_groups}")

In [None]:
# Medium graph: network drawing.
draw_network(medium_graph)

In [None]:
# Out-degree histogram over the whole distribution.
print_out_degree_histogram(medium_graph, "cyan")

In [None]:
# Same histogram with start_from_degree=10, leaving out the low end.
print_out_degree_histogram(medium_graph, "cyan", start_from_degree=10)

In [None]:
# Out-degrees of all nodes, sorted from highest to lowest.
print_out_degree_rank(medium_graph, "co-")

In [None]:
# Same rank plot, limited to the first 50 entries.
print_out_degree_rank(medium_graph, "co-", stop_at_degree=50)

In [None]:
# Highest out-degree groups (default count); the call itself prints the
# highest/total out-degree ratio before the groups are printed here.
highest_out_degree_groups = get_highest_out_degree_groups(medium_graph)
print(f"\n{highest_out_degree_groups}")

In [None]:
# Large graph: network drawing.
draw_network(large_graph)

In [None]:
# Out-degree histogram over the whole distribution.
print_out_degree_histogram(large_graph, "magenta")

In [None]:
# Same histogram with start_from_degree=50 (higher than for the other
# graphs), leaving out the low end.
print_out_degree_histogram(large_graph, "magenta", start_from_degree=50)

In [None]:
# Out-degrees of all nodes, sorted from highest to lowest.
print_out_degree_rank(large_graph, "mo-")

In [None]:
# Same rank plot, limited to the first 50 entries.
print_out_degree_rank(large_graph, "mo-", stop_at_degree=50)

In [None]:
# Highest out-degree groups (default count); the call itself prints the
# highest/total out-degree ratio before the groups are printed here.
highest_out_degree_groups = get_highest_out_degree_groups(large_graph)
print(f"\n{highest_out_degree_groups}")

## Other metrics

In [None]:
# Network Influencers: betweenness centrality

# Bind `graph` explicitly so this cell runs on a fresh kernel; assign one of
# the other loaded graphs here to analyze it instead.
graph = small_graph_1

bet_cent = nx.betweenness_centrality(graph, normalized=True, endpoints=True)

# Show the labels of the five nodes with the highest betweenness centrality.
sorted(bet_cent, key=bet_cent.get, reverse=True)[:5]