# Social Network Analysis for Data Warehouses

## 1. Import dependencies

In [None]:
from typing import List, Optional

import matplotlib.pyplot as plt
import networkx as nx
from networkx.classes.graph import Graph

## 2. Load the graphs with anonymous table names

_This step is intended to run only once, so it is commented out to avoid misuse._

`nx.convert_node_labels_to_integers()`, used to anonymize the table names, is not guaranteed to assign the same integer value to a given node label every time it runs; thus, the results from the one-time run are persisted in the next step and should be pushed to the Git repo to generate consistent/reproducible research results in later notebook runs.

In [None]:
# prepared_data_folder = "../data/prepared"

# small_graph_1 = nx.convert_node_labels_to_integers(
#     nx.read_edgelist(
#         f"{prepared_data_folder}/prepared-dataset-small.csv",
#         delimiter=",", create_using=nx.DiGraph()),
#     first_label=10001)

# small_graph_2 = nx.convert_node_labels_to_integers(
#     nx.read_edgelist(
#         f"{prepared_data_folder}/prepared-dataset-medium_1.csv",
#         delimiter=",", create_using=nx.DiGraph()),
#     first_label=20001)

# medium_graph = nx.convert_node_labels_to_integers(
#     nx.read_edgelist(
#         f"{prepared_data_folder}/prepared-dataset-medium_2.csv",
#         delimiter=",", create_using=nx.DiGraph()),
#     first_label=300001)

# large_graph = nx.convert_node_labels_to_integers(
#     nx.read_edgelist(
#         f"{prepared_data_folder}/prepared-dataset-large.csv",
#         delimiter=",", create_using=nx.DiGraph()),
#     first_label=4000001)

## 3. Store the anonymized graphs as CSV

_This step is intended to run only once, so it is commented out to avoid misuse._

In [None]:
# anon_data_folder = "../data/anonymized"

# nx.write_edgelist(small_graph_1,
#     f"{anon_data_folder}/anon-dataset-small_1.csv",
#     delimiter=",", data=False)

# nx.write_edgelist(small_graph_2,
#     f"{anon_data_folder}/anon-dataset-small_2.csv",
#     delimiter=",", data=False)

# nx.write_edgelist(medium_graph,
#     f"{anon_data_folder}/anon-dataset-medium.csv",
#     delimiter=",", data=False)

# nx.write_edgelist(large_graph,
#     f"{anon_data_folder}/anon-dataset-large.csv",
#     delimiter=",", data=False)

## 4. Global utility functions

In [None]:
def load_graph_from_csv(file: str) -> Graph:
    """Read a comma-delimited edge list into a directed graph.

    Parameters
    ----------
    file: str
       Path of the edge-list file to read.

    Returns
    -------
    graph : Networkx graph
       The ``nx.DiGraph`` built by ``nx.read_edgelist``.
    """
    reader_options = {"delimiter": ",", "create_using": nx.DiGraph}
    return nx.read_edgelist(file, **reader_options)

In [None]:
def format_graph_info(graph_id: str, graph: Graph) -> str:
    """Build a multi-line, human-readable summary of a graph.

    Parameters
    ----------
    graph_id: str
       Label identifying the graph; it is upper-cased in the header line.
    graph: Networkx graph
       A graph

    Returns
    -------
    info : str
       A header line followed by the number of nodes, the number of edges
       and the average clustering coefficient, one per line.
    """
    header = f"{graph_id.upper()} GRAPH INFO:"
    node_count = nx.number_of_nodes(graph)
    edge_count = nx.number_of_edges(graph)
    avg_clustering = nx.average_clustering(graph)
    summary_lines = [
        header,
        f"  Number of nodes: {node_count}",
        f"  Number of edges: {edge_count}",
        f"  Average clustering coefficient: {avg_clustering}",
    ]
    return "\n".join(summary_lines)

In [None]:
def draw_network(graph: Graph, seed: Optional[int] = None) -> None:
    """Draw a graph with a spring layout on a new 12x8 figure.

    Nodes are colored by their degree and sized by their betweenness
    centrality; node labels and the axes are hidden.

    Parameters
    ----------
    graph: Networkx graph
       A graph
    seed: int, optional
       Forwarded to ``nx.spring_layout``. Pass a fixed integer to get the
       same node positions on every run. The default (``None``) keeps the
       previous behavior, where the layout is not fixed between calls.
    """
    pos = nx.spring_layout(graph, seed=seed)
    bet_cent = nx.betweenness_centrality(graph, normalized=True, endpoints=True)
    # Build both per-node sequences by iterating the graph itself so colors
    # and sizes are aligned with the node order by construction.
    node_color = [20000 * graph.degree(node_id) for node_id in graph]
    node_size = [bet_cent[node_id] * 1000000 for node_id in graph]
    plt.figure(figsize=(12, 8))
    nx.draw_networkx(graph, pos=pos, with_labels=False,
                     edge_color="dimgray",
                     node_color=node_color,
                     node_size=node_size)
    plt.axis('off')

## 5. Load the anonymized graphs from CSV

In [None]:
# Folder holding the anonymized edge lists, relative to this notebook.
anon_data_folder = "../data/anonymized"

# Load each anonymized graph and print its summary right after loading.
small_graph_1 = load_graph_from_csv(
    f"{anon_data_folder}/anon-dataset-small_1.csv"
)
print(format_graph_info("small(1)", small_graph_1), end="\n\n")

small_graph_2 = load_graph_from_csv(
    f"{anon_data_folder}/anon-dataset-small_2.csv"
)
print(format_graph_info("small(2)", small_graph_2), end="\n\n")

medium_graph = load_graph_from_csv(
    f"{anon_data_folder}/anon-dataset-medium.csv"
)
print(format_graph_info("medium", medium_graph), end="\n\n")

large_graph = load_graph_from_csv(
    f"{anon_data_folder}/anon-dataset-large.csv"
)
print(format_graph_info("large", large_graph))

## Draw the initial networks

In [None]:
# Plot the complete small(1) graph as loaded from CSV, before any filtering.
draw_network(small_graph_1)

In [None]:
# Plot the complete small(2) graph as loaded from CSV, before any filtering.
draw_network(small_graph_2)

## Network Connectivity Analysis: Degree

The degree of a node is the number of connections (edges) it has.

### Utility functions

In [None]:
def get_out_degree_greater_than_zero_subgraph(graph: Graph) -> Graph:
    """Return the subgraph of the nodes that have at least one outgoing edge.

    Parameters
    ----------
    graph: Networkx graph
       A graph exposing ``out_degree()`` and ``subgraph()``.

    Returns
    -------
    subgraph : Networkx graph
       The result of ``graph.subgraph`` for the nodes whose out-degree in
       ``graph`` is greater than zero.

    Notes
    -----
    NOTE(review): out-degrees inside the returned subgraph are presumably
    recomputed over the retained nodes only, so a retained node may show an
    out-degree of 0 there -- confirm this is intended.
    """
    nodes_with_outgoing_edges = []
    for node, outgoing_count in graph.out_degree():
        if outgoing_count > 0:
            nodes_with_outgoing_edges.append(node)
    return graph.subgraph(nodes_with_outgoing_edges)

In [None]:
def make_degree_histogram(
        graph: Graph, in_degree: bool=False, out_degree: bool=False
    ) -> List:
    """Return a list of the frequency of each degree value.

    Parameters
    ----------
    graph: Networkx graph
       A graph
    in_degree: bool
       Count in-degrees. Takes precedence over ``out_degree`` when both
       flags are set.
    out_degree : bool
       Count out-degrees. When neither flag is set, the total degree
       reported by ``graph.degree()`` is used.

    Returns
    -------
    histogram : list
       A list of frequencies of degrees.
       The degree values are the index in the list.
       An empty list is returned for a graph without nodes.

    Notes
    -----
    Note: the bins are width one, hence len(list) can be large
    (Order(number_of_edges))
    """
    # Select the degree view once; the boolean flags are left untouched
    # instead of being rebound to dictionaries.
    if in_degree:
        degree_view = graph.in_degree()
    elif out_degree:
        degree_view = graph.out_degree()
    else:
        degree_view = graph.degree()
    degree_list = [degree for _, degree in degree_view]

    # max() raises ValueError on an empty sequence, so a graph without
    # nodes is answered with an empty histogram instead.
    if not degree_list:
        return []

    histogram = [0] * (max(degree_list) + 1)
    for degree in degree_list:
        histogram[degree] += 1
    return histogram

In [None]:
def print_out_degree_histogram(graph: Graph) -> None:
    """Print and plot the out-degree frequencies of a graph for degrees >= 1.

    Parameters
    ----------
    graph: Networkx graph
       A graph
    """
    out_degree_histogram = make_degree_histogram(graph, out_degree=True)
    # The degree-0 bin is left out of both the printed list and the plot.
    positive_degree_counts = out_degree_histogram[1:]
    print(positive_degree_counts)

    plt.figure(figsize=(12, 4))
    # The first remaining bin belongs to degree 1, so the x values start at 1
    # to keep every point aligned with the degree it counts.
    plt.plot(
        range(1, len(positive_degree_counts) + 1),
        positive_degree_counts,
        "bo-",
    )

    plt.xlabel("Degree")
    plt.ylabel("Frequency")

### Out-degree > 0 subgraphs

In [None]:
# For each loaded graph, keep only the nodes with at least one outgoing edge
# and print the summary of the resulting subgraph.
small_graph_out_degrees_1 = get_out_degree_greater_than_zero_subgraph(
    small_graph_1
)
print(format_graph_info("small(1)", small_graph_out_degrees_1), end="\n\n")

small_graph_out_degrees_2 = get_out_degree_greater_than_zero_subgraph(
    small_graph_2
)
print(format_graph_info("small(2)", small_graph_out_degrees_2), end="\n\n")

medium_graph_out_degrees = get_out_degree_greater_than_zero_subgraph(
    medium_graph
)
print(format_graph_info("medium", medium_graph_out_degrees), end="\n\n")

large_graph_out_degrees = get_out_degree_greater_than_zero_subgraph(
    large_graph
)
print(format_graph_info("large", large_graph_out_degrees))

### Draw the networks and degree histograms

In [None]:
# Plot the small(1) subgraph of nodes that have at least one outgoing edge.
draw_network(small_graph_out_degrees_1)

In [None]:
# Print and plot the out-degree frequencies (degree-0 bin excluded) of the
# small(1) subgraph.
print_out_degree_histogram(small_graph_out_degrees_1)

In [None]:
# Plot the small(2) subgraph of nodes that have at least one outgoing edge.
draw_network(small_graph_out_degrees_2)

In [None]:
# Print and plot the out-degree frequencies (degree-0 bin excluded) of the
# small(2) subgraph.
print_out_degree_histogram(small_graph_out_degrees_2)

In [None]:
# Plot the medium subgraph of nodes that have at least one outgoing edge.
draw_network(medium_graph_out_degrees)

In [None]:
# Print and plot the out-degree frequencies (degree-0 bin excluded) of the
# medium subgraph.
print_out_degree_histogram(medium_graph_out_degrees)

In [None]:
# Plot the large subgraph of nodes that have at least one outgoing edge.
# NOTE(review): draw_network computes betweenness centrality for every node,
# which may be slow on the large dataset -- confirm the runtime is acceptable.
draw_network(large_graph_out_degrees)

In [None]:
# Print and plot the out-degree frequencies (degree-0 bin excluded) of the
# large subgraph.
print_out_degree_histogram(large_graph_out_degrees)

## Other metrics

In [None]:
# Network Influencers: betweenness centrality

# NOTE(review): this cell used a global named `graph` that is never defined
# in the notebook (it only exists as a function parameter), so it raised a
# NameError. small_graph_1 is assumed here -- confirm which graph should be
# analyzed.
graph = small_graph_1

bet_cent = nx.betweenness_centrality(graph, normalized=True, endpoints=True)

# Show the labels of the five nodes with the highest betweenness centrality
# (as the last expression of the cell, the list is the cell's output).
sorted(bet_cent, key=bet_cent.get, reverse=True)[:5]