# Social Network Analysis for Data Warehouses

## 1. Import dependencies

In [None]:
import matplotlib.pyplot as plt
import networkx as nx

## 2. Load the graphs with anonymous table names

In [None]:
def _load_relabelled_digraph(csv_path, first_label):
    """Read a comma-separated edge list as a directed graph whose nodes are
    relabelled to consecutive integers starting at ``first_label``."""
    named_graph = nx.read_edgelist(
        csv_path, delimiter=',', create_using=nx.DiGraph()
    )
    return nx.convert_node_labels_to_integers(
        named_graph, first_label=first_label
    )


# Every dataset gets its own starting label (10001, 20001, 300001, 4000001).
# NOTE(review): `small_2_graph` is read from the "medium_1" file and
# `medium_graph` from the "medium_2" file -- confirm this naming is intended.
small_1_graph = _load_relabelled_digraph(
    "../data/prepared/prepared-dataset-small.csv", 10001
)

small_2_graph = _load_relabelled_digraph(
    "../data/prepared/prepared-dataset-medium_1.csv", 20001
)

medium_graph = _load_relabelled_digraph(
    "../data/prepared/prepared-dataset-medium_2.csv", 300001
)

large_graph = _load_relabelled_digraph(
    "../data/prepared/prepared-dataset-large.csv", 4000001
)

## 3. Optionally store the anonymized datasets as CSV

In [None]:
# Write each relabelled graph as a bare comma-separated edge list
# (data=False leaves the edge attributes out of the file).
_anonymized_exports = (
    (small_1_graph, "../data/anonymized/anon-dataset-small_1.csv"),
    (small_2_graph, "../data/anonymized/anon-dataset-small_2.csv"),
    (medium_graph, "../data/anonymized/anon-dataset-medium.csv"),
    (large_graph, "../data/anonymized/anon-dataset-large.csv"),
)
for _export_graph, _export_path in _anonymized_exports:
    nx.write_edgelist(
        _export_graph, _export_path, delimiter=',', data=False
    )

## 4. Nodes and edges info

In [None]:
# Report the node and edge counts of every loaded dataset.
# The third tuple item is the text appended after the edge count: the first
# three reports end with an extra blank line, the last one does not.
_size_reports = (
    ("SMALL(1) DATASET INFO:\n", small_1_graph, "\n"),
    ("SMALL(2) DATASET INFO:\n", small_2_graph, "\n"),
    ("MEDIUM DATASET INFO:\n", medium_graph, "\n"),
    ("LARGE DATASET INFO:\n", large_graph, ""),
)
for _size_header, _size_graph, _size_tail in _size_reports:
    print(
        _size_header
        + f"  Number of nodes: {_size_graph.number_of_nodes()}\n"
        + f"  Number of edges: {_size_graph.number_of_edges()}"
        + _size_tail
    )

## Network connectivity: clustering coefficient

In [None]:
# Network connectivity: clustering coefficients

# The third tuple item is the text appended after the coefficient: the first
# three reports end with an extra blank line, the last one does not.
_clustering_reports = (
    ("SMALL(1) DATASET INFO:\n", small_1_graph, "\n"),
    ("SMALL(2) DATASET INFO:\n", small_2_graph, "\n"),
    ("MEDIUM DATASET INFO:\n", medium_graph, "\n"),
    ("LARGE DATASET INFO:\n", large_graph, ""),
)
for _cc_header, _cc_graph, _cc_tail in _clustering_reports:
    print(
        _cc_header
        + "  Average clustering coefficient:"
        + f" {nx.average_clustering(_cc_graph)}"
        + _cc_tail
    )

## Network connectivity: degree

The degree of a node is the number of connections (edges) that the node has.

### Degree histograms

In [None]:
def make_degree_histogram(graph, in_degree=False, out_degree=False):
    """Return a list of the frequency of each degree value.

    Parameters
    ----------
    graph: Networkx graph
       A graph
    in_degree: bool
       Count in-degrees. Takes precedence over ``out_degree`` when both
       flags are set.
    out_degree : bool
       Count out-degrees. Only consulted when ``in_degree`` is false.
       When neither flag is set, the values of ``graph.degree()`` are used.

    Returns
    -------
    histogram : list
       A list of frequencies of degrees.
       The degree values are the index in the list.
       An empty list is returned for a graph without nodes.

    Notes
    -----
    Note: the bins are width one, hence len(list) can be large
    (Order(number_of_edges))
    """
    # Select the (node, degree) pairs once; the boolean flags are left
    # untouched instead of being rebound to dictionaries.
    if in_degree:
        degree_pairs = graph.in_degree()
    elif out_degree:
        degree_pairs = graph.out_degree()
    else:
        degree_pairs = graph.degree()

    degree_list = [degree for _, degree in degree_pairs]
    if not degree_list:
        # max() of an empty sequence raises ValueError; a graph without
        # nodes simply has an empty histogram.
        return []

    histogram = [0] * (max(degree_list) + 1)
    for degree in degree_list:
        histogram[degree] += 1
    return histogram

In [None]:
def print_out_degree_histogram(graph):
    """Print and plot the out-degree distribution of ``graph``.

    The degree-0 bin is left out of both the printed list and the plot.

    Parameters
    ----------
    graph: Networkx graph
       A directed graph
    """
    out_degree_histogram = make_degree_histogram(graph, out_degree=True)
    # Drop the degree-0 bin once and reuse the slice.
    frequencies = out_degree_histogram[1:]
    print(frequencies)

    # frequencies[0] belongs to degree 1, so the x values must start at 1
    # for the "Degree" axis to show the real degree of each point.
    degrees = range(1, len(frequencies) + 1)

    plt.figure(figsize=(12, 4))
    plt.plot(degrees, frequencies, "bo-")

    plt.xlabel("Degree")
    plt.ylabel("Frequency")

In [None]:
# Out-degree distribution of the first small dataset.
print_out_degree_histogram(small_1_graph)

In [None]:
# Out-degree distribution of the second small dataset.
print_out_degree_histogram(small_2_graph)

In [None]:
# Out-degree distribution of the medium dataset.
print_out_degree_histogram(medium_graph)

In [None]:
# Out-degree distribution of the large dataset.
print_out_degree_histogram(large_graph)

## Draw the networks

In [None]:
def draw_network(graph):
    """Draw ``graph`` with a spring layout, without labels or axes.

    Node sizes are proportional to betweenness centrality and node colours
    are derived from node degree.

    Parameters
    ----------
    graph: Networkx graph
       A graph
    """
    layout = nx.spring_layout(graph)
    centrality = nx.betweenness_centrality(
        graph, normalized=True, endpoints=True
    )
    degree_of = dict(graph.degree())

    # Build both per-node lists in the graph's own iteration order, which is
    # the order in which the nodes are drawn.
    colours = [20000 * degree_of[node_id] for node_id in graph]
    sizes = [centrality[node_id] * 1000000 for node_id in graph]

    plt.figure(figsize=(12, 8))
    nx.draw_networkx(
        graph,
        pos=layout,
        with_labels=False,
        edge_color="dimgray",
        node_color=colours,
        node_size=sizes,
    )
    plt.axis('off')

In [None]:
# Draw the complete first small dataset.
draw_network(small_1_graph)

In [None]:
# Draw the complete second small dataset.
draw_network(small_2_graph)

## Subgraphs

### Out-degree > 0

In [None]:
def get_out_degree_greater_than_zero_subgrapgh(graph):
    """Return the subgraph induced by the nodes with at least one outgoing edge.

    Parameters
    ----------
    graph: Networkx graph
       A directed graph

    Returns
    -------
    subgraph
       Whatever ``graph.subgraph`` returns for the retained nodes.
    """
    nodes_with_successors = []
    for node, node_out_degree in graph.out_degree():
        if node_out_degree > 0:
            nodes_with_successors.append(node)
    return graph.subgraph(nodes_with_successors)

In [None]:
# Keep only the nodes that have at least one outgoing edge.
small_1_graph_out_degrees = get_out_degree_greater_than_zero_subgrapgh(
    small_1_graph
)
small_2_graph_out_degrees = get_out_degree_greater_than_zero_subgrapgh(
    small_2_graph
)
medium_graph_out_degrees = get_out_degree_greater_than_zero_subgrapgh(
    medium_graph
)
large_graph_out_degrees = get_out_degree_greater_than_zero_subgrapgh(
    large_graph
)

_pruned_datasets = (
    ("SMALL(1) DATASET INFO:\n", small_1_graph_out_degrees),
    ("SMALL(2) DATASET INFO:\n", small_2_graph_out_degrees),
    ("MEDIUM DATASET INFO:\n", medium_graph_out_degrees),
    ("LARGE DATASET INFO:\n", large_graph_out_degrees),
)

# Node and edge counts of the subgraphs; every report ends with a blank line.
for _pruned_header, _pruned_graph in _pruned_datasets:
    print(
        _pruned_header
        + f"  Number of nodes: {_pruned_graph.number_of_nodes()}\n"
        + f"  Number of edges: {_pruned_graph.number_of_edges()}\n"
    )

# Network connectivity: clustering coefficients

# Only the reports before the last one are followed by a blank line.
_pruned_last = len(_pruned_datasets) - 1
for _pruned_index, (_pruned_header, _pruned_graph) in enumerate(
    _pruned_datasets
):
    _pruned_tail = "" if _pruned_index == _pruned_last else "\n"
    print(
        _pruned_header
        + "  Average clustering coefficient:"
        + f" {nx.average_clustering(_pruned_graph)}"
        + _pruned_tail
    )

In [None]:
# Recursive option: apply the out-degree filter a second time, to the
# subgraphs produced by the first pass.
# The variables are rebound in place, so every re-run of this cell filters
# the current subgraphs once more.

small_1_graph_out_degrees = get_out_degree_greater_than_zero_subgrapgh(
    small_1_graph_out_degrees
)
small_2_graph_out_degrees = get_out_degree_greater_than_zero_subgrapgh(
    small_2_graph_out_degrees
)
medium_graph_out_degrees = get_out_degree_greater_than_zero_subgrapgh(
    medium_graph_out_degrees
)
large_graph_out_degrees = get_out_degree_greater_than_zero_subgrapgh(
    large_graph_out_degrees
)

_repruned_datasets = (
    ("SMALL(1) DATASET INFO:\n", small_1_graph_out_degrees),
    ("SMALL(2) DATASET INFO:\n", small_2_graph_out_degrees),
    ("MEDIUM DATASET INFO:\n", medium_graph_out_degrees),
    ("LARGE DATASET INFO:\n", large_graph_out_degrees),
)

# Node and edge counts of the subgraphs; every report ends with a blank line.
for _repruned_header, _repruned_graph in _repruned_datasets:
    print(
        _repruned_header
        + f"  Number of nodes: {_repruned_graph.number_of_nodes()}\n"
        + f"  Number of edges: {_repruned_graph.number_of_edges()}\n"
    )

# Network connectivity: clustering coefficients

# Only the reports before the last one are followed by a blank line.
_repruned_last = len(_repruned_datasets) - 1
for _repruned_index, (_repruned_header, _repruned_graph) in enumerate(
    _repruned_datasets
):
    _repruned_tail = "" if _repruned_index == _repruned_last else "\n"
    print(
        _repruned_header
        + "  Average clustering coefficient:"
        + f" {nx.average_clustering(_repruned_graph)}"
        + _repruned_tail
    )

In [None]:
# Out-degree distribution of the filtered first small dataset.
print_out_degree_histogram(small_1_graph_out_degrees)

In [None]:
# Out-degree distribution of the filtered second small dataset.
print_out_degree_histogram(small_2_graph_out_degrees)

In [None]:
# Out-degree distribution of the filtered medium dataset.
print_out_degree_histogram(medium_graph_out_degrees)

In [None]:
# Out-degree distribution of the filtered large dataset.
print_out_degree_histogram(large_graph_out_degrees)

In [None]:
# Draw the filtered first small dataset.
draw_network(small_1_graph_out_degrees)

In [None]:
# Draw the filtered second small dataset.
draw_network(small_2_graph_out_degrees)

In [None]:
# Draw the filtered medium dataset.
# NOTE(review): draw_network computes betweenness centrality for every node;
# presumably this is noticeably slower here than on the small datasets.
draw_network(medium_graph_out_degrees)

In [None]:
# Draw the filtered large dataset.
# NOTE(review): draw_network computes betweenness centrality for every node;
# presumably this is the slowest cell of the notebook -- confirm it is needed.
draw_network(large_graph_out_degrees)

## Other metrics

In [None]:
# Network Influencers: betweenness centrality

# `graph` is only ever a function parameter in this notebook and was never
# bound at notebook level, so this cell raised NameError on a clean run.
# Bind the dataset to analyse explicitly; swap in any other loaded graph.
# NOTE(review): small_1_graph is an assumed default -- confirm the dataset.
graph = small_1_graph

bet_cent = nx.betweenness_centrality(graph, normalized=True, endpoints=True)

# Labels of the five nodes with the highest betweenness centrality,
# highest first.
sorted(bet_cent, key=bet_cent.get, reverse=True)[:5]