# Social Network Analysis for Data Warehouses

## 1. Import dependencies

In [None]:
import matplotlib.pyplot as plt
import networkx as nx

## 2. Load the graphs with anonymous table names

In [None]:
def _load_relabelled_digraph(csv_path, first_label):
    """Read a comma-separated edge list as a DiGraph with integer node labels.

    The original node labels are replaced by consecutive integers that
    start at ``first_label``.
    """
    directed = nx.read_edgelist(
        csv_path, delimiter=',', create_using=nx.DiGraph())
    return nx.convert_node_labels_to_integers(directed, first_label=first_label)


# Each dataset gets its own starting label (10001, 20001, 300001, 4000001).
# NOTE(review): small_2_graph and medium_graph are read from the "medium_1"
# and "medium_2" files respectively -- confirm that this naming is intended.
small_1_graph = _load_relabelled_digraph(
    "../data/prepared/prepared-dataset-small.csv", 10001)

small_2_graph = _load_relabelled_digraph(
    "../data/prepared/prepared-dataset-medium_1.csv", 20001)

medium_graph = _load_relabelled_digraph(
    "../data/prepared/prepared-dataset-medium_2.csv", 300001)

large_graph = _load_relabelled_digraph(
    "../data/prepared/prepared-dataset-large.csv", 4000001)

## 3. Optionally store the anonymized datasets as CSV

In [None]:
# Destination CSV path -> relabelled graph to export.
anonymized_exports = {
    "../data/anonymized/anon-dataset-small_1.csv": small_1_graph,
    "../data/anonymized/anon-dataset-small_2.csv": small_2_graph,
    "../data/anonymized/anon-dataset-medium.csv": medium_graph,
    "../data/anonymized/anon-dataset-large.csv": large_graph,
}

# Write plain "source,target" rows; data=False leaves edge attributes out.
for export_path, export_graph in anonymized_exports.items():
    nx.write_edgelist(export_graph, export_path, delimiter=',', data=False)

## 4. Nodes and edges info

In [None]:
# Heading label -> graph, in the order the summaries are printed.
size_report_targets = (
    ("SMALL(1)", small_1_graph),
    ("SMALL(2)", small_2_graph),
    ("MEDIUM", medium_graph),
    ("LARGE", large_graph),
)

for report_index, (dataset_label, dataset_graph) in enumerate(size_report_targets):
    # A blank line separates consecutive summaries; none follows the last one.
    if report_index:
        print()
    print(
        f"{dataset_label} DATASET INFO:\n"
        f"  Number of nodes: {dataset_graph.number_of_nodes()}\n"
        f"  Number of edges: {dataset_graph.number_of_edges()}"
    )

## Network connectivity: Clustering Coefficients

In [None]:
# Network connectivity: average clustering coefficient of every dataset.

clustering_report_targets = (
    ("SMALL(1)", small_1_graph),
    ("SMALL(2)", small_2_graph),
    ("MEDIUM", medium_graph),
    ("LARGE", large_graph),
)

for report_index, (dataset_label, dataset_graph) in enumerate(clustering_report_targets):
    # A blank line separates consecutive summaries; none follows the last one.
    if report_index:
        print()
    avg_clustering = nx.average_clustering(dataset_graph)
    print(
        f"{dataset_label} DATASET INFO:\n"
        f"  Average clustering coefficient: {avg_clustering}"
    )

## Degree histograms

In [None]:
def degree_histogram_directed(graph, in_degree=False, out_degree=False):
    """Return a list of the frequency of each degree value.

    Parameters
    ----------
    graph : NetworkX directed graph
        Must provide ``in_degree()``, ``out_degree()`` and ``degree()``,
        each yielding ``(node, degree)`` pairs.
    in_degree : bool
        If true, count in-degrees. Takes precedence over ``out_degree``.
    out_degree : bool
        If true (and ``in_degree`` is false), count out-degrees.
        When both flags are false the total degree is counted.

    Returns
    -------
    hist : list
        A list of frequencies of degrees.
        The degree values are the index in the list.
        An empty list is returned for a graph without nodes.

    Notes
    -----
    The bins are width one, hence len(list) can be large
    (Order(number_of_edges)).
    """
    # Select the degree view that matches the requested flavour.
    if in_degree:
        degree_view = graph.in_degree()
    elif out_degree:
        degree_view = graph.out_degree()
    else:
        degree_view = graph.degree()

    degseq = [degree for _node, degree in degree_view]
    if not degseq:
        # max() of an empty sequence would raise; no nodes means no bins.
        return []

    freq = [0] * (max(degseq) + 1)
    for degree in degseq:
        freq[degree] += 1
    return freq

In [None]:
graph = small_1_graph

in_degree_freq = degree_histogram_directed(graph, in_degree=True)
out_degree_freq = degree_histogram_directed(graph, out_degree=True)

# Index 0 (the count of nodes with degree zero) is left out of both the
# printout and the plot.
print(in_degree_freq[1:])
print(out_degree_freq[1:])

plt.figure(figsize=(12, 4))
# The slices drop index 0, so the x values start at 1; this draws every
# frequency at its actual degree instead of one position to the left.
plt.plot(range(1, len(in_degree_freq)), in_degree_freq[1:], "go-",
         label="in-degree")
plt.plot(range(1, len(out_degree_freq)), out_degree_freq[1:], "bo-",
         label="out-degree")
plt.legend()

plt.xlabel("Degree")
plt.ylabel("Frequency")

In [None]:
graph = small_2_graph

in_degree_freq = degree_histogram_directed(graph, in_degree=True)
out_degree_freq = degree_histogram_directed(graph, out_degree=True)

# Index 0 (the count of nodes with degree zero) is left out of both the
# printout and the plot.
print(in_degree_freq[1:])
print(out_degree_freq[1:])

plt.figure(figsize=(12, 4))
# The slices drop index 0, so the x values start at 1; this draws every
# frequency at its actual degree instead of one position to the left.
plt.plot(range(1, len(in_degree_freq)), in_degree_freq[1:], "go-",
         label="in-degree")
plt.plot(range(1, len(out_degree_freq)), out_degree_freq[1:], "bo-",
         label="out-degree")
plt.legend()

plt.xlabel("Degree")
plt.ylabel("Frequency")

In [None]:
graph = medium_graph

in_degree_freq = degree_histogram_directed(graph, in_degree=True)
out_degree_freq = degree_histogram_directed(graph, out_degree=True)

# Index 0 (the count of nodes with degree zero) is left out of both the
# printout and the plot.
print(in_degree_freq[1:])
print(out_degree_freq[1:])

plt.figure(figsize=(12, 4))
# The slices drop index 0, so the x values start at 1; this draws every
# frequency at its actual degree instead of one position to the left.
plt.plot(range(1, len(in_degree_freq)), in_degree_freq[1:], "go-",
         label="in-degree")
plt.plot(range(1, len(out_degree_freq)), out_degree_freq[1:], "bo-",
         label="out-degree")
plt.legend()

plt.xlabel("Degree")
plt.ylabel("Frequency")

In [None]:
graph = large_graph

in_degree_freq = degree_histogram_directed(graph, in_degree=True)
out_degree_freq = degree_histogram_directed(graph, out_degree=True)

# Index 0 (the count of nodes with degree zero) is left out of both the
# printout and the plot.
print(in_degree_freq[1:])
print(out_degree_freq[1:])

plt.figure(figsize=(12, 4))
# The slices drop index 0, so the x values start at 1; this draws every
# frequency at its actual degree instead of one position to the left.
plt.plot(range(1, len(in_degree_freq)), in_degree_freq[1:], "go-",
         label="in-degree")
plt.plot(range(1, len(out_degree_freq)), out_degree_freq[1:], "bo-",
         label="out-degree")
plt.legend()

plt.xlabel("Degree")
plt.ylabel("Frequency")

## Draw the networks

In [None]:
graph = small_1_graph

pos = nx.spring_layout(graph)
bet_cent = nx.betweenness_centrality(graph, normalized=True, endpoints=True)

# Colour follows the node degree, marker size follows betweenness centrality;
# both lists are built in the graph's node iteration order.
node_color = [20000 * degree for _node, degree in graph.degree()]
node_size = [bet_cent[node] * 1000000 for node in graph]

draw_options = {
    "with_labels": False,
    "edge_color": "dimgray",
    "node_color": node_color,
    "node_size": node_size,
}
plt.figure(figsize=(12, 8))
nx.draw_networkx(graph, pos=pos, **draw_options)
plt.axis('off')

In [None]:
graph = small_2_graph

pos = nx.spring_layout(graph)
bet_cent = nx.betweenness_centrality(graph, normalized=True, endpoints=True)

# Colour follows the node degree, marker size follows betweenness centrality;
# both lists are built in the graph's node iteration order.
node_color = [20000 * degree for _node, degree in graph.degree()]
node_size = [bet_cent[node] * 1000000 for node in graph]

draw_options = {
    "with_labels": False,
    "edge_color": "dimgray",
    "node_color": node_color,
    "node_size": node_size,
}
plt.figure(figsize=(12, 8))
nx.draw_networkx(graph, pos=pos, **draw_options)
plt.axis('off')

## Other metrics

In [None]:
# Network Connectivity: degree

# The degree of a node is the number of connections the node has.
# nx.degree(graph, node) returns that number for a single node.

# node_id is not assigned anywhere at notebook level (a comprehension
# variable does not leak out of its comprehension), so pick an example
# node here: the first node in the graph's iteration order.
node_id = next(iter(graph))

nx.degree(graph, node_id)

In [None]:
# Network Influencers: betweenness centrality

bet_cent = nx.betweenness_centrality(graph, normalized=True, endpoints=True)

# Order the (node, score) pairs by score, highest first, and show the labels
# of the five best-ranked nodes.
ranked_nodes = sorted(bet_cent.items(), key=lambda item: item[1], reverse=True)
[node for node, _score in ranked_nodes[:5]]