# Social Network Analysis applied to Data Warehouses

## 4. Network Connectivity Analysis: Clustering

The **Clustering Coefficient** measures the prevalence of triangles in an egocentric network. The Clustering Coefficient is the fraction of possible triangles through the ego node that actually exist.

### 4.1. Import dependencies

In [None]:
from typing import Dict, List

import matplotlib.pyplot as plt
import networkx as nx
from networkx.classes.graph import Graph
import pandas as pd
from pandas import Series

### 4.2. Utility functions

In [None]:
def load_graph_from_csv(file: str) -> Graph:
    """Read a comma-separated edge list into a directed graph.

    Args:
        file: Location of the edge-list file; passed unchanged to
            ``nx.read_edgelist``.

    Returns:
        The ``nx.DiGraph`` produced by ``nx.read_edgelist``.
    """
    directed_graph = nx.read_edgelist(file, delimiter=",", create_using=nx.DiGraph)
    return directed_graph

In [None]:
def format_graph_info(graph_id: str, graph: Graph) -> str:
    """Build a multi-line text summary of ``graph``.

    Args:
        graph_id: Label for the graph; it is upper-cased in the header line.
        graph: Graph whose metrics are reported.

    Returns:
        A header line followed by one indented line each for the node count,
        edge count, density, average clustering coefficient and transitivity,
        all computed through networkx. There is no trailing newline.
    """
    summary_lines = [
        f"{graph_id.upper()} GRAPH INFO:",
        f"  Number of nodes: {nx.number_of_nodes(graph)}",
        f"  Number of edges: {nx.number_of_edges(graph)}",
        f"  Density: {nx.density(graph)}",
        f"  Average clustering coefficient: {nx.average_clustering(graph)}",
        f"  Transitivity: {nx.transitivity(graph)}",
    ]
    return "\n".join(summary_lines)

In [None]:
def get_clustering(graph: Graph) -> Dict[str, float]:
    """Compute the clustering coefficient of every node in ``graph``.

    Args:
        graph: Graph to analyse.

    Returns:
        The result of ``nx.clustering(graph)``, annotated here as a mapping
        from node to clustering coefficient.
    """
    coefficients_by_node = nx.clustering(graph)
    return coefficients_by_node

In [None]:
def get_clustering_list(clustering: Dict[str, float]) -> List[float]:
    """Return the clustering coefficients as a plain list.

    Args:
        clustering: Mapping from node to clustering coefficient.

    Returns:
        The mapping's values, in the mapping's own iteration order; the node
        keys are dropped.
    """
    return list(clustering.values())

In [None]:
def get_clustering_series(clustering: Dict[str, float]) -> Series:
    """Wrap the clustering coefficients in a pandas ``Series``.

    Args:
        clustering: Mapping from node to clustering coefficient.

    Returns:
        A ``Series`` built from the list of coefficients, so it carries the
        default positional index rather than the node keys.
    """
    coefficients = get_clustering_list(clustering)
    return pd.Series(coefficients)

In [None]:
def draw_clustering_based_network(graph: Graph, clustering: Dict[str, float]) -> None:
    """Draw ``graph`` with every node sized and coloured by its own clustering coefficient.

    The graph is laid out with ``nx.spring_layout`` on a new 12x8 figure,
    without node labels and with the axes hidden. The largest coefficient is
    drawn at a node size of 3500 and the others are scaled proportionally.

    Args:
        graph: Graph to draw.
        clustering: Clustering coefficient of each node of ``graph``, keyed by
            node.

    Raises:
        KeyError: If a node of ``graph`` has no entry in ``clustering``.
    """
    max_node_size = 3500

    # networkx matches node_size / node_color positionally against the graph's
    # node order, so the coefficients are read in that order and never sorted;
    # sorting would hand each node some other node's value.
    coefficients = [clustering[node] for node in graph.nodes()]
    non_zero_coefficients = [coef for coef in coefficients if coef > 0]

    if non_zero_coefficients:
        scaling_factor = max_node_size / max(non_zero_coefficients)
        # A zero coefficient would give a zero-sized node; draw those at a
        # tenth of the smallest non-zero coefficient instead.
        zero_substitute = min(non_zero_coefficients) / 10
        normalized_node_params = [
            (coef or zero_substitute) * scaling_factor for coef in coefficients
        ]
    else:
        # Every coefficient is zero (or the graph is empty): there is nothing
        # to scale against, so all nodes get the same modest size rather than
        # failing on an empty search or a division by zero.
        normalized_node_params = [max_node_size / 10] * len(coefficients)

    pos = nx.spring_layout(graph)
    plt.figure(figsize=(12, 8))
    nx.draw_networkx(
        graph,
        pos=pos,
        with_labels=False,
        edge_color="dimgray",
        node_color=normalized_node_params,
        node_size=normalized_node_params,
    )
    plt.axis("off")

In [None]:
def plot_clustering_descriptive_stats(clustering: Dict[str, float], color: str) -> None:
    """Draw a horizontal box plot of the clustering coefficients.

    The plot goes on a new 12x3 figure with the x axis labelled "Clustering".

    Args:
        clustering: Mapping from node to clustering coefficient.
        color: Face colour applied to the outlier ("flier") markers.
    """
    coefficients = get_clustering_list(clustering)
    outlier_style = {"markerfacecolor": color}

    plt.figure(figsize=(12, 3))
    plt.boxplot(coefficients, vert=False, flierprops=outlier_style)
    plt.xlabel("Clustering")

In [None]:
def plot_clustering_ranking(
    clustering: Dict[str, float], fmt: str, stop_at: float = 0
) -> None:
    """Plot the clustering coefficients ranked from highest to lowest.

    The plot goes on a new 12x3 figure with "Rank" on the x axis and
    "Clustering" on the y axis.

    Args:
        clustering: Mapping from node to clustering coefficient.
        fmt: Matplotlib format string for the line and markers (e.g. "bo-").
        stop_at: When non-zero, the ranking is cut after the last coefficient
            equal to this value, so only coefficients greater than or equal to
            it are plotted. The default of 0 plots the whole ranking.

    Raises:
        ValueError: If ``stop_at`` is non-zero and no coefficient equals it.
    """
    sorted_clustering = sorted(get_clustering_list(clustering), reverse=True)

    if stop_at:
        # The list is sorted, so all values equal to stop_at are adjacent:
        # first occurrence + number of occurrences is the slice end that keeps
        # the whole tied run. This stays in bounds even when the run reaches
        # the end of the list.
        cut_index = sorted_clustering.index(stop_at) + sorted_clustering.count(stop_at)
        sorted_clustering = sorted_clustering[:cut_index]

    # plt.plot uses the list position as x, so the leading None placeholder
    # shifts the best coefficient to x = 1 (presumably so ranks are 1-based).
    adjusted_list = [None, *sorted_clustering]

    plt.figure(figsize=(12, 3))
    plt.plot(adjusted_list, fmt)
    plt.xlabel("Rank")
    plt.ylabel("Clustering")

### 4.3. Load the anonymized graphs from CSV and cache their Clustering data

In [None]:
# Folder holding the anonymized edge-list CSV files, relative to the notebook.
anon_data_folder = "../data/anonymized"

# Load each dataset as a graph and print its summary; a blank line separates
# consecutive summaries.
small_graph_1 = load_graph_from_csv(file=f"{anon_data_folder}/anon-dataset-small_1.csv")
print(format_graph_info("small(1)", small_graph_1), end="\n\n")

small_graph_2 = load_graph_from_csv(file=f"{anon_data_folder}/anon-dataset-small_2.csv")
print(format_graph_info("small(2)", small_graph_2), end="\n\n")

medium_graph = load_graph_from_csv(file=f"{anon_data_folder}/anon-dataset-medium.csv")
print(format_graph_info("medium", medium_graph), end="\n\n")

large_graph = load_graph_from_csv(file=f"{anon_data_folder}/anon-dataset-large.csv")
print(format_graph_info("large", large_graph))

In [None]:
# Compute the per-node clustering coefficients once per graph; the cells below
# reuse these cached results.
small_graph_1_cl = get_clustering(graph=small_graph_1)
small_graph_2_cl = get_clustering(graph=small_graph_2)
medium_graph_cl = get_clustering(graph=medium_graph)
large_graph_cl = get_clustering(graph=large_graph)

### 4.4. Draw the networks, plot Clustering data, etc

#### 4.4.1. Small network 1

In [None]:
# Draw small network 1, using its cached clustering coefficients for node size and colour.
draw_clustering_based_network(graph=small_graph_1, clustering=small_graph_1_cl)

In [None]:
# Keep the coefficients as a pandas Series (reused below for the quantile
# cut-off) and print its summary statistics.
sg_1_clustering_series = get_clustering_series(clustering=small_graph_1_cl)
print(sg_1_clustering_series.describe())

In [None]:
# Box plot of small network 1's clustering coefficients.
plot_clustering_descriptive_stats(clustering=small_graph_1_cl, color="blue")

In [None]:
# Full ranking of small network 1's clustering coefficients, highest first.
plot_clustering_ranking(clustering=small_graph_1_cl, fmt="bo-")

In [None]:
# Same ranking, cut off at the coefficient found at the 0.97 quantile.
# interpolation="nearest" makes quantile() return an observed value, which
# plot_clustering_ranking needs because it looks stop_at up by exact equality.
plot_clustering_ranking(
    clustering=small_graph_1_cl,
    fmt="bo-",
    stop_at=sg_1_clustering_series.quantile(q=0.97, interpolation="nearest"),
)

#### 4.4.2. Small network 2

In [None]:
# Draw small network 2, using its cached clustering coefficients for node size and colour.
draw_clustering_based_network(graph=small_graph_2, clustering=small_graph_2_cl)

In [None]:
# Keep the coefficients as a pandas Series (reused below for the quantile
# cut-off) and print its summary statistics.
sg_2_clustering_series = get_clustering_series(clustering=small_graph_2_cl)
print(sg_2_clustering_series.describe())

In [None]:
# Box plot of small network 2's clustering coefficients.
plot_clustering_descriptive_stats(clustering=small_graph_2_cl, color="red")

In [None]:
# Full ranking of small network 2's clustering coefficients, highest first.
plot_clustering_ranking(clustering=small_graph_2_cl, fmt="ro-")

In [None]:
# Same ranking, cut off at the coefficient found at the 0.997 quantile.
# interpolation="nearest" makes quantile() return an observed value, which
# plot_clustering_ranking needs because it looks stop_at up by exact equality.
plot_clustering_ranking(
    clustering=small_graph_2_cl,
    fmt="ro-",
    stop_at=sg_2_clustering_series.quantile(q=0.997, interpolation="nearest"),
)

#### 4.4.3. Medium network

In [None]:
# Draw the medium network, using its cached clustering coefficients for node size and colour.
draw_clustering_based_network(graph=medium_graph, clustering=medium_graph_cl)

In [None]:
# Keep the coefficients as a pandas Series (reused below for the quantile
# cut-off) and print its summary statistics.
mg_clustering_series = get_clustering_series(clustering=medium_graph_cl)
print(mg_clustering_series.describe())

In [None]:
# Box plot of the medium network's clustering coefficients.
plot_clustering_descriptive_stats(clustering=medium_graph_cl, color="green")

In [None]:
# Full ranking of the medium network's clustering coefficients, highest first.
plot_clustering_ranking(clustering=medium_graph_cl, fmt="go-")

In [None]:
# Same ranking, cut off at the coefficient found at the 0.992 quantile.
# interpolation="nearest" makes quantile() return an observed value, which
# plot_clustering_ranking needs because it looks stop_at up by exact equality.
plot_clustering_ranking(
    clustering=medium_graph_cl,
    fmt="go-",
    stop_at=mg_clustering_series.quantile(q=0.992, interpolation="nearest"),
)

#### 4.4.4. Large network

In [None]:
# Draw the large network, using its cached clustering coefficients for node size and colour.
draw_clustering_based_network(graph=large_graph, clustering=large_graph_cl)

In [None]:
# Keep the coefficients as a pandas Series (reused below for the quantile
# cut-off) and print its summary statistics.
lg_clustering_series = get_clustering_series(large_graph_cl)
# Print the series assigned just above; the name previously used here was
# never defined and raised NameError.
print(lg_clustering_series.describe())

In [None]:
# Box plot of the large network's clustering coefficients.
plot_clustering_descriptive_stats(clustering=large_graph_cl, color="magenta")

In [None]:
# Full ranking of the large network's clustering coefficients, highest first.
plot_clustering_ranking(clustering=large_graph_cl, fmt="mo-")

In [None]:
# Same ranking, cut off at the coefficient found at the 0.9989 quantile.
# interpolation="nearest" makes quantile() return an observed value, which
# plot_clustering_ranking needs because it looks stop_at up by exact equality.
plot_clustering_ranking(
    clustering=large_graph_cl,
    fmt="mo-",
    stop_at=lg_clustering_series.quantile(q=0.9989, interpolation="nearest"),
)