# Load data
First we load the data. I've pulled a [generic quotations dataset](https://huggingface.co/datasets/m-ric/english_historical_quotes/) from Hugging Face; it's downloaded to the repo for convenience.

In [None]:
from semnet import SemanticNetwork
import pandas as pd
import datasets
import sentence_transformers
from cosmograph import cosmo

# Pull the train split of the quotes dataset and hold it as a DataFrame.
dataset = datasets.load_dataset("m-ric/english_historical_quotes", split="train")
df = pd.DataFrame(dataset)

# The quote text itself is what gets embedded.
labels = df["quote"].tolist()

# Encode every quote with the sentence-transformers model.
embedding_model = sentence_transformers.SentenceTransformer("BAAI/bge-base-en-v1.5")
embeddings = embedding_model.encode(labels, show_progress_bar=True)

In [33]:
# Custom per-node data, keyed by row position: a fixed type tag plus the
# author found at the same position in the DataFrame.
node_data = {
    n: {"type": "quote", "author": df["author"].iloc[n]}
    for n in range(len(labels))
}

# Fit the network on the precomputed embeddings and get the graph back.
sem = SemanticNetwork()
G = sem.fit_transform(
    embeddings=embeddings,
    labels=labels,
    thresh=0.3,
    top_k=20,
    node_data=node_data,
)

In [43]:
# Reduce the graph to its largest component, then annotate every node with
# its degree centrality and Louvain community.
import networkx as nx

# Keep only the largest connected component; every smaller component
# (isolated nodes included) is dropped.
subgraphs = list(nx.connected_components(G))
largest_subgraph = max(subgraphs, key=len)
G = G.subgraph(largest_subgraph).copy()

# Calculate degree centrality for all remaining nodes
centrality = nx.degree_centrality(G)
nx.set_node_attributes(G, centrality, "degree_centrality")

# Get louvain communities. The seed is fixed so that re-running the cell
# assigns the same community ids every time.
communities = nx.community.louvain_communities(G, seed=123)
community_dict = {
    node: i for i, community in enumerate(communities) for node in community
}
nx.set_node_attributes(G, community_dict, "community")

In [44]:
# Export the graph as a node table and an edge table (presumably pandas
# DataFrames, going by the method name — confirm against semnet).
nodes, edges = sem.to_pandas(G)

In [None]:
# Show the built-in help for the `cosmo` object imported from cosmograph.
help(cosmo)

In [None]:
from cosmograph import Cosmograph

# The widget gets its own name instead of `cosmo`, so the `cosmo` function
# imported from cosmograph is not overwritten by a widget instance.
cosmograph_widget = Cosmograph(
    points=nodes,
    links=edges,
    point_id_by="node_id",
    # Size by centrality; colour and cluster by community.
    point_size_by="degree_centrality",
    link_source_by="source",
    link_target_by="target",
    point_color_by="community",
    point_cluster_by="community",
    point_label_by="label",
    show_cluster_labels=True,
    point_include_columns=["author", "degree_centrality", "community"],
)
# Last expression in the cell, so the notebook renders the widget.
cosmograph_widget

Cosmograph(background_color=None, components_display_state_mode=None, focused_point_ring_color=None, hovered_p…

# Building the network
Semnet makes constructing an embedding-based network simple. Just bring your own embeddings and pass them to the `.fit()` method.

In [None]:
from semnet import SemanticNetwork
from sentence_transformers import SentenceTransformer

# The quote strings are the documents that get embedded.
docs = df["quote"].tolist()

# Load the sentence-transformers model and encode every document.
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
embeddings = embedding_model.encode(docs, show_progress_bar=True)

## API
We pass the docs as the labels, and also pass the author as additional data to each node in the network. Passing a single item using `.to_dict()` will record the key as `value`.

In [None]:
# Larger values for thresh will generate sparser networks with fewer edges
# and more outliers.
# NOTE(review): this comment used to recommend values "between 1.5 and 5"
# with 3 giving a strong core plus many outliers, which does not match the
# thresh=0.3 passed below — possibly 0.15-0.5 and 0.3 were meant; confirm.
sem = SemanticNetwork(thresh=0.3, n_trees=100)

# Row index -> author, passed along as the extra data for each node.
author_by_row = df["author"].to_dict()
sem.fit(
    embeddings=embeddings,
    labels=docs,
    node_data=author_by_row,
)

## Under the hood
Semnet uses [annoy](https://github.com/spotify/annoy) to perform rapid pair-wise distance calculations across all embeddings in the dataset.

The result of this process is an edgelist, which can be used to construct an undirected graph, weighted by the semantic similarity between each record.

# Network analysis of text
With our data loaded into a `networkx` object, we now have access to hundreds of graph-based algorithms that can be used to explore, analyse and clean our data.

Use cases include:
- Outlier detection
- Enriching network with non-semantic data
- Clustering
- Visualisation
- Semantic pathways
- Deduplication and more!

## Outlier detection
Any node that has no edges has no semantic relationship with any other item in the dataset at the threshold set during training. These records may be considered outliers.

We can use networkx to find all connected components in the graph. The demo threshold is pretty high so we'll see a fair few outliers.

In [None]:
import networkx as nx
import random

# Work on a copy so the fitted network's own graph is left untouched.
G = sem.graph_.copy()

# nx.connected_components yields one set of nodes per component. A component
# holding a single node is treated as an outlier, and that node is recorded.
connected_components = list(nx.connected_components(G))
unconnected_components = [
    next(iter(c)) for c in connected_components if len(c) == 1
]

# Count only the components with two or more nodes, so the printed figure
# matches its label instead of also counting every singleton.
n_groups = len(connected_components) - len(unconnected_components)

print(
    f"Number of connected components (groups of 2 or more nodes): {n_groups}"
)
print(
    f"Number of unconnected components (outliers): {len(unconnected_components)}"
)

Outliers, in this context, represent topics or phrasing that are somewhat unique within the dataset. Exploring the outliers, we can see references to Mexico, Pearl Harbor, puppies and such.

In [None]:
def sample_nodes(subgraph, n=5, seed=12345):
    """Print a reproducible random sample of nodes from ``subgraph``.

    Args:
        subgraph: Graph-like object whose ``nodes(data=True)`` yields
            ``(node, attributes)`` pairs; every attribute dict must carry
            the keys ``"name"`` and ``"value"``.
        n: Maximum number of nodes to print; capped at the number of nodes
            available.
        seed: Seed applied to the module-level ``random`` generator before
            sampling, so repeated calls print the same nodes.
    """
    random.seed(seed)
    candidates = list(subgraph.nodes(data=True))
    picked = random.sample(candidates, min(n, len(candidates)))
    for _, attrs in picked:
        print(f"{attrs['name']}, {attrs['value']}")


# Two views onto G: its biggest connected component, and the outlier nodes.
largest_cc = max(nx.connected_components(G), key=len)
large_subgraph = G.subgraph(largest_cc)
outlier_subgraph = G.subgraph(unconnected_components)

# Print up to ten sampled nodes from each view, separated by a blank line.
print("Largest")
sample_nodes(large_subgraph, n=10)
print()
print("Outliers")
sample_nodes(outlier_subgraph, n=10)

How we treat outliers will depend on our use case. As a demonstration, I'm keen on looking at the core of the dataset, getting themes, vibes and relationships rather than trying to classify every node. I drop the outliers and focus on the centre.

In [None]:
print(len(G.nodes()), len(unconnected_components))

# Test membership against a set: checking each node against the outlier list
# would make this filter quadratic in the number of nodes.
outlier_nodes = set(unconnected_components)
non_outliers = [n for n in G.nodes() if n not in outlier_nodes]
G = G.subgraph(non_outliers)
print(f"Graph after removing outliers has {len(G.nodes())} nodes")

# Clustering

Whilst excellent methods and libraries (e.g., BERTopic) exist for topic modelling on embeddings, the graph structure allows us to use a _relationship_-based approach.

In [None]:
# Detect communities, then walk them from largest to smallest.
communities = nx.community.louvain_communities(G, seed=123, resolution=1.5)
ranked = sorted(communities, key=len, reverse=True)
for rank, members in enumerate(ranked, start=1):
    print(f"Community {rank}, size: {len(members)}")
    sample_nodes(G.subgraph(members), n=5)
    print()

    # Communities of five or fewer nodes all share the catch-all label -1;
    # larger ones are labelled with their 1-based size rank.
    label = rank if len(members) > 5 else -1
    for node in members:
        G.nodes[node]["community"] = label

In [None]:
from cosmograph import cosmo

# Export the graph with to_pandas; the two results feed the widget's points
# and links.
nodes, edges = sem.to_pandas(G)

# Widget options, grouped by what they configure.
point_options = {
    "point_id_by": "id",  # Index column
    "point_color_by": "community",  # Color by community
    "point_cluster_by": "community",
    "point_include_columns": ["value"],  # Include author info
    "point_label_by": "name",
}
link_options = {
    "link_source_by": "source",
    "link_target_by": "target",
    "link_strength_by": "similarity",
}
interaction_options = {
    "show_hovered_point_label": True,
    "select_point_on_click": True,
}

widget = cosmo(
    points=nodes,
    links=edges,
    **point_options,
    **link_options,
    **interaction_options,
)
widget

In [None]:
# Longest simple path (search depth capped at 20) between random node pairs
import random

# The node list is the same on every iteration, so build it once.
candidate_nodes = list(large_subgraph.nodes(data=True))

for n in range(10):
    print("\n" + "-" * 50 + "\n")
    # Seeding with the loop index makes each pair reproducible.
    random.seed(n)
    node_a, node_b = random.sample(candidate_nodes, 2)

    all_paths = nx.all_simple_paths(
        large_subgraph,
        source=node_a[0],
        target=node_b[0],
        cutoff=20,
    )
    # max() consumes the path generator one item at a time, so the full set
    # of simple paths never has to be held in memory (sorting it would).
    # It returns the first longest path, or None when no path exists.
    long_path = max(all_paths, key=len, default=None)

    if long_path is not None:
        print(
            f"Long path between:\n- {node_a[1]['name']}\n- {node_b[1]['name']}\n"
        )
        for idx in long_path:
            print(f"- {large_subgraph.nodes[idx]['name']}")

In [None]:
# Clustering

In [None]:
# Summarise each community by its five highest-centrality documents.
# NOTE(review): `points` is not assigned anywhere in the visible notebook
# (the exported node table is called `nodes`) — confirm where it comes from
# and that it has "community", "degree_centrality" and "name" columns.
communities = [
    {
        "community_id": idx,
        "representative_docs": community.nlargest(5, "degree_centrality")[
            "name"
        ].values,
        "size": len(community),
    }
    for idx, community in points.groupby("community")
]

# Report the communities from largest to smallest.
for summary in sorted(communities, key=lambda c: c["size"], reverse=True):
    print(f"Community {summary['community_id']} (size={summary['size']}):")
    for doc in summary["representative_docs"]:
        print(f" - {doc}")
    print()

In [None]:
# Bare expression as the cell's last line, so the notebook displays `widget`.
widget

In [None]:
# Shortest path between two nodes
import random


def find_shortest_path(graph, source_idx, target_idx):
    """Return a shortest path between two nodes, or ``None`` if none exists.

    Args:
        graph: Graph handed to ``nx.shortest_path``.
        source_idx: Node the path starts from.
        target_idx: Node the path ends at.

    Returns:
        Whatever ``nx.shortest_path`` returns for the pair, or ``None`` when
        it raises ``nx.NetworkXNoPath``.
    """
    try:
        return nx.shortest_path(graph, source=source_idx, target=target_idx)
    except nx.NetworkXNoPath:
        return None


largest_component = max(connected_components, key=len)
# NOTE(review): `reduced_graph` is not assigned anywhere in the visible
# notebook — confirm which graph this cell is meant to run against.
largest_subgraph = reduced_graph.subgraph(largest_component)

# Pick two distinct nodes at random and look for a shortest path between them.
node_a, node_b = random.sample(list(largest_subgraph.nodes(data=True)), 2)
path = find_shortest_path(
    largest_subgraph, source_idx=node_a[0], target_idx=node_b[0]
)
# find_shortest_path returns None when the nodes are not connected, so guard
# the loop rather than iterating over None.
if path is None:
    print("No path found between the sampled nodes")
else:
    for idx in path:
        print(f"- {largest_subgraph.nodes[idx]['name']}")