# Load data
First we load the data. I've pulled a [generic quotations dataset](https://huggingface.co/datasets/m-ric/english_historical_quotes/) from Hugging Face; a copy is stored in the repo for convenience.

In [2]:
import json
import pandas as pd

# https://huggingface.co/datasets/m-ric/english_historical_quotes/blob/main/english_historical_quotes.json

# Load the quotes from the JSON file stored in the repo.
# The encoding is set explicitly: without it, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII characters
# in the quotes on systems whose default is not UTF-8.
with open("data/english_historical_quotes.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Drop the category column as it's not required for the demo
df = pd.DataFrame(data).drop(columns=["category"])

df.head()

Unnamed: 0,quote,author
0,Almost anyone can be an author the business is...,A. A. Milne
1,"If you live to be a hundred, I want to live to...",A. A. Milne
2,Golf is so popular simply because it is the be...,A. A. Milne
3,"To the uneducated, an A is just three sticks.",A. A. Milne
4,Promise me you'll always remember: You're brav...,A. A. Milne


# Building network
Building an embedding-based network is simple: bring your own embeddings and pass them to the `fit` method.

Docs are a required argument; here the quotes are passed to `fit` through the `labels` parameter.

In [None]:
from semnet import SemanticNetwork
from sentence_transformers import SentenceTransformer

# Quote texts, in DataFrame row order; used both as the embedding input
# and as the labels handed to the network.
docs = df["quote"].tolist()

# Encode every quote with a sentence-transformers model.
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
embeddings = embedding_model.encode(
    docs,
    show_progress_bar=True,
)

# Fit the semantic network on the precomputed embeddings.
sem = SemanticNetwork(thresh=0.25)
sem.fit(
    embeddings=embeddings,
    labels=docs,
)

In [None]:
import networkx as nx

# Graph held by the fitted SemanticNetwork instance `sem`.
graph = sem.graph_

# Attach each quote's author to its node.
# Node ids are used as positional row indices into `df`; this assumes the
# network numbers its nodes in the same order as `docs` - TODO confirm.
for node in graph.nodes:
    graph.nodes[node]["author"] = df.iloc[node]["author"]

In [None]:
from cosmograph import cosmo
import pandas as pd

# One row per node: the node's attributes plus its id under "idx".
# Each attribute dict is copied (`{**attrs, ...}`) because
# graph.nodes(data=True) yields the graph's own dicts; updating them in
# place would write an "idx" attribute back onto every node of the graph.
points = pd.DataFrame(
    [{**attrs, "idx": idx} for idx, attrs in graph.nodes(data=True)]
)

# One row per edge: both endpoints plus whatever attributes the edge carries.
links = pd.DataFrame(
    [
        {"source": source, "target": target, **edge_data}
        for source, target, edge_data in graph.edges(data=True)
    ]
)

In [None]:
# Build the cosmograph widget from the node and edge tables.
widget = cosmo(
    # Data
    points=points,
    links=links,
    # Point columns ("name" is presumably set by the network from the labels - verify)
    point_id_by="idx",
    point_label_by="name",
    point_include_columns=["author"],
    # Link columns ("similarity" is presumably an edge attribute set by the network - verify)
    link_source_by="source",
    link_target_by="target",
    link_strength_by="similarity",
    # Interaction
    select_point_on_click=True,
)
widget

In [None]:
# Summarise every community: its id, its size, and the names of its five
# nodes with the highest "degree_centrality".
communities = [
    {
        "community_id": community_id,
        "representative_docs": members.nlargest(5, "degree_centrality")[
            "name"
        ].values,
        "size": len(members),
    }
    for community_id, members in points.groupby("community")
]

# Report the communities from largest to smallest.
by_size_desc = sorted(communities, key=lambda entry: entry["size"], reverse=True)
for summary in by_size_desc:
    print(f"Community {summary['community_id']} (size={summary['size']}):")
    for doc in summary["representative_docs"]:
        print(f" - {doc}")
    print()

In [None]:
# Display the widget created earlier again, below the community summary.
widget

In [None]:
# Shortest path between two nodes
import random


def find_shortest_path(graph, source_idx, target_idx):
    """Return the shortest path between two nodes, or None if none exists.

    Args:
        graph: Graph to search, passed straight to ``nx.shortest_path``.
        source_idx: Id of the node the path starts from.
        target_idx: Id of the node the path ends at.

    Returns:
        Whatever ``nx.shortest_path`` returns for the two nodes, or ``None``
        when it raises ``nx.NetworkXNoPath``.
    """
    try:
        return nx.shortest_path(graph, source=source_idx, target=target_idx)
    except nx.NetworkXNoPath:
        # The two nodes are not connected.
        return None


# Restrict the search to the largest connected component of `graph`, so the
# two sampled nodes are joined by at least one path.
# NOTE(review): nx.connected_components needs an undirected graph - confirm
# that the semantic network is undirected.
largest_component = max(nx.connected_components(graph), key=len)
largest_subgraph = graph.subgraph(largest_component)

# Pick two distinct nodes at random; each sample is an (id, attributes) pair.
node_a, node_b = random.sample(list(largest_subgraph.nodes(data=True)), 2)
path = find_shortest_path(largest_subgraph, source_idx=node_a[0], target_idx=node_b[0])

# find_shortest_path returns None when there is no path; guard before iterating.
if path is None:
    print("No path found between the selected nodes.")
else:
    for idx in path:
        print(f"- {largest_subgraph.nodes[idx]['name']}")