In [None]:
# HARE KRISHNA
# Final Scripture for Enriching the Existing Graph with Embeddings

# -----------------------------------------------------------------------------
# STEP 1: ACQUIRE THE TOOLS
# -----------------------------------------------------------------------------
# Install the third-party packages used by this cell before importing them.
# `%pip` is an IPython magic, so this line only works when the cell is run
# inside a notebook kernel (it is not valid in a plain .py script).
# `-q` asks pip for quieter output.
print("Acquiring the necessary tools...")
%pip install -q neo4j pandas sentence-transformers tqdm

import pandas as pd
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# -----------------------------------------------------------------------------
# STEP 2: PRUDENT CONFIGURATION
# -----------------------------------------------------------------------------
# Credentials for your EXISTING AuraDB instance.
# Each value is read from the environment first (NEO4J_URI, NEO4J_USERNAME,
# NEO4J_PASSWORD) so real secrets never have to be saved in the notebook.
# The literals below are only fallbacks used when a variable is not set.
import os

NEO4J_URI = os.environ.get("NEO4J_URI", "HIDDEN")
NEO4J_AUTH = (
    os.environ.get("NEO4J_USERNAME", "HIDDEN"),
    os.environ.get("NEO4J_PASSWORD", "HIDDEN"),
)

# -----------------------------------------------------------------------------
# STEP 3: PREPARE THE JNANA (HARVEST & EMBED)
# -----------------------------------------------------------------------------
print("Connecting to the existing English graph...")
driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)

# Select every node that has non-empty text AND a non-null original_id.
# Nodes without an original_id are skipped on purpose: the enrichment step
# looks nodes up with `MATCH (n {original_id: row.id})`, and a null value can
# never satisfy that equality, so embedding their text would be wasted work.
# NOTE(review): this assumes original_id identifies exactly one node; if it is
# not unique, several harvested rows will target the same nodes -- confirm.
harvest_query = """
MATCH (n)
WHERE n.text IS NOT NULL AND n.text <> ''
  AND n.original_id IS NOT NULL
RETURN n.original_id AS id, n.text AS text
"""

print("Harvesting all node texts directly from the database...")
try:
    with driver.session(database="neo4j") as session:
        # One dict per returned row; the keys mirror the query's column aliases.
        node_data = [
            {"id": record["id"], "text": record["text"]}
            for record in session.run(harvest_query)
        ]
except Exception:
    # Do not leave an open driver behind when the harvest fails.
    driver.close()
    raise
print(f"Successfully harvested {len(node_data)} node texts.")

print("\nLoading the sentence-transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')

print("Creating vector embeddings for all texts... (This is the main computation)")
# Encode every harvested text in a single call; row k of the result
# corresponds to node_data[k] because the texts are passed in list order.
node_texts = [entry['text'] for entry in node_data]
node_embeddings = model.encode(node_texts, show_progress_bar=True)

# Store each vector on its source record as a plain Python list.
for entry, vector in zip(node_data, node_embeddings):
    entry['embedding'] = vector.tolist()
print("Embedding creation complete.")

# -----------------------------------------------------------------------------
# STEP 4: THE YAJNA OF ENRICHMENT (UPDATING THE GRAPH)
# -----------------------------------------------------------------------------
# For every row, find the node(s) whose original_id equals row.id and store
# the vector in their `embedding` property.
# NOTE(review): the MATCH pattern has no label, so the lookup presumably
# cannot use a label-scoped property index and has to scan nodes for each row.
# If all target nodes share a label, adding it here (together with an index on
# original_id) should make each batch much faster -- confirm against the schema.
update_query = """
UNWIND $rows AS row
MATCH (n {original_id: row.id})
SET n.embedding = row.embedding
"""

batch_size = 400
print("\nBeginning the enrichment. This will take time...")
embeddings_written = 0
try:
    with driver.session(database="neo4j") as session:
        for i in tqdm(range(0, len(node_data), batch_size), desc="Enriching Graph with Embeddings"):
            batch = node_data[i:i + batch_size]
            # Send only the fields the query reads; the node text already
            # lives in the graph and would just inflate every request.
            rows = [{"id": item["id"], "embedding": item["embedding"]} for item in batch]
            # consume() waits for the statement to finish, so a failure is
            # raised for the batch that caused it instead of being deferred.
            summary = session.run(update_query, rows=rows).consume()
            embeddings_written += summary.counters.properties_set
finally:
    # Release the connection pool even when a batch fails.
    driver.close()

print(f"Embedding properties written: {embeddings_written} (from {len(node_data)} harvested texts).")
if embeddings_written < len(node_data):
    # Fewer writes than rows means some ids matched no node in the graph.
    print("WARNING: some harvested rows did not match any node and were not updated.")
print("\n--- ENRICHMENT COMPLETE ---")
print("The knowledge graph has been successfully enriched with embeddings.")

Acquiring the necessary tools...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.2/313.2 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hConnecting to the existing English graph...
Harvesting all node texts directly from the database...
Successfully harvested 44637 node texts.

Loading the sentence-transformer model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating vector embeddings for all texts... (This is the main computation)


Batches:   0%|          | 0/1395 [00:00<?, ?it/s]

Embedding creation complete.

Beginning the enrichment. This will take time...


Enriching Graph with Embeddings: 100%|██████████| 112/112 [29:45<00:00, 15.94s/it]


--- ENRICHMENT COMPLETE ---
The knowledge graph has been successfully enriched with embeddings.



