<a href="https://colab.research.google.com/github/sadnyd/EdgarGraphLLM/blob/main/notebooks/embedding_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install dotenv langchain_community neo4j langchain load_dotenv python_dotenv



In [None]:
from dotenv import load_dotenv
import os

from langchain_community.graphs import Neo4jGraph

# Warning control: silence all Python warnings from this point on
# (this also hides deprecation notices emitted by the imported libraries).
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Connection settings and API keys are read from Colab's per-user secret store.
from google.colab import userdata

# Neo4j connection settings (secret names are looked up exactly as written).
NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD, NEO4J_DATABASE = map(
    userdata.get,
    ('NEO4J_URI', 'NEO4J_USERNAME', 'NEO4J_PASSWORD', 'neo4j_database'),
)

# Gemini credentials and REST endpoint.
# NOTE(review): GEMINI_ENDPOINT is not referenced by any other cell visible in
# this notebook; the embedding helper goes through the google.generativeai SDK.
GEMINI_API_KEY = userdata.get('gemini')
GEMINI_ENDPOINT = 'https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:embedText'



In [None]:
# Open a LangChain Neo4jGraph handle to the knowledge graph; every Cypher
# query in the cells below is issued through `kg`.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

In [None]:
# Optional reset: uncomment to drop the index before recreating it.
#kg.query("""
#  DROP INDEX movie_tagline_embeddings IF EXISTS;""")

# Create (only if it does not exist yet) a vector index over the
# `taglineEmbedding` property of Movie nodes: 768-dimensional vectors
# compared with cosine similarity.
kg.query("""CREATE VECTOR INDEX movie_tagline_embeddings IF NOT EXISTS
  FOR (m:Movie) ON (m.taglineEmbedding)
  OPTIONS { indexConfig: {
    `vector.dimensions`: 768,
    `vector.similarity_function`: 'cosine'
  }}
""")

[]

In [None]:
# List the vector indexes defined in the database so their state can be inspected.
kg.query("""
  SHOW VECTOR INDEXES
  """
)

[{'id': 3,
  'name': 'movie_tagline_embeddings',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Movie'],
  'properties': ['taglineEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2025, 4, 10, 17, 52, 18, 442000000, tzinfo=<UTC>),
  'readCount': 2}]

In [None]:
from google.generativeai import embed_content
import google.generativeai as genai

# Register the Gemini API key with the SDK before any embedding call is made.
genai.configure(api_key=GEMINI_API_KEY)

def get_gemini_embedding(text, task_type="retrieval_document"):
    """Return the Gemini ``text-embedding-004`` embedding for ``text``.

    Args:
        text: Content to embed; forwarded unchanged to ``embed_content``.
        task_type: Task hint forwarded to ``embed_content``. Defaults to
            ``"retrieval_document"`` (the value that used to be hard-coded),
            which suits text being stored for later search. Pass
            ``"retrieval_query"`` when embedding a search query.

    Returns:
        The value stored under the ``"embedding"`` key of the API response.

    Raises:
        ValueError: If ``text`` is ``None`` or an empty / whitespace-only string.
    """
    # Fail fast with a clear message instead of spending a network round-trip
    # on input that has nothing to embed.
    if text is None or (isinstance(text, str) and not text.strip()):
        raise ValueError("text must be a non-empty string")

    response = embed_content(
        model="models/text-embedding-004",
        content=text,
        task_type=task_type,
    )
    return response["embedding"]


In [None]:
def update_tagline_embeddings():
    """Embed every movie tagline and store the vector on its own node.

    Reads each ``Movie`` node whose ``tagline`` is not null, requests an
    embedding for the tagline via ``get_gemini_embedding`` and writes it to the
    node's ``taglineEmbedding`` property.

    Nodes are addressed by ``elementId`` rather than by title, so movies that
    share a title each receive the embedding of their own tagline, and movies
    without a title are still updated.

    A failure for one movie is reported and skipped so that a single bad record
    does not abort the run; a summary line is printed at the end.

    Returns:
        None.
    """
    records = kg.query("""
        MATCH (m:Movie)
        WHERE m.tagline IS NOT NULL
        RETURN elementId(m) AS node_id, m.title AS title, m.tagline AS tagline
    """)

    updated = 0
    failed = 0
    for record in records:
        tagline = record['tagline']
        # The query excludes nulls; this additionally skips empty taglines.
        if not tagline:
            continue

        title = record['title']
        try:
            embedding = get_gemini_embedding(tagline)
            kg.query("""
                MATCH (m:Movie)
                WHERE elementId(m) = $node_id
                SET m.taglineEmbedding = $embedding
            """, params={"node_id": record['node_id'], "embedding": embedding})
        except Exception as e:
            # Best effort: report the failure and carry on with the next movie.
            failed += 1
            print(f"Failed for '{title}': {e}")
        else:
            updated += 1

    print(f"Tagline embeddings updated: {updated}, failed: {failed}")


In [None]:
# Populate `taglineEmbedding` on the Movie nodes (one embedding request per tagline).
update_tagline_embeddings()

In [None]:
# Spot-check a few movies. Return the embedding's length rather than the whole
# node so the cell output is not flooded with 768-element vectors.
kg.query("""
    MATCH (m:Movie)
    RETURN m.title AS title,
           m.tagline AS tagline,
           size(m.taglineEmbedding) AS embeddingSize
    LIMIT 5
""")

[{'m': {'taglineEmbedding': [-0.0011065466,
    0.016216671,
    -0.03631471,
    -0.06212407,
    0.002635042,
    0.00600059,
    0.024535041,
    0.06713677,
    -0.060820192,
    0.009529159,
    0.019374112,
    -0.014781364,
    0.10167999,
    -0.008362532,
    -0.030960783,
    -0.06826005,
    0.010039638,
    0.03539896,
    -0.082762994,
    0.015225434,
    0.03536297,
    -0.0044792965,
    0.0025851612,
    -0.010418199,
    -0.020928966,
    0.013303761,
    0.028166343,
    -0.023311133,
    -0.0507194,
    0.0060528335,
    0.013912792,
    0.037257787,
    0.013612222,
    -0.012165591,
    0.060206458,
    -0.0018885855,
    -0.011883812,
    0.008588917,
    0.021107217,
    -0.01563533,
    -0.09332747,
    0.015777975,
    -0.07124745,
    0.0018595429,
    -0.04510498,
    -0.03382996,
    -0.020676386,
    -0.0018413736,
    -0.035965532,
    0.05553882,
    0.011375148,
    0.035368178,
    -0.031173998,
    0.05538489,
    -0.016967893,
    -0.020549731,
    -

In [None]:
# Fetch one movie that has both a tagline and a stored embedding. Requiring the
# embedding keeps the inspection cells below from subscripting None when the
# sampled movie's embedding request failed and was skipped.
result = kg.query("""
    MATCH (m:Movie)
    WHERE m.tagline IS NOT NULL
      AND m.taglineEmbedding IS NOT NULL
    RETURN m.tagline, m.taglineEmbedding
    LIMIT 1
    """
)

In [None]:
# Tagline text of the sampled movie.
result[0]['m.tagline']

'Your mind is the scene of the crime'

In [None]:
# Show only the first 10 components instead of printing the whole vector.
result[0]['m.taglineEmbedding'][:10]

[-0.0011065466,
 0.016216671,
 -0.03631471,
 -0.06212407,
 0.002635042,
 0.00600059,
 0.024535041,
 0.06713677,
 -0.060820192,
 0.009529159]

In [None]:
# Vector length; should equal the `vector.dimensions` (768) declared on the index.
len(result[0]['m.taglineEmbedding'])

768

In [None]:
# Natural-language question that is embedded and matched against the tagline vectors below.
question = "What movies are about drama?"

In [None]:
# Embed the question with the same helper used for the taglines.
# NOTE(review): the helper embeds with task_type="retrieval_document" by default;
# confirm whether a query-specific task type would be preferable here.
question_embedding = get_gemini_embedding(question)

# Ask the `movie_tagline_embeddings` vector index for the 5 nearest taglines
# and show each hit's title, tagline and similarity score.
kg.query(
    """
    CALL db.index.vector.queryNodes(
        'movie_tagline_embeddings',
        $top_k,
        $question_embedding
    ) YIELD node AS movie, score
    RETURN movie.title, movie.tagline, score
""",
    params=dict(top_k=5, question_embedding=question_embedding),
)


[{'movie.title': 'The Wolf of Wall Street',
  'movie.tagline': 'The story of a man who had everything. And lost it all.',
  'score': 0.8684883117675781},
 {'movie.title': 'The Dark Knight',
  'movie.tagline': 'Why so serious?',
  'score': 0.8666071891784668},
 {'movie.title': 'Catch Me If You Can',
  'movie.tagline': 'The true story of a real fake.',
  'score': 0.8647251129150391},
 {'movie.title': 'The Notebook',
  'movie.tagline': 'Behind every great love is a great story.',
  'score': 0.859947681427002},
 {'movie.title': 'Harry Potter and the Prisoner of Azkaban',
  'movie.tagline': 'Something wicked this way comes.',
  'score': 0.8515377044677734}]