In [14]:
# ------------------ 1. ⚙️ Install Dependencies ------------------
%pip install --quiet --upgrade langchain langchain-community langchain-openai langchain-experimental neo4j pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [15]:
# ------------------ 2. 🔐 Environment Setup ------------------
import os
from dotenv import load_dotenv


# 🧹 Clear existing environment variables
os.environ.pop("NEO4J_URI", None)
os.environ.pop("NEO4J_USERNAME", None)
os.environ.pop("NEO4J_PASSWORD", None)

load_dotenv()

# 🗝️ Set environment variables
openai_api_key = os.environ["OPENAI_API_KEY"]
NEO4J_URI = os.environ["NEO4J_URI"]
NEO4J_USERNAME= os.environ["NEO4J_USERNAME"]
NEO4J_PASSWORD= os.environ["NEO4J_PASSWORD"]



print("Environment variables loaded successfully.")
print("OpenAI API Key:", openai_api_key)
print("Neo4j URI:", NEO4J_URI)
print("Neo4j Username:", NEO4J_USERNAME)
print("Neo4j Password:", NEO4J_PASSWORD)


csv_path = os.path.abspath(os.path.join(os.getcwd(), "..", "Data", "movie_data.csv"))

Environment variables loaded successfully.
OpenAI API Key: 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
Neo4j URI: neo4j+s://40545226.databases.neo4j.io
Neo4j Username: neo4j
Neo4j Password: gr5iTU6EppPC6VHDKtYoc5FWqhZxZU3MEmK1xAsVi-E


In [None]:
from neo4j import GraphDatabase
import pandas as pd
from tqdm import tqdm

class NetflixGraphDB:
    def __init__(self, NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD):
        self.driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
        print("✅ Connected to Neo4j")

    def close(self):
        self.driver.close()
        print("🔌 Connection closed")

    def clear_database(self):
        with self.driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")
            print("🧹 Cleared entire graph")

    def load_movies_from_csv(self, csv_path):
        df = pd.read_csv(csv_path)
        df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]
    
        print(f"📄 Loading {len(df)} movie rows from: {csv_path}")

        with self.driver.session() as session:
            for _, row in tqdm(df.iterrows(), total=len(df), desc="🚀 Ingesting movies"):
                row = row.where(pd.notnull(row), None)

                title = row.get("title")
                release_year = row.get("release_year")
                duration = row.get("duration")
                rating = row.get("rating")
                description = row.get("description")
                show_type = row.get("type")
                country = row.get("country")
                genres = row.get("listed_in")
                date_added = row.get("date_added")

                if not title:
                    print("⚠️ Skipping movie with missing title.")
                    continue

            # Movie node
                session.run(
                """
                MERGE (m:Show {title: $title, release_year: $release_year})
                SET m.duration = $duration,
                    m.rating = $rating,
                    m.description = $description,
                    m.date_added = $date_added,
                    m.type = $type
                """,
                {
                    "title": title,
                    "release_year": release_year,
                    "duration": duration,
                    "rating": rating,
                    "description": description,
                    "date_added": date_added,
                    "type": show_type
                }
            )

            # Country relationship
                if country:
                    session.run(
                    """
                    MERGE (c:Country {name: $country})
                    WITH c
                    MATCH (m:Show {title: $title, release_year: $release_year})
                    MERGE (m)-[:PRODUCED_IN]->(c)
                    """,
                    {"country": country, "title": title, "release_year": release_year}
                )

            # Duration as a separate node (optional)
                if duration:
                    session.run(
                    """
                    MERGE (d:Duration {label: $duration})
                    WITH d
                    MATCH (m:Show {title: $title, release_year: $release_year})
                    MERGE (m)-[:HAS_DURATION]->(d)
                    """,
                    {"duration": duration, "title": title, "release_year": release_year}
                )

            # Genres
                if genres:
                    for genre in genres.split(","):
                        genre = genre.strip()
                        if genre:
                         session.run(
                            """
                            MERGE (g:Genre {name: $genre})
                            WITH g
                            MATCH (m:Show {title: $title, release_year: $release_year})
                            MERGE (m)-[:IN_GENRE]->(g)
                            """,
                            {"genre": genre, "title": title, "release_year": release_year}
                        )

            # Directors
                for director in str(row.get("director", "")).split(","):
                    director = director.strip()
                    if director:
                        session.run(
                        """
                        MERGE (p:Person {name: $name})
                        WITH p
                        MATCH (m:Show {title: $title, release_year: $release_year})
                        MERGE (p)-[:DIRECTED]->(m)
                        """,
                        {"name": director, "title": title, "release_year": release_year}
                    )

            # Cast
                for actor in str(row.get("cast", "")).split(","):
                    actor = actor.strip()
                    if actor:
                        session.run(
                        """
                        MERGE (p:Person {name: $name})
                        WITH p
                        MATCH (m:Show {title: $title, release_year: $release_year})
                        MERGE (p)-[:ACTED_IN]->(m)
                        """,
                        {"name": actor, "title": title, "release_year": release_year}
                    )

    print("✅ All data ingested into Neo4j.")



✅ All data ingested into Neo4j.


In [17]:
# Create an instance of NetflixGraphDB
db = NetflixGraphDB(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)

db.clear_database()  # Optional: wipe old data
db.load_movies_from_csv(csv_path)

# Always close when done
db.close()

✅ Connected to Neo4j
🧹 Cleared entire graph
📄 Loading 390 movie rows from: /home/prashant-agrawal/Netflix_Project/Data/movie_data.csv


🚀 Ingesting movies: 100%|██████████| 390/390 [04:44<00:00,  1.37it/s]

🔌 Connection closed





In [87]:
# ------------------ 3. 🧠 Load LLM (OpenAI GPT-4o) ------------------
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="gpt-4o",
    openai_api_key=os.getenv("OPENAI_API_KEY")
)

In [86]:
from langchain_community.graphs import Neo4jGraph  # 👈 compatible with GraphCypherQAChain
from langchain.chains.graph_qa.cypher import GraphCypherQAChain
graph = Neo4jGraph(
    url=os.getenv("NEO4J_URI"),
    username=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD")
)

#chain = GraphCypherQAChain.from_llm(
#    llm=llm,
#    graph=graph,
#    verbose=True,
#    allow_dangerous_requests=True
#)


In [93]:

# ✅ Prompt Template for GraphCypherQAChain (Structured Output)
from langchain.chains.graph_qa.cypher import GraphCypherQAChain
from langchain.prompts.prompt import PromptTemplate

# Create a structured JSON output with committed answers
structured_prompt = PromptTemplate(
    input_variables=["schema", "question"],
    template="""
You are an expert Neo4j Cypher assistant and data interpreter.

You are given the graph schema below:

{schema}

And the user query:
{question}

Your tasks are:
1. Write the best possible Cypher query to retrieve relevant data.
2. Extract the correct answer from the results.
3. Present the final answer in this exact JSON format:

{{
  "answer": "<natural language summary of the result>",
  "results": [
    {{
      "title": "<show or movie title>",
      "release_year": <year>,
      "rating": "<rating>",
      "country": "<producing country>",
      "genres": ["<genre1>", "<genre2>"]
    }},
    ...
  ]
}}

Only use the data available in the database. If no data is found, respond with:
{{
  "answer": "No data found for this query.",
  "results": []
}}
"""
)

# ⚙️ Usage Example
chain = GraphCypherQAChain.from_llm(
    graph=graph,
    llm=llm,
    cypher_prompt=structured_prompt,
    verbose=True,
    allow_dangerous_requests=True
)



In [80]:
def pretty_print_graph_response(context):
    if not context:
        print("❌ No results found in the graph.")
        return

    print("✅ Raw Graph Results:\n")
    for i, entry in enumerate(context, start=1):
        print(f"🔹 Result {i}:")
        for key, val in entry.items():
            if isinstance(val, dict):
                for sub_key, sub_val in val.items():
                    print(f"   {sub_key}: {sub_val}")
            else:
                print(f"   {key}: {val}")
        print("-" * 50)

In [101]:
import langchain
langchain.verbose = False

response = chain.invoke({
    "query": "Find the 10 recently released movie produced in the United States."
})
#print(response)

#response = chain.invoke({"query": "Find the most recently released movie produced in the United States."})
#raw_data = response.get("context", [])
#print(response)




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (s:Show)-[:PRODUCED_IN]->(c:Country {name: "United States"}),
      (s)-[:IN_GENRE]->(g:Genre)
WHERE s.type = "Movie"
RETURN s.title AS title, s.release_year AS release_year, s.rating AS rating, c.name AS country, collect(g.name) AS genres
ORDER BY s.release_year DESC
LIMIT 10
[0m
Full Context:
[32;1m[1;3m[{'title': 'After We Collided', 'release_year': 2020, 'rating': 'R', 'country': 'United States', 'genres': ['Dramas', 'Romantic Movies']}, {'title': 'Airplane Mode', 'release_year': 2020, 'rating': 'TV-PG', 'country': 'United States', 'genres': ['International Movies', 'Comedies', 'Romantic Movies']}, {'title': 'A Love Song for Latasha', 'release_year': 2020, 'rating': 'TV-PG', 'country': 'United States', 'genres': ['Documentaries']}, {'title': 'A Secret Love', 'release_year': 2020, 'rating': 'TV-14', 'country': 'United States', 'genres': ['Documentaries', 'LGBTQ Movies']}, {'title': 'A

In [None]:
##Tools

import pandas as pd

flattened_data = []
for r in response[""]:
    for entry in r["context"]:
        flat_entry = {**entry, "query": r["query"]}
        flattened_data.append(flat_entry)

df = pd.DataFrame(flattened_data)
df.to_csv("query_results.csv", index=False)