# 🎬 MovieBot
Ask me anything about movie plots!
# RAG application to answer movie plots by their name.

## Features:
- Uses `LangGraph` to structure the app
- Uses wikipedia-movie-plotsdataset for RAG using chroma
- Simple chat interface

**Type 'exit' to quit the chat.**

Setup
First, install ChromaDB and the Gemini API Python SDK.

In [1]:

!pip uninstall -qqy jupyterlab kfp  # Remove unused conflicting packages
!pip install -qU "google-genai==1.7.0" "chromadb==0.6.3"


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.9/100.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m4

In [2]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
from google import genai
from google.genai import types

from IPython.display import Markdown

genai.__version__

'1.7.0'

Set up AI key stored in Kaggle secret names GOOGLE_API_KEY.To make the key available through Kaggle secrets, choose Secrets from the Add-ons menu and follow the instructions to add your key or enable it for this notebook.

In [4]:
from kaggle_secrets import UserSecretsClient

GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")

In [5]:
client = genai.Client(api_key=GOOGLE_API_KEY)

Read CSV using pandas and for each row store 'Title' and 'plot' in docs.
Generates list of documents(docs), each containing the title and plot of the movie.

In [6]:
import pandas as pd

df = pd.read_csv("/kaggle/input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv")
#df_filtered = df[(df['Release Year'] >= 2000) & (df['Release Year'] < 2026) & (df['Genre'].str.contains('action', case=False, na=False))]
docs = [
    f"Title: {row['Title']}\nPlot: {row['Plot']}"
    for _, row in df.iterrows()
]
#df_filtered.count()
df.count()
len(docs)

34886

Creating an embedding database using ChromaDB involves generating document embeddings with the Gemini API using the retrieval_document task type. These embeddings support a retrieval system where query embeddings (retrieval_query) are used later for matching.

In [7]:
from chromadb import Documents, EmbeddingFunction, Embeddings
from google.api_core import retry

from google.genai import types
is_retriable = lambda e: (isinstance(e, genai.errors.APIError) and e.code in {429, 503})
class GeminiEmbeddingFunction(EmbeddingFunction):
    # Specify whether to generate embeddings for documents, or queries
    document_mode = True

    @retry.Retry(predicate=is_retriable)
    def __call__(self, input: Documents) -> Embeddings:
        if self.document_mode:
            embedding_task = "retrieval_document"
        else:
            embedding_task = "retrieval_query"

        response = client.models.embed_content(
            model="models/text-embedding-004",
            contents=input,
            config=types.EmbedContentConfig(
                task_type=embedding_task,
            ),
        )
        return [e.values for e in response.embeddings]

### Creating and Populating a ChromaDB Collection with Gemini Embeddings

This code initializes a ChromaDB collection using a custom `GeminiEmbeddingFunction` for document embeddings.  
It processes documents in batches and adds them to the database with unique IDs.  
Errors during embedding or addition are handled with specific messages for debugging.  
The batching ensures memory efficiency and prevents API overload, completing with a confirmation message.  
(Takes 349 batches for this data)

In [8]:
import chromadb

DB_NAME = "googlecardb"

embed_fn = GeminiEmbeddingFunction()
embed_fn.document_mode = True

chroma_client = chromadb.Client()
db = chroma_client.get_or_create_collection(name=DB_NAME, embedding_function=embed_fn)

#db.add(documents=docs, ids=[str(i) for i in range(len(docs))])
def embed_batch(batch_docs):
    """Embeds a batch of documents using your embedding function."""
    return embed_fn(input=batch_docs)

batch_size = 100  # Adjust this value
num_docs = len(docs)

for i in range(0, num_docs, batch_size):
    batch_docs = docs[i : i + batch_size]
    batch_ids = [str(j) for j in range(i, i + len(batch_docs))]

    if batch_docs:
        try:
            db.add(documents=batch_docs, ids=batch_ids)
            print(f"Added batch {i // batch_size + 1} of size {len(batch_docs)}")
        except genai_errors.ClientError as e:
            print(f"Google AI ClientError in batch {i // batch_size + 1}:")
            print(f"  Status Code: {e.status_code}")
            print(f"  Error Message: {e.message}")
            # You might want to implement more specific handling here,
            # like reducing the batch size further or logging the error.
        except Exception as e:
            print(f"An unexpected error occurred in batch {i // batch_size + 1}: {e}")
            # Handle other potential exceptions
# ... rest of your code ...

print("Finished adding all documents in batches.")

Added batch 1 of size 100
Added batch 2 of size 100
Added batch 3 of size 100
Added batch 4 of size 100
Added batch 5 of size 100
Added batch 6 of size 100
Added batch 7 of size 100
Added batch 8 of size 100
Added batch 9 of size 100
Added batch 10 of size 100
Added batch 11 of size 100
Added batch 12 of size 100
Added batch 13 of size 100
Added batch 14 of size 100
Added batch 15 of size 100
Added batch 16 of size 100
Added batch 17 of size 100
Added batch 18 of size 100
Added batch 19 of size 100
Added batch 20 of size 100
Added batch 21 of size 100
Added batch 22 of size 100
Added batch 23 of size 100
Added batch 24 of size 100
Added batch 25 of size 100
Added batch 26 of size 100
Added batch 27 of size 100
Added batch 28 of size 100
Added batch 29 of size 100
Added batch 30 of size 100
Added batch 31 of size 100
Added batch 32 of size 100
Added batch 33 of size 100
Added batch 34 of size 100
Added batch 35 of size 100
Added batch 36 of size 100
Added batch 37 of size 100
Added batc

In [9]:
db.count()

34886

Retrieval: Find relevant documents
To search the Chroma database, call the query method. Switch to the retrieval_query mode of embedding generation.

In [10]:
# Switch to query mode when generating embeddings.
embed_fn.document_mode = False

# Search the Chroma DB using the specified query.
query = "which is the scariest movie?"

result = db.query(query_texts=[query], n_results=1)
[all_passages] = result["documents"]

Markdown(all_passages[0])

Title: Clownhouse
Plot: The story follows Casey, a normal boy whose life is constantly influenced by his intense fear of clowns. His two older brothers, Geoffrey and Randy, are mostly disobliging. One night, the three boys are left alone when their mother visits relatives, so they decide to visit a local circus for a night of amusement, despite Casey's uncontrollable coulrophobia. Meanwhile, the local state insane asylum has sent a majority of the hospital's inmates to the carnival for therapy, but three psychotic mental patients break away from the group and kill three clowns, taking their makeup and costumes.
While at the circus, Casey innocently visits a fortune teller despite Randy's better judgment. The fortune teller reveals to Casey that his life line has been cut short, and says to him: "Beware, beware, in the darkest of dark /though the flesh is young and the hearts are strong /precious life cannot be long /when darkest death has left its mark."
As the boys return from the circus, a shaken Casey thinks his nightmare is over, but it has only just begun. When the clowns target their home, Casey is forced to face his fears once and for all. Casey and his brothers are locked inside their isolated farmhouse and the power is turned off. Casey attempts to call the police, but because Casey says that the "clowns from the circus are trying to get him", the police officers assume that Casey's fear of clowns caused him to have a realistic nightmare. The officers tell Casey that everything will be fine if he goes back to sleep, and hangs up.
Randy mockingly dresses up as a clown, disbelieving of Casey's claims that clowns are inside the house. His plan to jump out at Geoffrey and Casey is cut short after he is stabbed by one of the clowns. Geoffrey manages to kill the first clown by hitting him with a wooden plank, knocking him down a flight of stairs and breaking his neck.
Later on, after tricking the clown, Casey and Geoffrey push another clown out a window to his death. Casey and Geoffrey find Randy unconscious in a closet and drag him into another room. Geoffrey is then attacked and presumably killed by the final clown, who chases Casey into the upstairs game room. Casey manages to hide for the time being, but after the clown leaves, Casey accidentally steps on a noise-making toy, alerting the clown of his presence. The enraged clown attempts to break Casey's neck, but he is then killed by Geoffrey (who survived the clown's attack), slamming a hatchet into the killer's back, and the two exhausted and traumatized brothers hug each other as the police finally arrive to help them.
The film ends with this narration:

### Constructing a Prompt for Gemini Query Response

This code prepares a custom prompt for Gemini by first formatting the user query into a single line.  
It defines the assistant's tone and instructions for how to answer, encouraging complete, friendly, and relevant responses.Each retrieved document passage is cleaned and appended to the prompt as context.  
The final prompt is printed, ready to be used for generating a contextual response from the model.  


In [11]:
query_oneline = query.replace("\n", " ")

# This prompt is where you can specify any guidance on tone, or what topics the model should stick to, or avoid.
prompt = f"""You are a helpful and informative bot that answers questions using text from the reference passage included below. 
Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. 
strike a friendly and converstional tone. If the passage is irrelevant to the answer, you may ignore it.

QUESTION: {query_oneline}
"""

# Add the retrieved documents to the prompt.
for passage in all_passages:
    passage_oneline = passage.replace("\n", " ")
    prompt += f"PASSAGE: {passage_oneline}\n"

print(prompt)

You are a helpful and informative bot that answers questions using text from the reference passage included below. 
Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. 
strike a friendly and converstional tone. If the passage is irrelevant to the answer, you may ignore it.

QUESTION: which is the scariest movie?
 The film ends with this narration:, Casey and Geoffrey push another clown out a window to his death. Casey and Geoffrey find Randy unconscious in a closet and drag him into another room. Geoffrey is then attacked and presumably killed by the final clown, who chases Casey into the upstairs game room. Casey manages to hide for the time being, but after the clown leaves, Casey accidentally steps on a noise-making toy, alerting the clown of his presence. The enraged clown attempts to break Casey's neck, but he is then killed by Geoffrey (who survived the clown's attack), slamming a hatchet into the killer's back, and the t

use the generate_content method to to generate an answer to the question.

In [12]:
answer = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=prompt)

Markdown(answer.text)

Well, according to the passage, the movie "Clownhouse" follows a boy named Casey, who is constantly influenced by his fear of clowns, and he is forced to face his fears when the clowns target his home.


Add everything in the function.

In [13]:
def get_answer(query: str) -> str:

    embed_fn.document_mode = False
    result = db.query(query_texts=[query], n_results=1)
    if result and result["documents"] and result["documents"][0]:
        [all_passages] = result["documents"]
        query_oneline = query.replace("\n", " ")

        prompt = f"""You are a helpful and informative bot that answers questions using text from the reference passage included below.
        Be sure to respond in a complete sentence, being comprehensive, including all relevant background information.
        strike a friendly and converstional tone. If the passage is irrelevant to the answer, you may ignore it.

        QUESTION: {query_oneline}
        """

        for passage in all_passages:
            passage_oneline = passage.replace("\n", " ")
            prompt += f"PASSAGE: {passage_oneline}\n"

        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=prompt
        )
        return response.text
    else:
        return "No relevant information found in the database."


# 1. Define your query:
your_query = "which is the scariest movie?"

# 2. Call the get_answer function:
answer_text = get_answer(your_query)

# 3. Print the answer in Markdown format:
print("```markdown")
print(answer_text)
print("```")

# --- Example with another query ---
another_query = "What is the plot of a famous action movie?"
another_answer = get_answer(another_query)
print("\n```markdown")
print(another_answer)
print("```")
    

    

```markdown
Well, based on the passage, the movie "Clownhouse" tells the story of Casey, a boy with an intense fear of clowns, who must face his fears when psychotic mental patients dressed as clowns target his home, so if you are looking for a scary movie featuring clowns, this could be a good choice for you!

```

```markdown
The movie Commando follows retired United States Special Forces Colonel John Matrix, whose daughter is kidnapped by mercenaries led by a former dictator named Arius. Arius blackmails Matrix into carrying out a political assassination in Val Verde, but Matrix escapes and, with the help of a flight attendant named Cindy, sets out to rescue his daughter, Jenny, leading to a climactic showdown at Arius's island hideout where Matrix defeats the villains and saves his daughter.

```


Install dependencies for Langgraph.

In [14]:
!pip uninstall -qqy kfp jupyterlab libpysal thinc spacy fastai ydata-profiling google-cloud-bigquery google-generativeai
!pip install -qU 'langgraph==0.3.21' 'langgraph-prebuilt==0.1.7'

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.2/47.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.6/223.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h

### `rag_qa_node` – Handling RAG-Based Question Answering

This function defines a node in a Retrieval-Augmented Generation (RAG) pipeline.  
It extracts the latest user query from the chat state and passes it to a `get_answer()` function to retrieve an appropriate response.  
The assistant’s answer is appended to the conversation history.  
The updated state is then returned to continue the dialogue flow.


In [15]:
def rag_qa_node(state):
    query = state["messages"][-1]["content"]
    answer = get_answer(query)  # Your existing function
    state["messages"].append({"role": "assistant", "content": answer})
    return state

### Using LangGraph

Creates a simple movie-focused chatbot using `LangGraph`. Here's what each part does:

1. **Chat State Definition**  
   A `ChatState` Pydantic model is defined to maintain the list of messages exchanged during the chat.

2. **RAG QA Node Logic**  
   The `rag_qa_node` checks if the user's query is movie-related and retrieves an answer using `get_answer()`. If no meaningful response is found, it uses a fallback message.

3. **LangGraph Flow**  
   A `StateGraph` is constructed using LangGraph. The `rag_qa_node` is registered as the sole node, serving both as the entry and end point. The app is compiled to handle chat execution.

4. **Running the Chatbot**  
   The `start_chat()` function starts an interactive session, where user queries are sent to the LangGraph-powered assistant. It continues until the user types `exit` or `quit`.


In [16]:
from langgraph.graph import StateGraph, END
from pydantic import BaseModel
from typing import List, Dict
from IPython.display import Image, display


# Define the ChatState model
class ChatState(BaseModel):
    messages: List[Dict[str, str]]

# Fallback function for unanswered questions
def fallback():
    return "Sorry, My knowledge is limited to movie plots only. Could you ask something else?"

# RAG QA Node that checks if the question is movie-related
def rag_qa_node(state: ChatState) -> ChatState:
    if not state.messages:
        return state  # Return the state as is if no messages exist

    # Extract the query (the last user message)
    query = state.messages[-1]["content"]
    
    # Placeholder for the answer (get_answer function should be defined elsewhere)
    answer = get_answer(query)
    
    # If no answer found or the answer is generic, fallback to default response
    if not answer or "no information" in answer.lower():
        answer = fallback()
    
    # Add the assistant's response to the message list
    updated_messages = state.messages + [{"role": "assistant", "content": answer}]
    return ChatState(messages=updated_messages)

# Initialize LangGraph StateGraph with ChatState as the schema
graph = StateGraph(ChatState)

# Add the rag_qa_node as a node in the graph
graph.add_node("RAG_QA", rag_qa_node)

# Set the entry point and finish point for the graph
graph.set_entry_point("RAG_QA")
graph.set_finish_point("RAG_QA")

# Compile the app
app = graph.compile()

# Function to start the chat
def start_chat():
    # Initialize the initial state with no messages
    state = ChatState(messages=[])

    print("🎬 MovieBot is ready! Ask me about any movie plot. Type 'exit' to quit.")

    while True:
        # Get user input
        user_input = input("You: ")
        
        if user_input.lower() in ["exit", "quit"]:
            print("👋 Bye!")
            break

        # Update the state with the user's message
        state_dict = {"messages": state.messages + [{"role": "user", "content": user_input}]}

        # Invoke the graph with the updated state
        result = app.invoke(state_dict)

        # Print the assistant's response
        print("Bot:", result["messages"][-1]["content"]) 

        # Update the state to include the new assistant's message
        state = ChatState(messages=result["messages"])


if __name__ == "__main__":
    start_chat()


🎬 MovieBot is ready! Ask me about any movie plot. Type 'exit' to quit.


You:  hi can u tell me about movie Bait?


Bot: Certainly! The movie *Bait* is about a middle-aged man named Marko, played by Haas, who has been searching for a lost gold mine for nearly 20 years, and to share expenses for a prospecting expedition he teams up with a bright young man, Ray Brighton, played by Agar. After they find the mine, Marko decides he doesn't want to share with his partner and plans to murder him by spending the winter together in a shack far from civilization with Marko's trashy young wife, played by Moore, so he can catch them in adultery and use the "unwritten law" to kill Brighton and escape punishment, but the plot backfires.



You:  quit


👋 Bye!
