In [1]:
import json
import os

import pandas as pd

In [2]:
# Loading the datasets
movies_df = pd.read_csv('../data/tmdb_5000_movies.csv')
credits_df = pd.read_csv('../data/tmdb_5000_credits.csv')

# Merge on the TMDB movie id instead of the title. Titles are not unique
# (remakes and unrelated films share names), so a title join pairs every
# same-titled movie with every same-titled credits row and duplicates them.
# NOTE(review): assumes the standard TMDB 5000 schema, i.e. movies has `id`
# and credits has `movie_id` -- confirm against the CSV headers.
# The credits copy of `title` is dropped so the result keeps one `title` column;
# `validate` makes pandas raise if either side has a repeated id.
df = movies_df.merge(
    credits_df.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
    validate='one_to_one',
)

# Time to view the data (some of it)
df[['title', 'overview', 'cast', 'crew']].head()

Unnamed: 0,title,overview,cast,crew
0,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,Spectre,A cryptic message from Bond’s past sends him o...,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,John Carter,"John Carter is a war-weary, former military ca...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [3]:
# A RAG pipeline retrieves chunks of text rather than rows of a raw table.
# Each movie (title, overview, cast, etc.) is therefore flattened into one retrievable text block, for example:

# "Title: Inception\nGenres: Action, Sci-Fi\nOverview: A thief who enters dreams...\nCast: Leonardo DiCaprio, Joseph Gordon-Levitt..."

# Columns such as `genres` are stored as strings that look like lists of dictionaries.
df['genres'].iloc[0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [4]:
# Let’s parse them
import ast

# Helper to extract names from JSON-like lists
def extract_names(json_str, top_n=None):
    """Return the ``name`` of each entry in a serialized list as one string.

    Parameters
    ----------
    json_str : str
        Text encoding a list of dictionaries, e.g.
        ``'[{"id": 28, "name": "Action"}]'``. It is parsed as JSON first, so
        ``null`` / ``true`` / ``false`` values are understood. Python-literal
        syntax (single-quoted strings) is accepted as a fallback.
    top_n : int or None, optional
        Keep only the first ``top_n`` names. ``None`` (the default) keeps all
        of them; ``0`` keeps none.

    Returns
    -------
    str
        The names joined by ``', '``. An empty string is returned when the
        input is missing (e.g. NaN), cannot be parsed, or is not a list.
        Entries without a non-empty string ``name`` are skipped.
    """
    # Missing cells arrive as float NaN / None; there is nothing to parse.
    if not isinstance(json_str, str):
        return ''

    try:
        items = json.loads(json_str)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        try:
            # Fallback for Python-repr style input such as "[{'name': 'A'}]".
            items = ast.literal_eval(json_str)
        except (ValueError, SyntaxError):
            return ''

    if not isinstance(items, (list, tuple)):
        return ''

    names = []
    for item in items:
        name = item.get('name') if isinstance(item, dict) else None
        if isinstance(name, str) and name:
            names.append(name)

    # Compare against None so that top_n=0 means "no names", not "no limit".
    if top_n is not None:
        names = names[:top_n]
    return ', '.join(names)

In [5]:
# Flatten the JSON-like columns into comma-separated name strings:
# every genre, but only the first five cast and crew entries.
df['genres_clean'] = df['genres'].apply(extract_names)
df['cast_clean'] = df['cast'].apply(extract_names, top_n=5)
df['crew_clean'] = df['crew'].apply(extract_names, top_n=5)

In [6]:
# Let’s now convert each row into a single string block
def build_movie_document(row):
    """Render one movie row as a plain-text block for embedding.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys ``'title'``, ``'genres_clean'``,
        ``'overview'``, ``'cast_clean'`` and ``'crew_clean'`` (a DataFrame row
        or a plain dict).

    Returns
    -------
    str
        Five ``Label: value`` lines joined by newlines, with no leading or
        trailing whitespace and no indentation. Missing values (None / NaN)
        are rendered as an empty value instead of the text "nan".
    """
    def _as_text(value):
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value).strip()

    fields = (
        ('Title', row['title']),
        ('Genres', row['genres_clean']),
        ('Overview', row['overview']),
        ('Cast', row['cast_clean']),
        ('Crew', row['crew_clean']),
    )
    # Joining explicit lines avoids the indentation and blank lines that an
    # indented triple-quoted string would bake into every document.
    return '\n'.join(f"{label}: {_as_text(value)}" for label, value in fields)
    
# Build one text block per row (axis='columns' hands each row to the function)
df['document'] = df.apply(build_movie_document, axis='columns')

In [7]:
# Inspect the text block generated for the first movie
print(df['document'].iat[0])


    Title: Avatar
    Genres: Action, Adventure, Fantasy, Science Fiction
    Overview: In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.
    Cast: Sam Worthington, Zoe Saldana, Sigourney Weaver, Stephen Lang, Michelle Rodriguez
    Crew: Stephen E. Rivkin, Rick Carter, Christopher Boyes, Christopher Boyes, Mali Finn
    


In [8]:
# Time to do embedding
from sentence_transformers import SentenceTransformer

# Name of the pre-trained sentence-embedding model to load (small & fast)
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# One text block per movie, in DataFrame row order
documents = df['document'].to_list()

# Encode every document into a vector (with a progress bar)
embeddings = model.encode(documents, show_progress_bar=True)

# documents[i]  -> natural-language chunk for movie i
# embeddings[i] -> 384-dimensional vector representing that chunk

Batches: 100%|██████████| 151/151 [00:14<00:00, 10.51it/s]


In [10]:
# Using ChromaDb
import chromadb
from chromadb.config import Settings

# Chroma client with default settings. It is named `chroma_client` so the
# name `client` is used for one thing only in this notebook (the OpenAI client).
chroma_client = chromadb.Client(Settings())

# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when a collection called "movies" already exists.
collection = chroma_client.get_or_create_collection(name="movies")

In [11]:
# Store every document with its embedding in the vector DB.
# Each id is the document's position in `documents`, as a string.
document_ids = list(map(str, range(len(documents))))
collection.add(
    ids=document_ids,
    documents=documents,
    embeddings=embeddings,
)

In [12]:
# Handling an input query
query = "What is Shutter Island About?"

# Embed the query with the same model used for the documents.
# encode() receives a batch of one, so unpack its single row.
(query_embedding,) = model.encode([query])

# Ask Chroma for the most similar documents
results = collection.query(
    n_results=3,  # top 3 most similar documents
    query_embeddings=[query_embedding],
)


# The result is something like:
#   results = {
#       'documents': [['Title: Inception\nGenres: ...\nCrew: Christopher Nolan']],
#       'ids': [['42']],
#       'distances': [[0.12]]
#   }

In [13]:
# Pull the retrieved chunks out of the query result.
# 'documents' is indexed by query first; only one query was sent.
documents_per_query = results['documents']
retrieved_context = documents_per_query[0]  # list of top 3 chunks

# Separate the chunks with a blank line to form the LLM context
context_text = "\n\n".join(retrieved_context)

In [None]:
# Time to give this output to an LLM
import openai

# Read the OpenRouter API key from the environment instead of hardcoding it,
# so the secret never ends up in the saved notebook or in version control.
# A missing OPENROUTER_API_KEY raises a KeyError that names the variable.
client = openai.OpenAI(
    api_key=os.environ["OPENROUTER_API_KEY"],
    base_url="https://openrouter.ai/api/v1"
)

In [17]:
# Chat messages: a fixed system persona plus the question and retrieved context
system_message = {"role": "system", "content": "You are a helpful movie expert."}
user_message = {"role": "user", "content": f"""Answer the following question using the context below.

        Question: {query}

        Context:
        {context_text}
        """}

# Send the conversation to the model
response = client.chat.completions.create(
    model="meta-llama/llama-3.3-8b-instruct:free",
    messages=[system_message, user_message],
)

# Show the text of the first completion choice
answer = response.choices[0].message.content
print(answer)

Shutter Island is a Drama, Thriller, Mystery movie about a World War II soldier-turned-U.S. Marshal named Teddy Daniels who investigates the disappearance of a patient from a hospital for the criminally insane. However, his investigation is complicated by his own troubled visions and a mysterious doctor.
