# Code for putting together: 
- creating documents
- chunking them
- indexing them and creating retrievers with FAISS versus in-memory
- testing:
    -  searches with queries over all chunks (movie plots and reviews)
    -  different chunking and faiss versus in-memory
    -  dense versus sparse versus hybrid retrievers
    -  hybrid retriever with different alphas
    -  full RAG pipeline with gpt message completions

In [1]:
import os
import time as tm

import numpy as np
import pandas as pd
from path import Path

from src.data.document_creators import create_plot_docs, create_review_docs
from src.data.chunk import chunk
from src.retrievers.dense_retriever import FaissDenseRetriever
from src.retrievers.sparse_retriever import BM25SparseRetriever
from src.retrievers.hybrid_retriever import HybridRetriever
from src.retrievers.in_memory_dense_retriever import InMemoryDenseRetriever
from src.retrievers.base import BaseRetriever


In [2]:
import random

In [3]:
# Load data
print("\n1. Loading movie data...")
# Directory holding the preprocessed CSVs. Set the MOVIE_RAG_DATA_DIR environment
# variable to run this notebook on another machine; the fallback is the original
# local path, so existing setups keep working unchanged.
path = Path(os.environ.get('MOVIE_RAG_DATA_DIR', '/Users/saghar/Desktop/movie-rag/datasets/rotten-tomatoes-reviews/prep'))
plots_df = pd.read_csv(path / 'movie_plots.csv')
reviews_df = pd.read_csv(path / 'reviews_w_movies_full.csv')
# Derive a release_year column from the full release date in both frames
plots_df['release_year'] = pd.to_datetime(plots_df['original_release_date']).dt.year
reviews_df['release_year'] = pd.to_datetime(reviews_df['original_release_date']).dt.year
print(f"Loaded {len(plots_df)} movies and {len(reviews_df)} reviews")

# Create documents
print("\n2. Creating documents...")
# Columns passed to create_review_docs as the movie identifier
# (presumably the key reviews are grouped by -- confirm in document_creators).
movie_id_cols = ['rotten_tomatoes_link', 'movie_title', 'release_year']
# Fields rendered into the document text itself; the sample results further down
# show them as "Movie title: ...", "Release year: ...", etc.
text_metadata_cols = ['movie_title', 'release_year', 'directors', 'genres', 'content_rating', 'runtime', 'tomatometer_rating', 'box_office', 'awards', 'imdb_rating', 'audience_rating', 'actors']
# Fields handed over as structured metadata
# (presumably stored under each document's 'metadata' dict -- confirm in document_creators).
obj_metadata_cols = ['rotten_tomatoes_link', 'movie_title', 'release_year', 'original_release_date', 'authors', 'actors', 'production_company', 'genres', 'imdb_rating', 'box_office', 'content_rating', 'runtime', 'tomatometer_rating', 'tomatometer_count', 'audience_rating', 'audience_count', 'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count']

plot_docs = create_plot_docs(plots_df, text_metadata_cols, obj_metadata_cols)
review_docs = create_review_docs(reviews_df, text_metadata_cols, obj_metadata_cols, movie_id_cols)

# Plot and review documents are searched together from here on
all_docs = plot_docs + review_docs


1. Loading movie data...
Loaded 6432 movies and 762263 reviews

2. Creating documents...
Created 6257 plot docs.
Created 8075 review docs.


In [4]:
# Chunk
# Split every plot/review document with the "sentence" strategy; all_chunks is
# the pool that the retriever cells below sample from.
all_chunks = chunk("sentence", all_docs) # because semantic will be slow
print(f"Created {len(all_chunks)} chunks from {len(all_docs)} documents\n")


Chunking documents...
Created 172538 chunks from 14332 documents



In [5]:
# Set up retrievers (FAISS and in-memory)
# Only a random subset of the chunks is embedded and indexed. The subset is drawn
# from a private, seeded RNG so a fresh "Restart & Run All" indexes the same
# chunks and the search results below are reproducible.
SAMPLE_SEED = 42
SAMPLE_SIZE = 5000
faiss_retriever = FaissDenseRetriever(embedding_model="text-embedding-3-small", embedding_provider="openai")
print("\nCreating FaissDenseRetriever...")
# min(...) keeps sample() from raising ValueError when fewer chunks are available
faiss_sample = random.Random(SAMPLE_SEED).sample(all_chunks, min(SAMPLE_SIZE, len(all_chunks)))
faiss_retriever.add_documents(faiss_sample)

#memory_retriever = InMemoryDenseRetriever(embedding_model="text-embedding-3-small", embedding_provider="openai")
#memory_retriever.add_documents(random.sample(all_chunks, 1000))

Loading embedding model: text-embedding-3-small (provider: openai)
✓ Model loaded (dimension: 1536)
✓ FaissDenseRetriever initialized (index_type=flat)

Creating FaissDenseRetriever...
Generating embeddings for 5000 documents...
Embeddings generated
Saving index...
✓ Added 5000 documents to FAISS index
  Index size: 5000


In [61]:
def run(all_chunks: list[dict], retriever: BaseRetriever, test_queries: list[str], save: bool = False, k: int = 3):
    """Run each query through ``retriever``, print the hits, and report timing.

    Args:
        all_chunks: Chunks indexed by the retriever. Only ``len(all_chunks)``
            is used, for the summary line.
        retriever: Object whose ``search(query, k=k)`` returns an iterable of
            ``(chunk, score)`` pairs. Each chunk is a dict with a ``'text'``
            entry and a ``'metadata'`` dict containing ``'chunk_id'`` and
            ``'total_chunks'``.
        test_queries: Query strings to search for, in order.
        save: When True and ``retriever`` is a ``FaissDenseRetriever``, save it
            to ``models/dense_retriever``, load it into a fresh instance and
            run a one-result smoke search.
        k: Number of results requested per query.
    """
    print("\n Testing searches...")

    elapsed = []  # per-query search latency in seconds

    for query in test_queries:
        print(f"\n{'=' * 60}")
        print(f"Query: '{query}'")
        print('=' * 60)

        # perf_counter is monotonic and high resolution, so the measured
        # interval is not affected by system clock adjustments.
        started = tm.perf_counter()
        results = retriever.search(query, k=k)
        elapsed.append(tm.perf_counter() - started)

        for rank, (hit, score) in enumerate(results, 1):
            meta = hit['metadata']
            print(f"\nResult {rank} (score={score:.4f}):")
            print(f"  Movie: {meta.get('movie_title', 'Unknown')}")
            print(f"  Year: {meta.get('release_year', 'N/A')}")
            # chunk_id is zero-based; show it one-based next to the total
            print(f"  Chunk of movie: {meta['chunk_id'] + 1}/{meta['total_chunks']}")
            print(f"  Text: {hit['text']}")

    # The summary is printed once, after all queries, rather than after each one.
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    if elapsed:
        print(f"Average time: {np.mean(elapsed) * 1000:.2f}ms")
    else:
        # np.mean([]) would emit a RuntimeWarning and print "nan"
        print("Average time: n/a (no queries were run)")
    print(f"Total chunks indexed: {len(all_chunks)}")

    if save and isinstance(retriever, FaissDenseRetriever):
        # Save retriever
        print("\nSaving retriever...")
        retriever.save("models/dense_retriever")

        # Test loading
        print("\nTesting load...")
        new_retriever = FaissDenseRetriever()
        new_retriever.load("models/dense_retriever")

        results = new_retriever.search("dreams", k=1)
        if results:
            print(f"✓ Loaded retriever works: {results[0][0]['metadata']['movie_title']}")
        else:
            # Report the failure instead of dying with an IndexError
            print("✗ Loaded retriever returned no results for the smoke query")

Test retrieval on some queries

In [23]:
# Three broad, theme-style queries for the first retrieval check
test_queries = [
    "movies about dreams and reality",
    "science fiction with time travel",
    "romantic comedy in New York",
]

In [24]:
# Label the configuration under test (plain strings: nothing is interpolated,
# so the f-prefix was dropped).
print("sentence chunking, openai embeddings\n")
print("running faiss retriever\n")
# save=True also exercises the save/load round trip inside run()
run(faiss_retriever.chunks, faiss_retriever, test_queries, save=True)

#print(f"\n\nrunning in-memory retriever\n")
#run(memory_retriever.chunks, memory_retriever, test_queries)

sentence chunking, openai embeddings

running faiss retriever


5. Testing searches...

Query: 'movies about dreams and reality'

Result 1 (score=-0.9902):
  Movie: What Dreams May Come
  Year: 1998
  Chunk of movie: 1/4
  Text: Movie title: What Dreams May Come
Release year: 1998
Directors: Vincent Ward
Genres: Drama, Science Fiction & Fantasy, Romance
Content rating: PG-13
Runtime: 113.0
Tomatometer rating: 54.0
Box office: $55,382,927
Awards: Won 1 Oscar. 7 wins & 2 nominations total
Imdb rating: 7.0
Audience rating: 84.0
Actors: Robin Williams, Cuba Gooding Jr., Annabella Sciorra, Max von Sydow, Jessica Brooks Grant, Josh Paddock, Rosalind Chao, Lucinda Jenney, Maggie McCarthy, Wilma Bonet, Matt Salinger, Carin Sprague, June Lomena, Paul P. Card IV, Werner Herzog, Clara Thomas, Benjamin Brock


Plot: During a holiday in Switzerland, a young Chris Nielsen meets Annie Collins in a lake when their boats collide. Sharing a snack a few hours later, Chris and Annie fall in love. Marrying

In [25]:
# Conversational queries that mix an actor, review sentiment and runtime hints.
# NOTE(review): "Brad Pit" is presumably a misspelling of "Brad Pitt"; it is kept
# verbatim because the recorded output below was produced with this exact text.
test_queries = [
    "a fun, easy watch movie preferably romantic with Brad Pit in it",
    "scary movie with negative reviews",
    "make me laugh but keep the movie short",
]
# Plain strings: nothing is interpolated, so the f-prefix was dropped
print("sentence chunking, openai embeddings\n")
print("running faiss retriever\n")
run(faiss_retriever.chunks, faiss_retriever, test_queries)

sentence chunking, openai embeddings

running faiss retriever


5. Testing searches...

Query: 'a fun, easy watch movie preferably romantic with Brad Pit in it'

Result 1 (score=-1.0055):
  Movie: Mr. & Mrs. Smith
  Year: 2005
  Chunk of movie: 33/43
  Text: Review: Pitt and Jolie have chemistry to burn, and the film hits more often than it misses.
Review: Palpable sexual electricity between Brad Pitt and Angelina Jolie provides Mr. and Mrs. Smith with all the power it needs to overcome a very silly plot.
Review: A braver movie would have seen the couple's standoff through to its logical conclusion -- the eventual insistence on happily ever after seems more than a little weaselly.
Review: Frisky and subversive, the movie argues that it's only after destroying icons of domesticity (and literally blowing up the McMansion) we can finally get to know one another.
Review: Hallelujah, it's fun.

Result 2 (score=-1.1197):
  Movie: Stardust
  Year: 2007
  Chunk of movie: 30/39
  Text: Review: 

__TODO__: try different chunking strategies and local Hugging Face embedding models

In [None]:
#faiss_retriever = FaissDenseRetriever(embedding_model="all-MiniLM-L6-v2", embedding_provider="sentence-transformers")
#faiss_retriever.add_documents(all_chunks)
#memory_retriever = InMemoryDenseRetriever(embedding_model="all-MiniLM-L6-v2", embedding_provider="sentence-transformers")
#memory_retriever.add_documents(all_chunks)

#print(f"semantic chunking, sentence-transformers embeddings\n")

#run(all_chunks, faiss_retriever, test_queries)
#run(all_chunks, memory_retriever, test_queries)

## Now let's try hybrid retrievers!

In [55]:
# Test queries (mix of semantic and keyword-based)
test_queries = [
    "movies about artificial intelligence turning against humans",  # Semantic query
    "films where childhood friends grow apart",  # Semantic query
    "inception nolan 2010",             # Keyword query
    "T-800 Terminator model",           # Keyword query
    "sci-fi films with time travel and paradoxes",  # Hybrid query
    "romantic movies set in Paris"  # Hybrid query
]

In [71]:
# Create retrievers
print("\nCreating retrievers...")

hybrid = HybridRetriever(embedding_model="text-embedding-3-small", embedding_provider="openai", strategy="hybrid", hybrid_alpha=0.5)

# Add documents
print("\nIndexing documents...")
hybrid.add_documents(random.sample(all_chunks, 5000))


Creating retrievers...
Loading embedding model: text-embedding-3-small (provider: openai)
✓ Model loaded (dimension: 1536)
✓ FaissDenseRetriever initialized (index_type=flat)
✓ SparseRetriever initialized (k1=1.5, b=0.75)
✓ HybridRetriever initialized
  Strategy: hybrid
  Dense backend: faiss
  Hybrid alpha: 0.5
  Dense weight: 0.50
  Sparse weight: 0.50

Indexing documents...
Tokenizing 5000 documents...
Building BM25 index...
✓ Added 5000 documents to BM25 index
Generating embeddings for 5000 documents...
Embeddings generated
Saving index...
✓ Added 5000 documents to FAISS index
  Index size: 5000
✓ Added 5000 documents to HybridRetriever


In [72]:
# Banner for the strategy comparison section
print("="*60)
print("Demo: Hybrid Search (Dense + Sparse)")
print("="*60)

# Compare retrieval methods
print("\n Comparing retrieval methods...")

# Dense only
# The same HybridRetriever instance is reused for every strategy; only its
# strategy is switched before the shared test queries are run.
print("\n[Dense Only - Semantic]")
hybrid.switch_strategy("dense")
run(hybrid.chunks, hybrid, test_queries)


Demo: Hybrid Search (Dense + Sparse)

 Comparing retrieval methods...

[Dense Only - Semantic]
✓ Switched strategy from hybrid to dense

 Testing searches...

Query: 'movies about artificial intelligence turning against humans'
confirming I'm doing dense search

Result 1 (score=-1.0747):
  Movie: A.I. Artificial Intelligence
  Year: 2001
  Chunk of movie: 8/40
  Text: Review: [A] fascinating wreck.
Review: A seething psychological bonanza.
Review: A good deal of A.I. is striking and memorable, but it's a distinctly uneven picture whose reach exceeds its grasp.
Review: Involving and exasperating, stressing that the desired fusion of two spirits could, in this case at least, generate only friction.
Review: We become truly human only when we can both give and receive love: Also sprach Spielberg.

Result 2 (score=-1.0801):
  Movie: I, Robot
  Year: 2004
  Chunk of movie: 29/46
  Text: Review: The movie is at least a bit less annoying than director Alex Proyas' previous two genre attempts.


I'm impressed. For the query about friends who grow apart, dense search matched a plot about friendships that grow stale. Nice! The matched text: "With Cole's forbidden relationship intensifying and his friendships unraveling, he must choose between love, loyalty, and the future he is destined for."

In [73]:
# Sparse only
# Same retriever and queries as the dense run, switched to the "sparse" strategy
print("\n[Sparse Only - BM25 Keywords]")
hybrid.switch_strategy("sparse")
run(hybrid.chunks, hybrid, test_queries)


[Sparse Only - BM25 Keywords]
✓ Switched strategy from dense to sparse

 Testing searches...

Query: 'movies about artificial intelligence turning against humans'
confirming I'm doing sparse search
['movies', 'about', 'artificial', 'intelligence', 'turning', 'against', 'humans']
[0. 0. 0. 0. 0.]

Result 1 (score=11.5401):
  Movie: Against the Ropes
  Year: 2004
  Chunk of movie: 3/26
  Text: Review: Somewhere along the way, I found myself enjoying Against the Ropes, not in spite of my better judgment, but because something in the movie defies the whole notion of better judgment altogether.
Review: If you like boxing or know a little something about the sport, prepare to have your intelligence insulted.
Review: If Against the Ropes was a boxer, it ... would be a lightweight. The movie spends more time displaying Jackie as a publicity-hungry celebrity ... than as a boxing manager.
Review: Against The Ropes is the latest sucker punch that arrives in the arena of unbearable boxing sagas c

Incredibly fast!

In [74]:
# Hybrid
# Switch back to the combined strategy; the retriever was created with hybrid_alpha=0.5
print("\n[Hybrid - Combined]")
hybrid.switch_strategy("hybrid")
run(hybrid.chunks, hybrid, test_queries)


[Hybrid - Combined]
✓ Switched strategy from sparse to hybrid

 Testing searches...

Query: 'movies about artificial intelligence turning against humans'
confirming I'm doing dense search
confirming I'm doing sparse search
['movies', 'about', 'artificial', 'intelligence', 'turning', 'against', 'humans']
[0. 0. 0. 0. 0.]

Result 1 (score=5.9534):
  Movie: A.I. Artificial Intelligence
  Year: 2001
  Chunk of movie: 8/40
  Text: Review: [A] fascinating wreck.
Review: A seething psychological bonanza.
Review: A good deal of A.I. is striking and memorable, but it's a distinctly uneven picture whose reach exceeds its grasp.
Review: Involving and exasperating, stressing that the desired fusion of two spirits could, in this case at least, generate only friction.
Review: We become truly human only when we can both give and receive love: Also sprach Spielberg.

Result 2 (score=5.9507):
  Movie: I, Robot
  Year: 2004
  Chunk of movie: 29/46
  Text: Review: The movie is at least a bit less annoyi

In [76]:
# Test different alpha values
print("\n" + "="*60)
print("Testing different alpha values")
print("="*60)

# NOTE(review): "frienship" looks like a typo for "friendship"; it is left
# untouched because the recorded output below was produced with this spelling.
query = "frienship that never dies"
alphas = [0.0, 0.3, 0.5, 0.7, 1.0]  # 0 = sparse only, 1 = dense only

# Re-run the same single query once per weight. Note that the retriever is left
# at the last alpha in the list once the loop finishes.
for alpha_val in alphas:
    print(f"\nalpha={alpha_val} (dense={alpha_val:.1f}, sparse={1-alpha_val:.1f})")
    hybrid.set_hybrid_weight(alpha_val)
    run(hybrid.chunks, hybrid, [query])



Testing different alpha values

alpha=0.0 (dense=0.0, sparse=1.0)
✓ Set hybrid alpha to 0.0

 Testing searches...

Query: 'frienship that never dies'
confirming I'm doing dense search
confirming I'm doing sparse search
['frienship', 'that', 'never', 'dies']
[2.03966424 2.93370049 1.91611948 0.         3.21531766]

Result 1 (score=13.3378):
  Movie: Tomorrow Never Dies
  Year: 1997
  Chunk of movie: 15/19
  Text: Review: In Tomorrow Never Dies, the news is mostly good. And when it comes to movies, there's no news like good news.
Review: Good news is, Tomorrow Never Dies is highly entertaining. It will thrill novices and delight old-guard Bondphiles.
Review: Yeoh proves so much Bond's equal that they wind up sharing steering privileges on a death-defying motorcycle, her hand on the clutch and his on the brake, their other arms twined around each other. It's a pairing made in sequel heaven.
Review: In the latest James Bond, our hero saves the world from brand-name unawareness. Tomorrow N

Because this query was very semantic, alpha = 0.7 worked OK. What if the query is not that deep?

In [77]:
# Test different alpha values
# Same sweep as the previous cell, this time with a short keyword-style query.
print("\n" + "="*60)
print("Testing different alpha values")
print("="*60)

query = "scifi funny"
alphas = [0.0, 0.3, 0.5, 0.7, 1.0]  # 0 = sparse only, 1 = dense only

# Re-run the same single query once per weight. Note that the retriever is left
# at the last alpha in the list once the loop finishes.
for alpha_val in alphas:
    print(f"\nalpha={alpha_val} (dense={alpha_val:.1f}, sparse={1-alpha_val:.1f})")
    hybrid.set_hybrid_weight(alpha_val)
    run(hybrid.chunks, hybrid, [query])



Testing different alpha values

alpha=0.0 (dense=0.0, sparse=1.0)
✓ Set hybrid alpha to 0.0

 Testing searches...

Query: 'scifi funny'
confirming I'm doing dense search
confirming I'm doing sparse search
['scifi', 'funny']
[0. 0. 0. 0. 0.]

Result 1 (score=7.1392):
  Movie: Serenity
  Year: 2005
  Chunk of movie: 9/38
  Text: Review: Whedon's six-gun space oddity comes recommended, but you may want to brush up on the series before venturing into the theater.
Review: A good old-fashioned space opera.
Review: A likeable cast with crackling chemistry, + genre mash-ups, tough/clever wisecracks, and expert seasoning of action with humor and humor with action.
Review: Serenity is a brash, funny, action-packed bit of sci-fi ecstasy.
Review: In its own unassuming, self-effacing way, Serenity is the epic sci-fi adventure that the latter years of Star Wars could only dream of being.

Result 2 (score=5.4180):
  Movie: Willard
  Year: 2003
  Chunk of movie: 2/25
  Text: Review: Willard is unlike

I still like the semantic ones better, but even alpha = 0.3 wasn't too bad.

## Now let's look at full RAG pipeline

In [6]:
def run_rag(all_chunks: list[dict], retriever: BaseRetriever, test_queries: list[str], k: int = 3):
    """Retrieve and generate an answer for each query, then report timing.

    Args:
        all_chunks: Chunks indexed by the retriever. Only ``len(all_chunks)``
            is used, for the summary line.
        retriever: Object whose ``search(query, k=k)`` returns an iterable of
            ``(chunk, score)`` pairs and whose ``generate(query, results)``
            returns a mapping with an ``'answer'`` entry. Each chunk is a dict
            with a ``'text'`` entry and a ``'metadata'`` dict containing
            ``'chunk_id'`` and ``'total_chunks'``.
        test_queries: Query strings to answer, in order.
        k: Number of chunks retrieved per query.
    """
    print("\n Testing searches...")

    elapsed = []  # per-query retrieval + generation latency in seconds

    for query in test_queries:
        print(f"\n{'=' * 60}")
        print(f"Query: '{query}'")
        print('=' * 60)

        # The timed span covers both retrieval and answer generation.
        # perf_counter is monotonic and high resolution, so the measured
        # interval is not affected by system clock adjustments.
        started = tm.perf_counter()
        results = retriever.search(query, k=k)
        answer = retriever.generate(query, results)['answer']
        elapsed.append(tm.perf_counter() - started)

        for rank, (hit, score) in enumerate(results, 1):
            meta = hit['metadata']
            print(f"\nResult {rank} (score={score:.4f}):")
            print(f"  Movie: {meta.get('movie_title', 'Unknown')}")
            print(f"  Year: {meta.get('release_year', 'N/A')}")
            # chunk_id is zero-based; show it one-based next to the total
            print(f"  Chunk of movie: {meta['chunk_id'] + 1}/{meta['total_chunks']}")
            print(f"  Text: {hit['text']}")
        print(f"\nFinal answer: {answer}")

    # The summary is printed once, after all queries, rather than after each one.
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    if elapsed:
        print(f"Average time: {np.mean(elapsed) * 1000:.2f}ms")
    else:
        # np.mean([]) would emit a RuntimeWarning and print "nan"
        print("Average time: n/a (no queries were run)")
    print(f"Total chunks indexed: {len(all_chunks)}")

In [7]:
# Queries for the end-to-end RAG run (same three themes as the first retrieval check)
test_queries = [
    "movies about dreams and reality",
    "science fiction with time travel",
    "romantic comedy in New York",
]

In [8]:
# Full pipeline on the FAISS retriever: retrieve, generate an answer, print both
run_rag(faiss_retriever.chunks, faiss_retriever, test_queries)


 Testing searches...

Query: 'movies about dreams and reality'

Result 1 (score=-0.9750):
  Movie: Waking Life
  Year: 2001
  Chunk of movie: 18/30
  Text: Review: Forces a non-stop barrage of pseudo-deep theories and questions on the viewer that, more often than not, resemble a bunch of psycho-babbly hooey.
Review: The visuals are extraordinary, the kind of thing usually reserved for a 5-minute short. But we never get tired of watching it.
Review: a film for anyone who has ever wondered about life, about what it means to be human, about where the division between dreams and reality really is
Review: The overall effect of Waking Life is that of finally escaping a cocktail party full of ecstasy-laden philosophy students.
Review: ... perhaps the best approximation of what a real dream is.

Result 2 (score=-0.9902):
  Movie: What Dreams May Come
  Year: 1998
  Chunk of movie: 1/4
  Text: Movie title: What Dreams May Come
Release year: 1998
Directors: Vincent Ward
Genres: Drama, Science F