# End-to-end retrieval pipeline: creating documents, chunking them, indexing the chunks with a FAISS retriever versus an in-memory retriever, and testing search queries over all chunks (movie plots and reviews)

In [1]:
import os
import time as tm

import numpy as np
import pandas as pd
from path import Path

from src.data.chunk import chunk
from src.data.document_creators import create_plot_docs, create_review_docs
from src.retrievers.base import BaseRetriever
from src.retrievers.dense_retriever import FaissDenseRetriever
from src.retrievers.in_memory_dense_retriever import InMemoryDenseRetriever


In [2]:
# 1. Load data
print("\n1. Loading movie data...")
# Directory holding the preprocessed CSVs. Set the MOVIE_RAG_DATA_DIR environment variable to
# point at the data on another machine; the default is the original local location.
path = Path(os.environ.get(
    'MOVIE_RAG_DATA_DIR',
    '/Users/saghar/Desktop/movie-rag/datasets/rotten-tomatoes-reviews/prep',
))
plots_df = pd.read_csv(path / 'movie_plots.csv')
reviews_df = pd.read_csv(path / 'reviews_w_movies_full.csv')
# add a year column derived from the original release date
plots_df['release_year'] = pd.to_datetime(plots_df['original_release_date']).dt.year
reviews_df['release_year'] = pd.to_datetime(reviews_df['original_release_date']).dt.year
print(f"Loaded {len(plots_df)} movies and {len(reviews_df)} reviews")

# 2. Create documents
print("\n2. Creating documents...")
# Column lists handed to the document creators below.
movie_id_cols = ['rotten_tomatoes_link', 'movie_title', 'release_year']
text_metadata_cols = ['movie_title', 'release_year', 'directors', 'genres', 'content_rating', 'runtime', 'tomatometer_rating', 'box_office', 'awards', 'imdb_rating', 'audience_rating', 'actors']
obj_metadata_cols = ['rotten_tomatoes_link', 'movie_title', 'release_year', 'original_release_date', 'authors', 'actors', 'production_company', 'genres', 'imdb_rating', 'box_office', 'content_rating', 'runtime', 'tomatometer_rating', 'tomatometer_count', 'audience_rating', 'audience_count', 'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count']

plot_docs = create_plot_docs(plots_df, text_metadata_cols, obj_metadata_cols)
review_docs = create_review_docs(reviews_df, text_metadata_cols, obj_metadata_cols, movie_id_cols)

# Plot documents come first, review documents after them.
all_docs = plot_docs + review_docs


1. Loading movie data...
Loaded 6432 movies and 762263 reviews

2. Creating documents...
Created 6257 plot docs.
Created 8075 review docs.


In [3]:
def run(all_chunks: list[dict], retriever: BaseRetriever, test_queries: list[str], save: bool = False, k: int = 3):
    """Run each test query against ``retriever``, print the hits, and report timing once.

    Args:
        all_chunks: The chunks the caller indexed into ``retriever``. Only their count
            is used here, for the summary line.
        retriever: Object whose ``search(query, k=...)`` yields ``(chunk, distance)``
            pairs, where each chunk is a dict with ``'text'`` and ``'metadata'`` entries
            (metadata must contain ``'chunk_id'`` and ``'total_chunks'``).
        test_queries: Query strings to search for.
        save: When true and ``retriever`` is a ``FaissDenseRetriever``, save it to
            ``models/dense_retriever``, load it into a fresh instance and run a
            one-result search against the loaded copy.
        k: Number of results requested per query (defaults to 3).
    """
    # 5. Test searches
    print("\n5. Testing searches...")

    latencies = []

    for query in test_queries:
        print(f"\n{'=' * 60}")
        print(f"Query: '{query}'")
        print('=' * 60)

        # perf_counter() is a monotonic, high-resolution clock meant for measuring
        # intervals; wall-clock time() can jump if the system clock is adjusted.
        started = tm.perf_counter()
        results = retriever.search(query, k=k)
        latencies.append(tm.perf_counter() - started)

        # The loop variable is `hit`, not `chunk`, so it does not shadow the
        # imported chunk() helper.
        for rank, (hit, distance) in enumerate(results, 1):
            metadata = hit['metadata']
            print(f"\nResult {rank} (distance={distance:.4f}):")
            print(f"  Movie: {metadata.get('movie_title', 'Unknown')}")
            print(f"  Year: {metadata.get('release_year', 'N/A')}")
            # chunk_id is zero-based, hence the +1 for display
            print(f"  Chunk: {metadata['chunk_id'] + 1}/{metadata['total_chunks']}")
            print(f"  Text: {hit['text']}")

    # The summary is printed once, after all queries, rather than after every query.
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    if latencies:
        print(f"Average time: {np.mean(latencies) * 1000:.2f}ms")
    else:
        # np.mean([]) would emit a RuntimeWarning and print nan
        print("Average time: n/a (no queries)")
    print(f"Total chunks indexed: {len(all_chunks)}")

    if save and isinstance(retriever, FaissDenseRetriever):
        # Save retriever
        print("\nSaving retriever...")
        retriever.save("models/dense_retriever")

        # Test loading
        print("\nTesting load...")
        new_retriever = FaissDenseRetriever()
        new_retriever.load("models/dense_retriever")

        results = new_retriever.search("dreams", k=1)
        print(f"✓ Loaded retriever works: {results[0][0]['metadata']['movie_title']}")

In [4]:
# Chunk every document with the "sentence" strategy (chosen because semantic chunking will be slow).
all_chunks = chunk("sentence", all_docs)
print(f"Created {len(all_chunks)} chunks from {len(all_docs)} documents\n")


Chunking documents...
Created 172538 chunks from 14332 documents



In [9]:
# Build a FAISS-backed dense retriever configured for OpenAI's "text-embedding-3-small" model.
faiss_retriever = FaissDenseRetriever(embedding_model="text-embedding-3-small", embedding_provider="openai")
print("\nCreating FaissDenseRetriever...")
# Index only the first 1000 chunks (presumably to limit embedding time/cost — TODO confirm).
faiss_retriever.add_documents(all_chunks[:1000])

# In-memory counterpart over the same slice, currently disabled:
#memory_retriever = InMemoryDenseRetriever(embedding_model="text-embedding-3-small", embedding_provider="openai")
#memory_retriever.add_documents(all_chunks[:1000])

Loading embedding model: text-embedding-3-small (provider: openai)
✓ Model loaded (dimension: 1536)
✓ FaissDenseRetriever initialized (index_type=flat)

Creating FaissDenseRetriever...
Generating embeddings for 1000 documents...
Embeddings generated
Saving index...
✓ Added 1000 documents to FAISS index
  Index size: 1000


In [6]:
# Example queries used to exercise the retriever.
test_queries = [
    "movies about dreams and reality",
    "science fiction with time travel",
    "romantic comedy in New York",
]

In [7]:
# The chunks come from chunk("sentence", ...), so this run is labelled as sentence chunking.
print("sentence chunking, openai embeddings\n")
print("running faiss retriever\n")
# Pass the slice that was indexed so the reported chunk count matches the index;
# save=True also exercises the save/load branch of run().
run(all_chunks[:1000], faiss_retriever, test_queries, save=True)

#print(f"\n\nrunning in-memory retriever\n")
#run(all_chunks, memory_retriever, test_queries)

semantic chunking, openai embeddings

running faiss retriever


5. Testing searches...

Query: 'movies about dreams and reality'

Result 1 (distance=-1.1401):
  Movie: Unconscious
  Year: 2006
  Chunk: 1/1
  Text: Movie title: Unconscious
Release year: 2006
Directors: Bradley Wigor
Genres: Art House & International, Comedy, Drama, Mystery & Suspense
Content rating: R
Runtime: 108.0
Tomatometer rating: 85.0
Box office: $68,501
Imdb rating: 6.6
Audience rating: 87.0
Actors: John Speredakos, Peter Friedman, Adam LeFevre, Jessica Almasy, Benjamin Walker, Josh Pais, Brian Tarantina, Mark Lanier, Jose Llana


Plot: Six different people all believe an unconscious and unidentified man in a hospital is their missing person. A film about being unconscious . . . or not.

Result 2 (distance=-1.1870):
  Movie: Wonderful World
  Year: 2010
  Chunk: 1/1
  Text: Movie title: Wonderful World
Release year: 2010
Directors: Joshua Goldin
Genres: Drama, Romance
Content rating: R
Runtime: 95.0
Tomatometer r

In [None]:
# this time it's all reviews
faiss_retriever = FaissDenseRetriever(embedding_model="text-embedding-3-small", embedding_provider="openai")
print("\nCreating FaissDenseRetriever...")
faiss_retriever.add_documents(all_chunks[-1000:])


In [10]:
# Review-oriented queries (query text is kept exactly as originally written).
test_queries = [
    "a fun, easy watch movie preferably romantic with Brad Pit in it",
    "scary movie with negative reviews",
    "make me laugh but keep the movie short",
]
# The chunks come from chunk("sentence", ...), so this run is labelled as sentence chunking.
print("sentence chunking, openai embeddings\n")
print("running faiss retriever\n")
# The retriever was built from all_chunks[-1000:], so pass that same slice. run() only uses
# it for the reported chunk count, but it should describe what is actually indexed.
run(all_chunks[-1000:], faiss_retriever, test_queries)

#print(f"\n\nrunning in-memory retriever\n")
#run(all_chunks, memory_retriever, test_queries)

semantic chunking, openai embeddings

running faiss retriever


5. Testing searches...

Query: 'a fun, easy watch movie preferably romantic with Brad Pit in it'

Result 1 (distance=-1.1452):
  Movie: Zoom
  Year: 2006
  Chunk: 12/15
  Text: Review: As much fun as being bitten by radioactive spiders for 83 minutes.
Review: Want to see child superheroes? See Sky High, Spy Kids or The Incredibles. And Tim Allen is better and more curmudgeonly in the Santa Clause films
Review: It's OK entertainment for preteens, while Cox adds some slapstick and love interest as a klutzy psychiatrist.
Review: you won't miss much if you give this one a pass
Review: ...a light-hearted and mindlessly engaging time-waster.

Result 2 (distance=-1.1938):
  Movie: Your Sister's Sister
  Year: 2012
  Chunk: 14/30
  Text: Review: A funny, small-scale charmer with a trio of dynamite performances.
Review: As artificial as a Hollywood rom.com... With a different cast, a bigger budget, and some punched-up dialogue, it 

In [None]:
# Repeat the FAISS vs. in-memory comparison with the "all-MiniLM-L6-v2" sentence-transformers model.
faiss_retriever = FaissDenseRetriever(embedding_model="all-MiniLM-L6-v2", embedding_provider="sentence-transformers")
memory_retriever = InMemoryDenseRetriever(embedding_model="all-MiniLM-L6-v2", embedding_provider="sentence-transformers")

# Both retrievers are new instances and nothing has been added to them yet,
# so index the chunks before searching.
faiss_retriever.add_documents(all_chunks)
memory_retriever.add_documents(all_chunks)

# The chunks come from chunk("sentence", ...), so this run is labelled as sentence chunking.
print("sentence chunking, sentence-transformers embeddings\n")

run(all_chunks, faiss_retriever, test_queries)
run(all_chunks, memory_retriever, test_queries)

# todo: try different chunking strategies and openAI model