# Building LangChain-ready documents from (1) aggregated reviews and (2) plots, then comparing multiple chunking strategies

In [1]:
import pandas as pd
import numpy as np
import tiktoken
from typing import List, Dict
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from path import Path
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
from src.data.document_creators import create_plot_docs, create_review_docs
from src.data.chunk import MovieReviewChunker, chunk

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/saghar/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Load the prepared plot and review tables, deriving `release_year` while loading.
path = Path('/Users/saghar/Desktop/movie-rag/datasets/rotten-tomatoes-reviews/prep')

# `release_year` is the calendar year parsed from `original_release_date`.
plots_df = pd.read_csv(path / 'movie_plots.csv').assign(
    release_year=lambda frame: pd.to_datetime(frame['original_release_date']).dt.year
)
reviews_df = pd.read_csv(path / 'reviews_w_movies_full.csv').assign(
    release_year=lambda frame: pd.to_datetime(frame['original_release_date']).dt.year
)


In [7]:
# Build plot and review documents, then preview the first document of each kind.

# Columns passed to create_review_docs as the movie identifier columns.
movie_id_cols = ['rotten_tomatoes_link', 'movie_title', 'release_year']

# Columns passed to both creators as the text-metadata columns.
text_metadata_cols = [
    'movie_title',
    'release_year',
    'directors',
    'genres',
    'content_rating',
    'runtime',
    'tomatometer_rating',
    'box_office',
    'awards',
    'imdb_rating',
    'audience_rating',
    'actors',
]

# Columns passed to both creators as the object-metadata columns.
obj_metadata_cols = [
    'rotten_tomatoes_link',
    'movie_title',
    'release_year',
    'original_release_date',
    'authors',
    'actors',
    'production_company',
    'genres',
    'imdb_rating',
    'box_office',
    'content_rating',
    'runtime',
    'tomatometer_rating',
    'tomatometer_count',
    'audience_rating',
    'audience_count',
    'tomatometer_top_critics_count',
    'tomatometer_fresh_critics_count',
    'tomatometer_rotten_critics_count',
]

plot_docs = create_plot_docs(plots_df, text_metadata_cols, obj_metadata_cols)
review_docs = create_review_docs(
    reviews_df, text_metadata_cols, obj_metadata_cols, movie_id_cols
)

all_docs = plot_docs + review_docs
print(f"Total documents ready for chunking: {len(all_docs)}")

# Each preview prints the metadata, a blank gap, then the page content.
print("\none plot doc:\n")
print(plot_docs[0]['metadata'], "\n", plot_docs[0]['page_content'], sep="\n")
print("\n\n")
print("one review doc:\n")
print(review_docs[0]['metadata'], "\n", review_docs[0]['page_content'], sep="\n")

Created 6257 plot docs.
Created 8075 review docs.
Total documents ready for chunking: 14332

one plot doc:

{'source': 'plot', 'rotten_tomatoes_link': 'm/0814255', 'movie_title': 'Percy Jackson & the Olympians: The Lightning Thief', 'release_year': 2010, 'original_release_date': '2010-02-12', 'authors': 'Craig Titley, Chris Columbus, Rick Riordan', 'actors': "Logan Lerman, Brandon T. Jackson, Alexandra Daddario, Jake Abel, Sean Bean, Pierce Brosnan, Steve Coogan, Rosario Dawson, Melina Kanakaredes, Catherine Keener, Kevin Mckidd, Joe Pantoliano, Uma Thurman, Ray Winstone, Julian Richings, Bonita Friedericy, Annie Ilonzeh, Tania Saulnier, Marie Avgeropoulos, Luisa D'Oliveira, Christie Laing, Marielle Jaffe, Elisa King, Chrystal Tisiga, Alexis Knapp, Charlie Gallant, Chelan Simmons, Andrea Brooks, Natassia Malthe, Max Van Ville, Serinda Swan, Dimitri Lekkos, Ona Grauer, Stefanie von Pfetten, Conrad Coates, Erica Cerra, Dylan Neal, Luke Camilleri, Holly Hougham, Ina Geraldine, Raquel Risk

In [9]:
def analyze_chunks(chunks: List[Dict]) -> None:
    """Print summary statistics and up to two example chunks.

    Args:
        chunks: Chunk records; each one is read through its ``'text'`` key.

    Prints the number of chunks, the mean, min/max and standard deviation of
    the chunk lengths in characters, and the text of the first one or two
    chunks. For an empty list only the count line is printed.
    """
    print(f"Number of chunks: {len(chunks)}")
    if not chunks:
        # min()/max() raise ValueError on an empty sequence and np.mean/np.std
        # yield nan with a RuntimeWarning, so there is nothing more to report.
        return

    lengths = [len(c['text']) for c in chunks]
    print(f"Avg chunk length: {np.mean(lengths):.0f} chars")
    print(f"Min/Max length: {min(lengths)}/{max(lengths)} chars")
    print(f"Std deviation: {np.std(lengths):.0f}")

    # Show the first chunk (and the second, when present) as examples
    print(f"\nExample chunk 1:")
    print(f"{chunks[0]['text']}")
    if len(chunks) > 1:
        print(f"\nExample chunk 2:")
        print(f"{chunks[1]['text']}")

In [10]:
def compare_chunk_strategies(sample_text: str, chunker: MovieReviewChunker) -> None:
    """Run the three chunking strategies on one text and print a report for each.

    Args:
        sample_text: The text to chunk.
        chunker: Object providing ``chunk_fixed_size_tokens``,
            ``chunk_by_sentences`` and ``chunk_by_semantic_similarity``.

    For each strategy, in order, a banner with the strategy title is printed,
    the chunker method is called, and its result is passed to
    ``analyze_chunks``. Nothing is returned.
    """
    # (title, zero-argument callable producing the chunks). The callables are
    # invoked inside the loop so each strategy runs only after its banner.
    strategies = (
        (
            "STRATEGY 1: Fixed-Size Token Chunks",
            lambda: chunker.chunk_fixed_size_tokens(sample_text, chunk_size=200, overlap=50),
        ),
        (
            "STRATEGY 2: Sentence-Based Chunks",
            lambda: chunker.chunk_by_sentences(sample_text, sentences_per_chunk=5),
        ),
        (
            "STRATEGY 3: Semantic Chunks",
            lambda: chunker.chunk_by_semantic_similarity(sample_text, threshold=0.7),
        ),
    )

    banner = "=" * 50
    for title, make_chunks in strategies:
        print("\n" + banner)
        print(title)
        print(banner)
        analyze_chunks(make_chunks())

In [11]:
# Compare the chunking strategies on one long plot document and one long review document.

# A document counts as "long" when its page_content exceeds this many characters.
LONG_DOC_MIN_CHARS = 3000

# One chunker instance is enough; it is reused for both samples below.
chunker = MovieReviewChunker()

# next() takes the first qualifying document and raises StopIteration if none qualifies.
sample_plot = next(
    d['page_content'] for d in plot_docs if len(d['page_content']) > LONG_DOC_MIN_CHARS
)
sample_review = next(
    d['page_content'] for d in review_docs if len(d['page_content']) > LONG_DOC_MIN_CHARS
)

print("long plot sample\n\n")
compare_chunk_strategies(sample_plot, chunker)
print("\n\n\n")
print("long review sample\n\n")
compare_chunk_strategies(sample_review, chunker)

long plot sample



STRATEGY 1: Fixed-Size Token Chunks
Created 7 fixed-size chunks
Number of chunks: 7
Avg chunk length: 620 chars
Min/Max length: 166/784 chars
Std deviation: 192

Example chunk 1:
Movie title: Appaloosa
Release year: 2008
Directors: Ed Harris
Genres: Action & Adventure, Western
Content rating: R
Runtime: 116.0
Tomatometer rating: 76.0
Box office: $20,211,394
Awards: 5 wins total
Imdb rating: 6.7
Audience rating: 55.0
Actors: Ed Harris, Viggo Mortensen, Renée Zellweger, Jeremy Irons, Luce Rains, James Tarwater, Boyd Kestner, Gabriel Marantz, Cerris Morgan-Moyer, James Gammon, Timothy Spall, Lance Henriksen, Tom Bower, Bobby Jauregui, Ariadna Gil, Jim Tarwater, Timothy V. Murphy, Bob L. Harris, Benjamin Rosenshein, Erik J. Bockemeier, Fred Hice, Neil Summers, Tim Carroll, Bounth

Example chunk 2:
 Jauregui, Ariadna Gil, Jim Tarwater, Timothy V. Murphy, Bob L. Harris, Benjamin Rosenshein, Erik J. Bockemeier, Fred Hice, Neil Summers, Tim Carroll, Bounthanh Xaynhachack, A

In [12]:
# Compare the chunking strategies on the shortest plot document and the shortest review document.

# Index of the shortest plot document (the first one on ties).
i = min(range(len(plot_docs)), key=lambda j: len(plot_docs[j]['page_content']))
sample_plot = plot_docs[i]['page_content']

# Index of the shortest review document (the first one on ties).
idx = min(range(len(review_docs)), key=lambda j: len(review_docs[j]['page_content']))
# Must be indexed with `idx` (the review index), not `i` (the plot index).
sample_review = review_docs[idx]['page_content']

print("short plot sample\n\n")
compare_chunk_strategies(sample_plot, chunker)
print("\n\n\n")
print("short review sample\n\n")
compare_chunk_strategies(sample_review, chunker)

short plot sample



STRATEGY 1: Fixed-Size Token Chunks
Created 1 fixed-size chunks
Number of chunks: 1
Avg chunk length: 286 chars
Min/Max length: 286/286 chars
Std deviation: 0

Example chunk 1:
Movie title: Tennessee
Release year: 2009
Directors: Aaron Woodley
Genres: Drama
Content rating: R
Runtime: 99.0
Tomatometer rating: 33.0
Audience rating: 52.0
Actors: Mariah Carey, Adam Rothenberg, Ethan Peck, Lance Reddick, Bill Sage


Plot: Official music video of Back To Tennessee.

STRATEGY 2: Sentence-Based Chunks
Created 1 sentence-based chunks
Number of chunks: 1
Avg chunk length: 286 chars
Min/Max length: 286/286 chars
Std deviation: 0

Example chunk 1:
Movie title: Tennessee
Release year: 2009
Directors: Aaron Woodley
Genres: Drama
Content rating: R
Runtime: 99.0
Tomatometer rating: 33.0
Audience rating: 52.0
Actors: Mariah Carey, Adam Rothenberg, Ethan Peck, Lance Reddick, Bill Sage


Plot: Official music video of Back To Tennessee.

STRATEGY 3: Semantic Chunks
Created 1 semantic 