# Clean the Rotten Tomatoes movie and review tables and join them into one reviews dataset

In [21]:
import pandas as pd
import os
from path import Path
import numpy as np

In [26]:
def prep_rotten_tomatoes_data(path: Path, sample_size: int = 10_000) -> pd.DataFrame:
    """Clean, join, and save the Rotten Tomatoes movie and critic-review tables.

    Reads ``rotten_tomatoes_movies.csv`` and ``rotten_tomatoes_critic_reviews.csv``
    from ``<path>/datasets/rotten-tomatoes-reviews/raw``, drops duplicate rows,
    reviews without content and movies with any missing field, attaches the
    movie columns to each review via ``rotten_tomatoes_link``, and writes two
    files to ``<path>/datasets/rotten-tomatoes-reviews/prep``:

    * ``reviews_w_movies_full.csv``   -- every joined review
    * ``reviews_w_movies_sample.csv`` -- the first ``sample_size`` joined reviews

    Summary statistics and one random example row per input table are printed.

    Args:
        path (Path): the project folder path.
        sample_size (int): maximum number of rows written to the sample file.
            Rows are taken from the top of the link-sorted table, so the sample
            is not random. Defaults to 10,000.

    Returns:
        pd.DataFrame: the joined reviews, sorted by ``rotten_tomatoes_link``
        with a contiguous 0..n-1 index.
    """
    # Strings that stand for "no value" and are normalised to NaN before dropna.
    missing_markers = ["", " ", "None", "null", "NULL", "NaN"]
    movie_key_cols = ['movie_title', 'rotten_tomatoes_link', 'original_release_date']

    raw_dir = path / 'datasets/rotten-tomatoes-reviews/raw'
    prep_dir = path / 'datasets/rotten-tomatoes-reviews/prep'

    movies_df = pd.read_csv(raw_dir / 'rotten_tomatoes_movies.csv')
    reviews_df = pd.read_csv(raw_dir / 'rotten_tomatoes_critic_reviews.csv')

    # drop duplicates
    movies_df = movies_df.drop_duplicates()
    reviews_df = reviews_df.drop_duplicates()

    # Remove reviews without content, and movies with missing information
    reviews_df = reviews_df.replace(missing_markers, np.nan)
    reviews_df = reviews_df.dropna(subset=['review_content'])
    movies_df = movies_df.replace(missing_markers, np.nan)
    movies_df = movies_df.dropna()

    # merge tables
    reviews_with_movies = reviews_df.merge(
        movies_df,
        on='rotten_tomatoes_link',
        how='left'
    )

    # Some reviews don't have movie info. Both inputs were already normalised
    # above, so no further replace() pass over the (much larger) merged frame
    # is needed: an unmatched review is exactly a row with a NaN movie_title.
    reviews_with_movies = reviews_with_movies.dropna(subset=['movie_title'])

    # Sort by movie with a stable algorithm so the order of reviews within a
    # movie (and therefore the head() sample below) is deterministic, and reset
    # the index only after all rows have been dropped so it has no gaps.
    reviews_with_movies = (
        reviews_with_movies
        .sort_values(by=['rotten_tomatoes_link'], kind='mergesort')
        .reset_index(drop=True)
    )

    # print stats
    n_reviews = len(reviews_with_movies)
    n_movies = reviews_with_movies[movie_key_cols].drop_duplicates().shape[0]
    # Guard the average so an empty join reports 0.0 instead of raising.
    avg_reviews = n_reviews / n_movies if n_movies else 0.0

    print("Dataset Stats:")
    print(f"Total movies: {n_movies:,}")
    print(f"Total reviews: {n_reviews:,}")
    print(f"Avg reviews per movie: {avg_reviews:.1f}")

    print(f"\nMovie columns: {movies_df.columns.tolist()}")
    print(f"Review columns: {reviews_df.columns.tolist()}")

    # DataFrame.sample(n=1) raises on an empty frame, so only sample when
    # there is something to show.
    if not reviews_df.empty:
        print("\nSample review:")
        print(reviews_df.sample(n=1).iloc[0])

    if not movies_df.empty:
        print("\nSample movies:")
        print(movies_df.sample(n=1).iloc[0])

    # Save full and sample versions
    os.makedirs(prep_dir, exist_ok=True)
    reviews_with_movies.to_csv(prep_dir / 'reviews_w_movies_full.csv', index=False)

    # Create sample for development (first `sample_size` reviews)
    sample = reviews_with_movies.head(sample_size)
    sample.to_csv(prep_dir / 'reviews_w_movies_sample.csv', index=False)

    # Report the real output folder and the real sample size.
    print(f"\nSaved {n_reviews:,} reviews to {prep_dir}")
    print(f"Created sample with {len(sample):,} reviews for testing")

    return reviews_with_movies

In [27]:
# Project root: read from the MOVIE_RAG_DIR environment variable when it is set,
# so the notebook can run on other machines; otherwise fall back to the
# original local path.
PROJECT_DIR = Path(os.environ.get('MOVIE_RAG_DIR', '/Users/saghar/Desktop/movie-rag'))
reviews_with_movies = prep_rotten_tomatoes_data(PROJECT_DIR)

Dataset Stats:
Total movies: 8,075
Total reviews: 762,263
Avg reviews per movie: 94.4

Movie columns: ['rotten_tomatoes_link', 'movie_title', 'movie_info', 'critics_consensus', 'content_rating', 'genres', 'directors', 'authors', 'actors', 'original_release_date', 'streaming_release_date', 'runtime', 'production_company', 'tomatometer_status', 'tomatometer_rating', 'tomatometer_count', 'audience_status', 'audience_rating', 'audience_count', 'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count']
Review columns: ['rotten_tomatoes_link', 'critic_name', 'top_critic', 'publisher_name', 'review_type', 'review_score', 'review_date', 'review_content']

Sample review:
rotten_tomatoes_link                                        m/after_earth
critic_name                                 Frederic and Mary Ann Brussat
top_critic                                                          False
publisher_name                                    Spirituality