# Create plots data from OMDB, join all movie metadata to both plots and reviews data

In [21]:
import os
from path import Path

In [46]:
# fetch_movie_plots.py
# Creates a separate plot file without modifying RT data

import pandas as pd
import requests
import time
import json
from typing import Dict, Optional

class MoviePlotFetcher:
    def __init__(self, api_key: str) -> None:
        self.base_url = "http://www.omdbapi.com/" + api_key
        
    def get_movie_plot(self, title: str, year: str | int) -> Dict:
        """Fetch full plot and metadata from OMDb"""

        params = {
            't': title.strip(),
            'plot': 'full',  # Get full plot, not short
            'type': 'movie',
            'y': year,
        }
        
        try:
            response = requests.get(self.base_url, params=params)
            if response.status_code == 200:
                data = response.json()
                
                if data.get('Response') == 'True':
                    return {
                        'found': True,
                        'plot': data.get('Plot', ''),
                        'imdb_rating': data.get('imdbRating', ''),
                        'imdb_id': data.get('imdbID', ''),
                        'awards': data.get('Awards', ''),
                        'box_office': data.get('BoxOffice', ''),
                        'plot_length': len(data.get('Plot', ''))
                    }
        except Exception as e:
            print(f"Error fetching {title}: {e}")
        
        return {'found': False}

In [53]:
def create_plot_dataset(path: Path, rt_file_name: str, api_key: str, sample_size: Optional[int] = None) -> pd.DataFrame:
    """Create a separate dataset with movie plots"""
    
    rt_data = pd.read_csv(path / rt_file_name)
    
    # Get unique movies
    unique_movies = rt_data[['rotten_tomatoes_link', 'movie_title', 'original_release_date']].drop_duplicates().reset_index(drop=True)
    
    print(f"Fetching plots for {len(unique_movies)} movies...")
    
    fetcher = MoviePlotFetcher(api_key)
    plot_data = []
    
    for idx, row in unique_movies.iterrows():
        movie_data = {
            'rotten_tomatoes_link': row['rotten_tomatoes_link'],
            'movie_title': row['movie_title'],
            'original_release_date': row.get('original_release_date')
        }
        
        # Get plot from OMDb. Skip the movie if no plot found
        omdb_data = fetcher.get_movie_plot(
            row['movie_title'], 
            pd.to_datetime(row['original_release_date']).year
        )
        if not omdb_data['found']:
            continue
        
        # Combine data
        movie_data.update(omdb_data)
        plot_data.append(movie_data)
        # Progress update
        if (idx + 1) % 100 == 0:
            print(f"Processed {idx + 1}/{len(unique_movies)} movies")
    
    # Create final dataframe
    plots_df = pd.DataFrame(plot_data)
    
    # Statistics
    print("\nPlot Dataset Statistics:")
    print(f"Total movies processed: {len(unique_movies)}")
    print(f"Movies with plots found: {len(plots_df)}")
    print(f"Average plot length: {plots_df['plot_length'].mean():.0f} chars")
    print(f"Max plot length: {plots_df['plot_length'].max()} chars")
    print(f"Movies with long plots (>500 chars): {(plots_df['plot_length'] > 500).sum()}")
    
    # Save the plot dataset
    plots_df.to_csv(path / 'movie_plots.csv', index=False)
    print(f"\nSaved to data/processed/movie_plots.csv")
    
    return plots_df

In [54]:
# Step 1: Fetch plots (creates separate file)

plots_df = create_plot_dataset(
    path = Path('/Users/saghar/Desktop/my-project-2025/datasets/rotten-tomatoes-reviews/prep'),
    rt_file_name='reviews_w_movies_full.csv',
    api_key=os.getenv('OMDB_API_KEY'),
)

Fetching plots for 8075 movies...
Processed 100/8075 movies
Processed 200/8075 movies
Processed 500/8075 movies
Processed 600/8075 movies
Processed 700/8075 movies
Processed 800/8075 movies
Processed 1100/8075 movies
Processed 1300/8075 movies
Processed 1400/8075 movies
Processed 1500/8075 movies
Processed 1600/8075 movies
Processed 1700/8075 movies
Processed 1900/8075 movies
Processed 2100/8075 movies
Processed 2200/8075 movies
Processed 2300/8075 movies
Processed 2400/8075 movies
Processed 2600/8075 movies
Processed 2700/8075 movies
Processed 2800/8075 movies
Processed 2900/8075 movies
Processed 3000/8075 movies
Processed 3100/8075 movies
Processed 3200/8075 movies
Processed 3300/8075 movies
Processed 3400/8075 movies
Processed 3500/8075 movies
Processed 3600/8075 movies
Processed 3700/8075 movies
Processed 3800/8075 movies
Processed 3900/8075 movies
Processed 4000/8075 movies
Processed 4200/8075 movies
Processed 4300/8075 movies
Processed 4400/8075 movies
Processed 4500/8075 movies


In [22]:
path = Path('/Users/saghar/Desktop/my-project-2025/datasets/rotten-tomatoes-reviews/prep')
rt_file_name='reviews_w_movies_full.csv'
rt_data = pd.read_csv(path / rt_file_name)

In [85]:
# Create one metadata file for movies
# Add the columns to both csv files
movie_id_cols = ['rotten_tomatoes_link', 'movie_title', 'original_release_date']
movie_metadata_cols = ['movie_info', 'critics_consensus', 'content_rating', 'genres', 'directors', 'authors', 'actors', 'streaming_release_date', 'runtime', 'production_company', 'tomatometer_status', 'tomatometer_rating', 'tomatometer_count', 'audience_status', 'audience_rating', 'audience_count', 'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count']
movie_omdb_metadata_cols = ['imdb_id', 'imdb_rating', 'awards', 'box_office']

In [56]:
review_df = pd.read_csv(path / 'reviews_w_movies_full.csv')
plot_df = pd.read_csv(path / 'movie_plots.csv')

In [72]:
# keep unique movies only to later extract movie metadata
review_df_unique_movies = review_df.drop_duplicates(subset=movie_id_cols, keep="first")

In [62]:
review_df_w_meta = review_df.merge(
        plot_df[movie_id_cols + movie_omdb_metadata_cols],
        on=movie_id_cols,
        how='left'
    )
review_df_w_meta = review_df_w_meta.sort_values(by=['rotten_tomatoes_link']).reset_index(drop=True)
print(f"length of reviews df is {len(review_df_w_meta)}")

length of reviews df is 762263


In [86]:
plot_df_w_meta = plot_df.merge(
        review_df_unique_movies[movie_id_cols + movie_metadata_cols],
        on=movie_id_cols,
        how='left'
    )
plot_df_w_meta = plot_df_w_meta.sort_values(by=['rotten_tomatoes_link']).reset_index(drop=True)
print(f"length of plot df is {len(plot_df_w_meta)}")

length of plot df is 6432


In [87]:
review_df_w_meta.to_csv(path / 'reviews_w_movies_full.csv', index=False)
plot_df_w_meta.to_csv(path / 'movie_plots.csv', index=False)
print(f"\nSaved to data/processed with all available movie metadata")


Saved to data/processed with all available movie metadata
