# Create posters data from OMDB (fetch poster and save the path), join with metadata

In [1]:
import os
from path import Path

In [2]:
import pandas as pd
import requests
import time
from pathlib import Path
from typing import Dict, Optional

class MoviePosterFetcher:
    def __init__(self, api_key: str, poster_dir: Path) -> None:
        self.base_url = "http://www.omdbapi.com/" + api_key
        self.poster_dir = poster_dir
        self.poster_dir.mkdir(parents=True, exist_ok=True)

    def get_movie_poster(self, title: str, year: str | int) -> Dict:
        """Fetch movie poster URL and minimal metadata from OMDb"""
        params = {
            "t": title.strip(),
            "type": "movie",
            "y": year,
        }

        try:
            response = requests.get(self.base_url, params=params)
            if response.status_code == 200:
                data = response.json()

                if data.get("Response") == "True":
                    
                    poster_url = data.get("Poster")
                    imdb_id = data.get("imdbID", "")

                    if poster_url and poster_url.lower() != "n/a":
                        poster_path = self.download_poster(imdb_id, poster_url)
                    else:
                        poster_path = None
                    return {
                        "found": True,
                        "poster_path": str(poster_path) if poster_path else None,
                    }
        except Exception as e:
            print(f"Error fetching {title}: {e}")

        return {"found": False}

    def download_poster(self, imdb_id: str, poster_url: str) -> Optional[Path]:
        """Download and save the poster locally."""
        try:
            response = requests.get(poster_url)
            if response.status_code == 200:
                ext = os.path.splitext(poster_url)[1] or ".jpg"
                poster_path = self.poster_dir / f"{imdb_id}{ext}"
                with open(poster_path, "wb") as f:
                    f.write(response.content)
                return poster_path
        except Exception as e:
            print(f"Error downloading poster {imdb_id}: {e}")
        return None

In [6]:
def create_poster_dataset(
    path: Path,
    poster_dir: Path,
    rt_file_name: str,
    api_key: str,
    sample_size: Optional[int] = None
) -> pd.DataFrame:
    """Create a dataset with unique movies and poster paths"""

    rt_data = pd.read_csv(path / rt_file_name)

    # Get unique movies
    unique_movies = (
        rt_data[["rotten_tomatoes_link", "movie_title", "original_release_date"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    if sample_size:
        unique_movies = unique_movies.sample(sample_size, random_state=42)

    print(f"Fetching posters for {len(unique_movies)} movies...")

    fetcher = MoviePosterFetcher(api_key, poster_dir)
    poster_data = []

    for idx, row in unique_movies.iterrows():
        movie_data = {
            "rotten_tomatoes_link": row["rotten_tomatoes_link"],
            "movie_title": row["movie_title"],
            "original_release_date": row.get("original_release_date"),
        }

        omdb_data = fetcher.get_movie_poster(
            row["movie_title"],
            pd.to_datetime(row["original_release_date"]).year
        )
        if not omdb_data["found"]:
            continue

        movie_data.update(omdb_data)
        poster_data.append(movie_data)

        # Progress update
        if (idx + 1) % 100 == 0:
            print(f"Processed {idx + 1}/{len(unique_movies)} movies")

    posters_df = pd.DataFrame(poster_data)

    # Statistics
    print("\nPoster Dataset Statistics:")
    print(f"Total movies processed: {len(unique_movies)}")
    print(f"Movies with posters found: {len(posters_df)}")
    
    # Save the plot dataset
    posters_df.to_csv(path / 'movie_posters.csv', index=False)
    print(f"\nSaved to data/processed/movie_posters.csv")
    
    return posters_df

In [7]:
# Step 1: Fetch posters (creates separate file)

posters_df = create_poster_dataset(
    path = Path('/Users/saghar/Desktop/movie-rag/datasets/rotten-tomatoes-reviews/prep'),
    poster_dir = Path('/Users/saghar/Desktop/movie-rag/datasets/rotten-tomatoes-reviews/prep/posters'),
    rt_file_name='reviews_w_movies_full.csv',
    api_key=os.getenv('POSTER_API_KEY'),
)

Fetching posters for 8075 movies...
Processed 100/8075 movies
Processed 200/8075 movies
Processed 500/8075 movies
Processed 600/8075 movies
Processed 700/8075 movies
Processed 800/8075 movies
Processed 1100/8075 movies
Processed 1300/8075 movies
Processed 1400/8075 movies
Processed 1500/8075 movies
Processed 1600/8075 movies
Processed 1700/8075 movies
Processed 1900/8075 movies
Processed 2100/8075 movies
Processed 2200/8075 movies
Processed 2300/8075 movies
Processed 2400/8075 movies
Processed 2600/8075 movies
Processed 2700/8075 movies
Processed 2800/8075 movies
Processed 2900/8075 movies
Processed 3000/8075 movies
Processed 3100/8075 movies
Processed 3200/8075 movies
Processed 3300/8075 movies
Processed 3400/8075 movies
Processed 3500/8075 movies
Processed 3600/8075 movies
Processed 3700/8075 movies
Processed 3800/8075 movies
Processed 3900/8075 movies
Processed 4000/8075 movies
Processed 4200/8075 movies
Processed 4300/8075 movies
Processed 4400/8075 movies
Processed 4500/8075 movie

In [8]:
# Add the metadata columns
movie_id_cols = ['rotten_tomatoes_link', 'movie_title', 'original_release_date']
movie_metadata_cols = ['imdb_id', 'imdb_rating', 'awards', 'box_office', 'movie_info', 'critics_consensus', 'content_rating', 'genres', 'directors', 'authors', 'actors', 'streaming_release_date', 'runtime', 'production_company', 'tomatometer_status', 'tomatometer_rating', 'tomatometer_count', 'audience_status', 'audience_rating', 'audience_count', 'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count']

In [9]:
path = Path('/Users/saghar/Desktop/movie-rag/datasets/rotten-tomatoes-reviews/prep')
review_df = pd.read_csv(path / 'reviews_w_movies_full.csv')
poster_df = pd.read_csv(path / 'movie_posters.csv')

In [10]:
# keep unique movies only to later extract movie metadata
review_df_unique_movies = review_df.drop_duplicates(subset=movie_id_cols, keep="first")

In [11]:
posters_df_w_meta = posters_df.merge(
        review_df_unique_movies[movie_id_cols + movie_metadata_cols],
        on=movie_id_cols,
        how='left'
    )
posters_df_w_meta = posters_df_w_meta.sort_values(by=['rotten_tomatoes_link']).reset_index(drop=True)
print(f"length of posters df is {len(posters_df_w_meta)}")

length of posters df is 6431


In [12]:
posters_df_w_meta.to_csv(path / 'movie_posters.csv', index=False)
print(f"\nSaved to data/processed with all available movie metadata")


Saved to data/processed with all available movie metadata
