In [None]:
import os
import time

import pandas as pd
import requests

# TMDB API key used by every request in this file.
# NOTE(review): a credential is committed here in plain text. Set the
# TMDB_API_KEY environment variable to override it; the literal is kept only
# as a fallback so existing runs behave exactly as before.
API_KEY = os.environ.get("TMDB_API_KEY") or "30c8ea0596717c63a88684285f5afe15"

def get_genre_mapping(api_key=API_KEY, language="en-US", timeout=10):
    """
    Get mapping from TMDB genre IDs to genre names.

    Args:
        api_key: TMDB API key sent as the ``api_key`` query parameter.
        language: Value of the ``language`` query parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        dict like {28: "Action", 12: "Adventure", ...}; empty if the
        response carries no "genres" entry.

    Raises:
        requests.HTTPError: if the response status is an error
            (via ``raise_for_status``).
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    url = "https://api.themoviedb.org/3/genre/movie/list"
    params = {
        "api_key": api_key,
        "language": language,
    }
    # Without an explicit timeout requests may block forever on a stalled
    # connection, so always bound the wait.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    genres = response.json().get("genres", [])
    # build dictionary id -> name
    return {genre["id"]: genre["name"] for genre in genres}

def fetch_movies_page(min_rating=5.0,
                      min_votes=1000,
                      page=1,
                      language="en-US",
                      sort_by="vote_average.desc",
                      api_key=None,
                      timeout=10):
    """
    Fetch one page of movies from TMDB /discover/movie with rating filters.

    Args:
        min_rating: Sent as ``vote_average.gte`` (minimum rating filter).
        min_votes: Sent as ``vote_count.gte`` (minimum vote-count filter).
        page: Page number to request.
        language: Value of the ``language`` query parameter.
        sort_by: Value of the ``sort_by`` query parameter.
        api_key: TMDB API key; when None the module-level API_KEY is used.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: if the response status is an error
            (via ``raise_for_status``).
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    url = "https://api.themoviedb.org/3/discover/movie"

    params = {
        # Resolve the key at call time so callers can override the default.
        "api_key": API_KEY if api_key is None else api_key,
        "language": language,
        "sort_by": sort_by,             # result ordering requested from TMDB
        "vote_average.gte": min_rating, # min rating filter
        "vote_count.gte": min_votes,    # only well-voted movies
        "page": page,
    }

    # Bound the wait: requests has no default timeout and could hang forever.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()

def collect_movies(min_rating=8.0,
                   min_votes=1000,
                   max_pages=5,
                   genre_map=None):
    """
    Loop through pages of /discover/movie,
    map genre_ids -> genre names,
    return list of dict rows.

    Args:
        min_rating: Minimum rating forwarded to fetch_movies_page.
        min_votes: Minimum vote count forwarded to fetch_movies_page.
        max_pages: Upper bound on the number of pages requested.
        genre_map: Optional dict of genre id -> genre name. IDs missing from
            it are reported as "UNKNOWN"; when None, "genres" is left empty.

    Returns:
        A list with one dict per movie. Paging stops early when a page has
        no results or when the "total_pages" value reported in the response
        has been reached.
    """
    all_movies = []

    for page in range(1, max_pages + 1):
        data = fetch_movies_page(
            min_rating=min_rating,
            min_votes=min_votes,
            page=page,
        )

        results = data.get("results", [])
        if not results:
            break

        for movie in results:
            # map genre_ids -> list of genre names
            genre_ids = movie.get("genre_ids", [])
            if genre_map is not None:
                genre_names = [genre_map.get(gid, "UNKNOWN") for gid in genre_ids]
            else:
                genre_names = []

            all_movies.append({
                "id": movie.get("id"),
                "title": movie.get("title"),
                "original_title": movie.get("original_title"),
                "release_date": movie.get("release_date"),
                "vote_average": movie.get("vote_average"),
                "vote_count": movie.get("vote_count"),
                "popularity": movie.get("popularity"),
                "original_language": movie.get("original_language"),
                "overview": movie.get("overview"),
                "genre_ids": genre_ids,           # keep raw IDs (optional)
                "genres": genre_names,            # human-readable genres
            })

        # stop if we've reached the actual last page (or our own page limit)
        total_pages = data.get("total_pages", max_pages)
        if page >= total_pages or page >= max_pages:
            break

        # be polite to TMDB -- pause only when another request will follow,
        # so the final page does not cost a pointless delay
        time.sleep(0.25)

    return all_movies

def build_dataframe(min_rating=5.0,
                    min_votes=1000,
                    max_pages=5):
    """
    Build a pandas DataFrame of movies matching the rating filters.

    Fetches the genre id -> name mapping first, then collects the movie
    rows (with genre names resolved through that mapping) and wraps them
    in a DataFrame.

    Args:
        min_rating: Minimum rating passed on to collect_movies.
        min_votes: Minimum vote count passed on to collect_movies.
        max_pages: Maximum number of pages passed on to collect_movies.

    Returns:
        pandas.DataFrame built from the list of movie row dicts.
    """
    id_to_genre = get_genre_mapping()
    rows = collect_movies(
        genre_map=id_to_genre,
        max_pages=max_pages,
        min_votes=min_votes,
        min_rating=min_rating,
    )
    return pd.DataFrame(rows)

if __name__ == "__main__":
    # Example run: movies rated >= 5.0 with at least 1000 votes, limited to
    # the first 5 result pages (~100 movies, assuming 20 per page).
    df_movies = build_dataframe(min_rating=5.0, min_votes=1000, max_pages=5)

    # Preview the first rows, then report how many movies were gathered.
    print(df_movies.head())
    print("Total movies collected:", len(df_movies))

    # Persist the full table as a UTF-8 CSV without the index column.
    df_movies.to_csv("movies_data.csv", index=False, encoding="utf-8")


    id                     title            original_title release_date  \
0  278  The Shawshank Redemption  The Shawshank Redemption   1994-09-23   
1  238             The Godfather             The Godfather   1972-03-14   
2  240     The Godfather Part II     The Godfather Part II   1974-12-20   
3  424          Schindler's List          Schindler's List   1993-12-15   
4  389              12 Angry Men              12 Angry Men   1957-04-10   

   vote_average  vote_count  popularity original_language  \
0         8.711       29103     31.7233                en   
1         8.685       21987     32.5552                en   
2         8.571       13287     18.4702                en   
3         8.566       16808     17.7931                en   
4         8.548        9480     11.8058                en   

                                            overview        genre_ids  \
0  Imprisoned in the 1940s for the double murder ...         [18, 80]   
1  Spanning the years 1945 to 1955, 