In [1]:
import requests



In [2]:
import pandas as pd


## **STEP 1: FETCHING MOVIE IDS**

In [3]:
def fetch_movie_ids(api_key, total_pages=500):
    base_url = "https://api.themoviedb.org/3/discover/movie"
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    movie_ids = []
    for page in range(1, total_pages + 1):
        params = {"page": page}
        response = requests.get(base_url, headers=headers, params=params)

        if response.status_code == 200: #If Request sucessfull then proceeds
            data = response.json()
            for result in data['results']: # Extracting movie IDs and append to the list
                movie_ids.append(result['id'])
        else:
            print(f"Failed to fetch data for page {page}: {response.status_code}")
            break  # Optionally, stop fetching if there's an error

        # Optional: print progress
        if page % 10 == 0:
            print(f"Fetched page {page}/{total_pages}")
    return movie_ids


api_key = "eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJhZTc4MDU1NDdiNDM2MzcxNDc2NGI5N2E3N2ViMzY5ZSIsInN1YiI6IjY2NTVkNDA3MjcyZWQ0NmYzYjIxMjg4NSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.JAxwirmIdSp0tsGQi9vMqZjom2sBUyPe5yOXf9wukZc"

movie_ids = fetch_movie_ids(api_key, total_pages=500) #Only 500 pages contains all movies data

print(f"\nTotal movie IDs fetched: {len(movie_ids)}")


Fetched page 10/500
Fetched page 20/500
Fetched page 30/500
Fetched page 40/500
Fetched page 50/500
Fetched page 60/500
Fetched page 70/500
Fetched page 80/500
Fetched page 90/500
Fetched page 100/500
Fetched page 110/500
Fetched page 120/500
Fetched page 130/500
Fetched page 140/500
Fetched page 150/500
Fetched page 160/500
Fetched page 170/500
Fetched page 180/500
Fetched page 190/500
Fetched page 200/500
Fetched page 210/500
Fetched page 220/500
Fetched page 230/500
Fetched page 240/500
Fetched page 250/500
Fetched page 260/500
Fetched page 270/500
Fetched page 280/500
Fetched page 290/500
Fetched page 300/500
Fetched page 310/500
Fetched page 320/500
Fetched page 330/500
Fetched page 340/500
Fetched page 350/500
Fetched page 360/500
Fetched page 370/500
Fetched page 380/500
Fetched page 390/500
Fetched page 400/500
Fetched page 410/500
Fetched page 420/500
Fetched page 430/500
Fetched page 440/500
Fetched page 450/500
Fetched page 460/500
Fetched page 470/500
Fetched page 480/500
F

## **STEP2: CREATING DATAFRAME FUNCTIONS ESTABLISHING IT**

In [6]:
# Define the function to create a DataFrame
def create_movie_dataframe(movie_id, movie_data_list, existing_df=None):

    def extract_genres(genres):
        return ', '.join([genre['name'] for genre in genres])

    def extract_production_companies(companies):
        return ', '.join([company['name'] for company in companies])

    def extract_production_countries(countries):
        return ', '.join([country['name'] for country in countries])

    def extract_spoken_languages(languages):
        return ', '.join([language['english_name'] for language in languages])

    formatted_movie_data = []

    for movie in movie_data_list:
        formatted_movie_data.append({
            'movie_id': movie_id,  # Add movie_id here
            'title': movie.get('title'),
            'release_date': movie.get('release_date'),
            'budget': movie.get('budget'),
            'revenue': movie.get('revenue'),
            'runtime': movie.get('runtime'),
            'genres': extract_genres(movie.get('genres', [])),    # Empty list handle cases where the 'genres' key may not exist in the dictionary,
            'homepage': movie.get('homepage'),                    # therefore setting it default value if no key found.
            'imdb_id': movie.get('imdb_id'),
            'original_language': movie.get('original_language'),
            'original_title': movie.get('original_title'),
            'overview': movie.get('overview'),
            'popularity': movie.get('popularity'),
            'production_companies': extract_production_companies(movie.get('production_companies', [])),
            'production_countries': extract_production_countries(movie.get('production_countries', [])),
            'spoken_languages': extract_spoken_languages(movie.get('spoken_languages', [])),
            'status': movie.get('status'),
            'tagline': movie.get('tagline'),
            'vote_average': movie.get('vote_average'),
            'vote_count': movie.get('vote_count'),
        })

    df = pd.DataFrame(formatted_movie_data)

    # If an existing DataFrame is provided, append the new data to it
    if existing_df is not None:
        df = pd.concat([existing_df, df], ignore_index=True)
    return df


# THIS IS MY MAIN WORKING CODE
movie_df = None
api_key = "eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJhZTc4MDU1NDdiNDM2MzcxNDc2NGI5N2E3N2ViMzY5ZSIsInN1YiI6IjY2NTVkNDA3MjcyZWQ0NmYzYjIxMjg4NSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.JAxwirmIdSp0tsGQi9vMqZjom2sBUyPe5yOXf9wukZc"

movies_10 = movie_ids[0:10] # movie_ids is the list I created above

for movie_id in movies_10:

    # HITTING API TO RETRIEVE MOVIE DATA

    movie_url = f"https://api.themoviedb.org/3/movie/{movie_id}"               # API START
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    response = requests.get(movie_url, headers=headers)

    if response.status_code == 200:                                            # Proceed if sucessfull
        movie_data = response.json()                                           # API END
        movie_data_list = [movie_data]                                         # Convert single movie data to a list
        movie_df = create_movie_dataframe(movie_id,movie_data_list, movie_df)  # Create or update DataFrame for movie data through function we created above
    else:
        print(f"Failed to fetch movie data for ID {movie_id}: {response.status_code}")

print("Movie DataFrame Sucessfully Created")

Movie DataFrame Sucessfully Created


In [7]:
movie_df.head()

Unnamed: 0,movie_id,title,release_date,budget,revenue,runtime,genres,homepage,imdb_id,original_language,original_title,overview,popularity,production_companies,production_countries,spoken_languages,status,tagline,vote_average,vote_count
0,1156593,Your Fault,2024-12-26,0,0,120,"Romance, Drama",https://www.amazon.com/dp/B0DJ1L7ZN3,tt28510079,es,Culpa tuya,The love between Noah and Nick seems unwaverin...,7758.619,"Pokeepsie Films, Amazon MGM Studios","Spain, United States of America",Spanish,Released,Divided by family. Driven by love.,7.5,381
1,558449,Gladiator II,2024-11-05,310000000,435417355,148,"Action, Adventure, Drama",https://www.gladiator.movie,tt9218128,en,Gladiator II,Years after witnessing the death of the revere...,6685.348,"Paramount Pictures, Scott Free Productions, Lu...",United States of America,English,Released,Prepare to be entertained.,6.788,1680
2,845781,Red One,2024-10-31,250000000,182861176,124,"Action, Fantasy, Comedy",https://www.amazon.com/salp/redonemovie,tt14948432,en,Red One,After Santa Claus (codename: Red One) is kidna...,4171.329,"Seven Bucks Productions, The Detective Agency,...",United States of America,English,Released,The mission to save Christmas is on.,7.029,1627
3,912649,Venom: The Last Dance,2024-10-22,120000000,476391878,109,"Action, Science Fiction, Adventure, Thriller",https://venom.movie,tt16366836,en,Venom: The Last Dance,Eddie and Venom are on the run. Hunted by both...,3569.507,"Columbia Pictures, Pascal Pictures, Matt Tolma...",United States of America,English,Released,'Til death do they part.,6.807,1990
4,939243,Sonic the Hedgehog 3,2024-12-19,122000000,211552146,110,"Action, Science Fiction, Comedy, Family",https://www.sonicthehedgehogmovie.com,tt18259086,en,Sonic the Hedgehog 3,"Sonic, Knuckles, and Tails reunite against a p...",3713.665,"Paramount Pictures, Original Film, Marza Anima...","Japan, United States of America","English, French, Spanish",Released,New adventure. New rival.,7.687,219


In [8]:
# Exporting dataframes to csv
movie_df.to_csv('movie_dataframe.csv', index=False)

**PARALLEL CODE**

In [9]:
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

# Define the function to create a DataFrame
def create_movie_dataframe(movie_data_list, existing_df=None):
    def extract_genres(genres):
        return ', '.join([genre['name'] for genre in genres])

    def extract_production_companies(companies):
        return ', '.join([company['name'] for company in companies])

    def extract_production_countries(countries):
        return ', '.join([country['name'] for country in countries])

    def extract_spoken_languages(languages):
        return ', '.join([language['english_name'] for language in languages])

    formatted_movie_data = []

    for movie in movie_data_list:
        formatted_movie_data.append({
            'movie_id': movie['id'],  # Use movie['id'] from the API response
            'title': movie.get('title'),
            'release_date': movie.get('release_date'),
            'budget': movie.get('budget'),
            'revenue': movie.get('revenue'),
            'runtime': movie.get('runtime'),
            'genres': extract_genres(movie.get('genres', [])),
            'homepage': movie.get('homepage'),
            'imdb_id': movie.get('imdb_id'),
            'original_language': movie.get('original_language'),
            'original_title': movie.get('original_title'),
            'overview': movie.get('overview'),
            'popularity': movie.get('popularity'),
            'production_companies': extract_production_companies(movie.get('production_companies', [])),
            'production_countries': extract_production_countries(movie.get('production_countries', [])),
            'spoken_languages': extract_spoken_languages(movie.get('spoken_languages', [])),
            'status': movie.get('status'),
            'tagline': movie.get('tagline'),
            'vote_average': movie.get('vote_average'),
            'vote_count': movie.get('vote_count'),
        })

    df = pd.DataFrame(formatted_movie_data)

    # If an existing DataFrame is provided, append the new data to it
    if existing_df is not None:
        df = pd.concat([existing_df, df], ignore_index=True)
    return df

# Function to fetch movie data
def fetch_movie_data(movie_id, api_key):
    movie_url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    response = requests.get(movie_url, headers=headers)
    
    if response.status_code == 200:
        return response.json()  # Return movie data if successful
    else:
        print(f"Failed to fetch movie data for ID {movie_id}: {response.status_code}")
        return None

# Main code to fetch movie data in parallel
api_key = "your_api_key_here"  # Replace with your actual API key
movies_10 = movie_ids[0:10]  # Assuming movie_ids is defined

movie_df = None

with ThreadPoolExecutor(max_workers=10) as executor:
    future_to_movie_id = {executor.submit(fetch_movie_data, movie_id, api_key): movie_id for movie_id in movies_10}

    movie_data_list = []
    
    for future in as_completed(future_to_movie_id):
        movie_id = future_to_movie_id[future]
        movie_data = future.result()
        if movie_data:
            movie_data_list.append(movie_data)

    # Create or update DataFrame for movie data
    movie_df = create_movie_dataframe(movie_data_list, movie_df)

print("Movie DataFrame Successfully Created")

Failed to fetch movie data for ID 1005331: 401
Failed to fetch movie data for ID 912649: 401
Failed to fetch movie data for ID 1043905: 401
Failed to fetch movie data for ID 845781: 401
Failed to fetch movie data for ID 558449: 401
Failed to fetch movie data for ID 762509: 401
Failed to fetch movie data for ID 1156593: 401
Failed to fetch movie data for ID 1241982: 401
Failed to fetch movie data for ID 1010581: 401
Failed to fetch movie data for ID 939243: 401
Movie DataFrame Successfully Created
