In [None]:
import requests
import time
import csv

# Your TMDB API key here
API_KEY = ''

BASE_URL = 'https://api.themoviedb.org/3'

def get_movie_details(movie_id, timeout=10):
    """Fetch the TMDB details record for a single movie, keywords included.

    Args:
        movie_id: TMDB movie identifier, interpolated into the request path.
        timeout: Seconds to wait on the server before ``requests`` raises a
            timeout error instead of blocking indefinitely.

    Returns:
        The decoded JSON body of the response. The HTTP status is not
        inspected here, so an error payload is returned to the caller as-is.
    """
    url = f'{BASE_URL}/movie/{movie_id}'
    params = {
        'api_key': API_KEY,
        'language': 'en-US',
        # main() reads movie_details['keywords']['keywords']; TMDB only embeds
        # that sub-resource when it is requested explicitly.
        'append_to_response': 'keywords',
    }
    response = requests.get(url, params=params, timeout=timeout)
    return response.json()

def get_movie_reviews(movie_id, timeout=10):
    """Fetch user reviews for a single movie from TMDB.

    No ``page`` parameter is sent, so presumably only the API's default
    (first) page of reviews is returned -- confirm against the TMDB docs if
    complete review coverage matters.

    Args:
        movie_id: TMDB movie identifier, interpolated into the request path.
        timeout: Seconds to wait on the server before ``requests`` raises a
            timeout error instead of blocking indefinitely.

    Returns:
        The decoded JSON body of the response. The HTTP status is not
        inspected here, so an error payload is returned to the caller as-is.
    """
    url = f'{BASE_URL}/movie/{movie_id}/reviews'
    params = {
        'api_key': API_KEY,
        'language': 'en-US',
    }
    response = requests.get(url, params=params, timeout=timeout)
    return response.json()

def _movie_row(movie_details):
    """Flatten one movie-details payload into the dict written to the movies CSV."""
    def names(items):
        # Each nested entry is a dict; only its 'name' is kept.
        return [item['name'] for item in items or []]

    keywords = (movie_details.get('keywords') or {}).get('keywords')
    return {
        'id': movie_details.get('id'),
        'title': movie_details.get('title'),
        'vote_average': movie_details.get('vote_average'),
        'vote_count': movie_details.get('vote_count'),
        'budget': movie_details.get('budget'),
        'overview': movie_details.get('overview'),
        'tagline': movie_details.get('tagline'),
        'genres': names(movie_details.get('genres')),
        'keywords': names(keywords),
        'production_companies': names(movie_details.get('production_companies')),
        'production_countries': names(movie_details.get('production_countries')),
        'spoken_languages': names(movie_details.get('spoken_languages')),
    }


def _review_rows(movie_id, movie_reviews):
    """Flatten a reviews payload into one dict per review for the reviews CSV."""
    rows = []
    for review in movie_reviews.get('results') or []:
        # A review without 'author_details' yields rating=None instead of
        # raising KeyError and aborting the whole run.
        author_details = review.get('author_details') or {}
        rows.append({
            'movie_id': movie_id,
            'author': review.get('author'),
            'content': review.get('content'),
            'rating': author_details.get('rating'),
        })
    return rows


def main(movie_ids):
    """Collect movie details and user reviews for every ID in ``movie_ids``.

    Args:
        movie_ids: Iterable of TMDB movie identifiers.

    Returns:
        A ``(movies_data, reviews_data)`` tuple of lists of dicts. Each movie
        dict has the keys id, title, vote_average, vote_count, budget,
        overview, tagline, genres, keywords, production_companies,
        production_countries and spoken_languages; each review dict has
        movie_id, author, content and rating.
    """
    movies_data = []
    reviews_data = []

    for movie_id in movie_ids:
        movie_details = get_movie_details(movie_id)

        # Skip any payload that does not describe a movie. This covers the
        # invalid-ID error (status_code 34) as well as every other error
        # response, which would otherwise become an all-None CSV row.
        if movie_details.get('id') is None:
            # A request was still made, so keep pacing the API.
            time.sleep(0.25)
            continue

        movie_reviews = get_movie_reviews(movie_id)

        movies_data.append(_movie_row(movie_details))
        reviews_data.extend(_review_rows(movie_id, movie_reviews))

        # Sleep to avoid hitting the rate limit
        time.sleep(0.25)

    return movies_data, reviews_data

def get_top_movie_ids(total_movies=10000, timeout=10):
    """Collect movie IDs from TMDB's popular-movies listing.

    Pages are requested one at a time, 20 results per page being assumed
    (the page size this function has always used for its page arithmetic).

    Args:
        total_movies: Number of movies to aim for. The page count is rounded
            up so that a value that is not a multiple of 20 still fetches
            enough pages.
        timeout: Seconds to wait on the server before ``requests`` raises a
            timeout error instead of blocking indefinitely.

    Returns:
        A list of unique movie IDs in the order they were first seen. It may
        be shorter than ``total_movies`` when pages fail or repeat entries.
    """
    page_size = 20
    # Ceiling division: 10 movies still needs 1 page, not 0.
    page_count = -(-total_movies // page_size)

    movie_ids = []
    seen = set()
    for page in range(1, page_count + 1):
        url = f'{BASE_URL}/movie/popular'
        params = {
            'api_key': API_KEY,
            'language': 'en-US',
            'page': page,
        }
        response = requests.get(url, params=params, timeout=timeout)
        if response.status_code == 200:
            for movie in response.json().get('results', []):
                movie_id = movie['id']
                # The same movie can show up on more than one page; keeping
                # duplicates would fetch and store it twice downstream.
                if movie_id not in seen:
                    seen.add(movie_id)
                    movie_ids.append(movie_id)
        else:
            print(f'Error fetching popular movies page {page}: {response.status_code}')
        time.sleep(0.25)  # Respect the API rate limits
    return movie_ids

def write_to_csv(filepath, header, data):
    """Write an iterable of dict rows to a UTF-8 CSV file with a header line.

    Args:
        filepath: Destination path; an existing file is overwritten.
        header: Column names, used both for the header line and as the keys
            looked up in each row.
        data: Iterable of dicts, one per output row.
    """
    with open(filepath, 'w', newline='', encoding='utf-8') as handle:
        out = csv.DictWriter(handle, fieldnames=header)
        out.writeheader()
        for record in data:
            out.writerow(record)

if __name__ == '__main__':
    # Output locations and column layouts for the two CSV files.
    movie_file_path = 'movies_data.csv'
    reviews_file_path = 'reviews_data.csv'
    movie_header = [
        'id', 'title', 'vote_average', 'vote_count', 'budget', 'overview',
        'tagline', 'genres', 'keywords', 'production_companies',
        'production_countries', 'spoken_languages',
    ]
    reviews_header = ['movie_id', 'author', 'content', 'rating']

    # Gather the IDs first, then the details and reviews for each of them.
    movie_ids = get_top_movie_ids(10000)
    movie_data, reviews_data = main(movie_ids)

    # Persist both tables before reporting on either.
    write_to_csv(movie_file_path, movie_header, movie_data)
    write_to_csv(reviews_file_path, reviews_header, reviews_data)

    print(f'Movie data written to {movie_file_path}')
    print(f'Review data written to {reviews_file_path}')

Movie data written to movies_data.csv
Review data written to reviews_data.csv


In [None]:
import requests
import pandas as pd
import time

# Your TMDB API key here
API_KEY = ''

BASE_URL = 'https://api.themoviedb.org/3'

def get_movie_credits(movie_id, timeout=10):
    """Fetch the credits (cast and crew) payload for one movie from TMDB.

    Args:
        movie_id: TMDB movie identifier, interpolated into the request path.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None``. Failures are
        reported with ``print`` rather than raised, so one bad request does
        not abort a long enrichment loop.
    """
    url = f'{BASE_URL}/movie/{movie_id}/credits'
    params = {
        'api_key': API_KEY,
        'language': 'en-US',
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Transport failures (timeout, connection reset, ...) follow the same
        # "print and return None" contract as a non-200 status.
        print(f'Error fetching credits for movie ID {movie_id}: {exc}')
        return None

    if response.status_code != 200:
        print(f'Error fetching credits for movie ID {movie_id}: {response.status_code}')
        return None
    return response.json()

# Read the existing CSV movie data
df_movies = pd.read_csv('movies_data.csv')

# Define new columns for cast and director
df_movies['cast'] = None
df_movies['director'] = None

# Fetch and add cast and director information for each movie
for index, row in df_movies.iterrows():
    # A single blank id cell makes pandas load the whole column as float, so
    # ids would otherwise be formatted like "429189.0" in the request URL.
    # Rows without an id cannot be looked up at all and are skipped.
    if pd.isna(row['id']):
        continue
    movie_id = int(row['id'])

    movie_credits = get_movie_credits(movie_id)
    if movie_credits:
        # "Name (Character)" for every cast entry, comma separated.
        cast = ', '.join(
            f"{member['name']} ({member['character']})"
            for member in movie_credits.get('cast', [])
        )
        # Only crew entries whose job is exactly 'Director' are kept.
        director = ', '.join(
            person['name']
            for person in movie_credits.get('crew', [])
            if person.get('job') == 'Director'
        )
        df_movies.at[index, 'cast'] = cast
        df_movies.at[index, 'director'] = director
        print(f'Updated movie ID: {movie_id}')
    time.sleep(0.25)  # Respect the API rate limit

# Save the enriched data to a new CSV; the input file is left untouched
df_movies.to_csv('movies_data_updated.csv', index=False)

print('Updated movie data written to movies_data_updated.csv')

Streaming output truncated to the last 5000 lines.
Updated movie ID: 429189
Updated movie ID: 118926
Updated movie ID: 77016
Updated movie ID: 1037858
Updated movie ID: 9032
Updated movie ID: 10246
Updated movie ID: 417870
Updated movie ID: 5725
Updated movie ID: 274181
Updated movie ID: 48340
Updated movie ID: 683340
Updated movie ID: 38765
Updated movie ID: 4970
Updated movie ID: 14534
Updated movie ID: 76489
Updated movie ID: 517814
Updated movie ID: 44727
Updated movie ID: 111972
Updated movie ID: 134411
Updated movie ID: 821669
Updated movie ID: 2309
Updated movie ID: 33875
Updated movie ID: 1122932
Updated movie ID: 11362
Updated movie ID: 10426
Updated movie ID: 246355
Updated movie ID: 15363
Updated movie ID: 11202
Updated movie ID: 3512
Updated movie ID: 979097
Updated movie ID: 887580
Updated movie ID: 477072
Updated movie ID: 16290
Updated movie ID: 269
Updated movie ID: 484247
Updated movie ID: 137106
Updated movie ID: 689723
Updated movie ID: 611914
Updated m