In [6]:
from tmdbv3api import TMDb, Movie, Discover
import pandas as pd
import time

# Initialize TMDB API
tmdb = TMDb()
tmdb.api_key = '5d805336681cf24af0e0a0be49aa4f2f'

# TMDB objects
movie_api = Movie()
discover_api = Discover()

# Function to get movie details, including cast, crew, and keywords
def get_movie_details(movie_id):
    try:
        # Fetch movie details
        movie = movie_api.details(movie_id)
        
        # Fetch credits (cast and crew)
        credits = movie_api.credits(movie_id)
        
        # Fetch keywords
        keywords = movie_api.keywords(movie_id)
        
        # Extract relevant details
        movie_data = {
            'id': movie.id,
            'title': movie.title,
            'language': movie.original_language,
            'overview': movie.overview,
            'release_date': movie.release_date,
            'popularity': movie.popularity,
            'vote_average': movie.vote_average,
            'vote_count': movie.vote_count,
            'crew': credits['crew'] if 'crew' in credits else [],
            'cast': credits['cast'] if 'cast' in credits else [],
            'keywords': [kw['name'] for kw in keywords['keywords']] if 'keywords' in keywords else []
        }
        return movie_data
    except Exception as e:
        print(f"Error fetching movie {movie_id}: {e}")
        return None

# Function to get movies by language
def get_movies_by_language(language_code, total_movies):
    movies_data = []
    page = 1
    count = 0
    while count < total_movies:
        try:
            # Discover movies by language, paginated
            movies = discover_api.discover_movies({
                "with_original_language": language_code,
                "page": page,
                "sort_by": "popularity.desc"
            })
            
            # Fetch details for each movie
            for movie in movies:
                if count >= total_movies:
                    break
                movie_details = get_movie_details(movie.id)
                if movie_details:
                    movies_data.append(movie_details)
                    count += 1
                    #print(f"Fetched movie {count}/{total_movies} in {language_code}: {movie.title}")
            page += 1
            time.sleep(0.25)  # To avoid hitting rate limits
        except Exception as e:
            print(f"Error on page {page} for {language_code}: {e}")
            break
    return movies_data

# Fetch 10,000 English and 2,000 Hindi movies
english_movies = get_movies_by_language('en', 10000)
hindi_movies = get_movies_by_language('hi', 2000)

# Combine English and Hindi movies
all_movies = english_movies + hindi_movies

# Convert to a DataFrame
combined_df = pd.DataFrame(all_movies)

# Save the combined dataset to CSV
combined_df.to_csv('movies_df.csv', index=False)

print("Combined movie dataset saved successfully.")


Error fetching movie 957: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/957?api_key=5d805336681cf24af0e0a0be49aa4f2f&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000022A642CB4F0>: Failed to establish a new connection: [WinError 10055] An operation on a socket could not be performed because the system lacked sufficient buffer space or because a queue was full'))
Error fetching movie 1187107: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/1187107?api_key=5d805336681cf24af0e0a0be49aa4f2f&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000022A64327CD0>: Failed to establish a new connection: [WinError 10055] An operation on a soc

In [7]:
len(combined_df)

5343

In [8]:
combined_df

Unnamed: 0,id,title,language,overview,release_date,popularity,vote_average,vote_count,crew,cast,keywords
0,533535,Deadpool & Wolverine,en,A listless Wade Wilson toils away in civilian ...,2024-07-24,5099.804,7.700,4056,"[[adult, gender, id, known_for_department, nam...","[[adult, gender, id, known_for_department, nam...",[]
1,519182,Despicable Me 4,en,"Gru and Lucy and their girls—Margo, Edith and ...",2024-06-20,2524.611,7.142,1895,"[[adult, gender, id, known_for_department, nam...","[[adult, gender, id, known_for_department, nam...",[]
2,1114513,Speak No Evil,en,When an American family is invited to spend th...,2024-09-11,2281.291,7.343,524,"[[adult, gender, id, known_for_department, nam...","[[adult, gender, id, known_for_department, nam...",[]
3,877817,Wolfs,en,"Hired to cover up a high-profile crime, a fixe...",2024-09-20,1693.793,6.594,383,"[[adult, gender, id, known_for_department, nam...","[[adult, gender, id, known_for_department, nam...",[]
4,957452,The Crow,en,Soulmates Eric and Shelly are brutally murdere...,2024-08-21,1690.146,5.441,486,"[[adult, gender, id, known_for_department, nam...","[[adult, gender, id, known_for_department, nam...",[]
...,...,...,...,...,...,...,...,...,...,...,...
5338,3602,I'll Always Know What You Did Last Summer,en,Several teenagers in a small-town in Colorado ...,2006-06-24,19.865,4.201,376,"[[adult, gender, id, known_for_department, nam...","[[adult, gender, id, known_for_department, nam...",[]
5339,38321,Priest,en,"In an alternate world, humanity and vampires h...",2011-05-05,19.860,5.797,2228,"[[adult, gender, id, known_for_department, nam...","[[adult, gender, id, known_for_department, nam...",[]
5340,10371,For Richer or Poorer,en,"Brad Sexton and his wife, Caroline, are wealth...",1997-12-11,19.860,5.942,224,"[[adult, gender, id, known_for_department, nam...","[[adult, gender, id, known_for_department, nam...",[]
5341,17058,Scarlet Street,en,Cashier and part-time starving artist Christop...,1945-12-25,19.859,7.600,374,"[[adult, gender, id, known_for_department, nam...","[[adult, gender, id, known_for_department, nam...",[]


In [13]:
combined_df['crew'][0]

[{'adult': False, 'gender': 2, 'id': 17825, 'known_for_department': 'Directing', 'name': 'Shawn Levy', 'original_name': 'Shawn Levy', 'popularity': 23.335, 'profile_path': '/j1CXZgmfvFeD7S3PYtsEk8H3ebB.jpg', 'credit_id': '622bc4c8a579f9006f1f0a6d', 'department': 'Directing', 'job': 'Director'}, {'adult': False, 'gender': 2, 'id': 7932, 'known_for_department': 'Writing', 'name': 'Rhett Reese', 'original_name': 'Rhett Reese', 'popularity': 6.077, 'profile_path': '/8QjgT3ffjzSTfih5C0LpFEea5Ps.jpg', 'credit_id': '622be2de9a358d0071934f94', 'department': 'Writing', 'job': 'Writer'}, {'adult': False, 'gender': 2, 'id': 91269, 'known_for_department': 'Writing', 'name': 'Paul Wernick', 'original_name': 'Paul Wernick', 'popularity': 8.0, 'profile_path': '/12wCVgUkLv7RejadXXZrtL8Tj5N.jpg', 'credit_id': '622be2ea24f2ce001d7e0365', 'department': 'Writing', 'job': 'Writer'}, {'adult': False, 'gender': 2, 'id': 24192, 'known_for_department': 'Sound', 'name': 'Dave Jordan', 'original_name': 'Dave Jor