In [39]:
import pandas as pd
import json

from themoviedb import TMDb

from requests import HTTPError

In [40]:
# helper functions

def castSubset(cast_data, top_n = 3):
    """Return the names of the most relevant actors of a movie.

    Parameters
    ----------
    cast_data : iterable or None
        Cast entries pulled from TMDb credits. Every entry must expose an
        ``order`` attribute (entries are ranked by ascending ``order``) and
        a ``name`` attribute. ``None`` is treated as an empty cast.
    top_n : int, default 3
        Maximum number of names to return; negative values are treated as 0.

    Returns
    -------
    list of str
        Up to ``top_n`` actor names, lowest ``order`` first. Entries with the
        same ``order`` keep their original relative position.
    """
    if cast_data is None:
        return []

    # None cannot be compared with ints, so entries without an order are ranked last
    by_order = sorted(cast_data, key = lambda member: (member.order is None, member.order or 0))

    return [member.name for member in by_order[:max(top_n, 0)]]
  

In [41]:
# read API key to access TMDb
# strip surrounding whitespace: read() returns the file content verbatim, so a trailing
# newline in the key file would otherwise become part of the key
with open("TMDb_key.txt") as my_key:
    api_key = my_key.read().strip()

In [42]:
# Collect the ids of the movies stored by previous runs, so they are not pulled again.
# The database file is expected to hold one JSON object per line, each with an "id" key.
pulled_ids = set()

try:
    with open("../datasets/tmdb/movie_db", "r") as movies_file:
        # iterate lazily instead of loading the whole file into memory with readlines()
        for line in movies_file:
            # tolerate blank lines rather than crashing in json.loads
            if not line.strip():
                continue
            pulled_ids.add(json.loads(line)["id"])

except FileNotFoundError:
    # no database file yet: nothing has been pulled, keep the empty set
    pass

In [None]:
# Download the details of every movie id in the TMDb export that has not been pulled yet
# and append them to the local database, one JSON object per line.

# Create the API client in this cell: the notebook's other `TMDb(...)` initialization
# sits in a later cell, so on a fresh top-to-bottom run `tmdb` would be undefined here.
tmdb = TMDb(key = api_key)

# Read the id export before touching the output file, so a missing export
# does not leave behind an empty, freshly created database file.
with open("../datasets/tmdb/movie_ids_03_06_2024.json", "r") as ids_file:
    lines = ids_file.readlines()

with open("../datasets/tmdb/movie_db", "a") as outfile:

    for line in lines:
        # skip blank lines instead of crashing in json.loads
        if not line.strip():
            continue

        tmdb_id = json.loads(line)["id"]

        # check if the movie id has already been used
        if tmdb_id in pulled_ids:
            continue

        try:
            # get the movie from tmdb by tmdb_id using the API, credits included
            movie_tmdb_data = tmdb.movie(movie_id = tmdb_id).details(append_to_response = "credits")

        except HTTPError:
            # best effort: the id is not recorded in pulled_ids, so it is retried on the next run
            # NOTE(review): this also hides authentication / rate-limit errors — confirm that is intended
            continue

        # dictionary with the relevant features: title, summary, cast subset, rating
        movie_ex = {"id": tmdb_id,
                    "title": movie_tmdb_data.title,
                    "summary": movie_tmdb_data.overview,
                    "cast": castSubset(movie_tmdb_data.credits.cast),
                    "rating": movie_tmdb_data.vote_average}

        # dump to file
        outfile.write(json.dumps(movie_ex) + "\n")

        # remember the id so a duplicate later in the export is not fetched again
        pulled_ids.add(tmdb_id)
    

In [47]:
# number of movie ids known locally: those loaded from the movie_db file plus those added by the download loop
len(pulled_ids)

246084

In [17]:
# initialize a TMDb object and set the API key
# `api_key` is the string read from TMDb_key.txt in an earlier cell;
# the cells below use the resulting `tmdb` client to query the API
tmdb = TMDb(key = api_key)

In [19]:
# fetch the details of the movie with TMDb id 2, asking for its credits in the same request,
# then display the returned object to inspect the available fields
movie = tmdb.movie(movie_id = 2).details(append_to_response = "credits")
movie

Movie(id=2, poster_path='/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg', adult=False, overview='After the coal mine he works at closes and his father commits suicide, a Finnish man leaves for the city to make a living but there, he is framed and imprisoned for various crimes.', release_date=datetime.date(1988, 10, 21), genre_ids=None, original_title='Ariel', original_language='fi', title='Ariel', backdrop_path='/hQ4pYsIbP22TMXOUdSfC2mjWrO0.jpg', popularity=11.657, vote_count=304, video=False, vote_average=7.082, media_type=<MediaType.movie: 'movie'>, belongs_to_collection=None, budget=0, genres=[Genre(id=18, name='Drama'), Genre(id=35, name='Comedy'), Genre(id=10749, name='Romance'), Genre(id=80, name='Crime')], homepage='', imdb_id='tt0094675', production_companies=[Company(id=2303, logo_path=None, name='Villealfa Filmproductions', description=None, headquarters=None, homepage=None, origin_country='FI', parent_company=None)], production_countries=[Country(iso_3166_1='FI', name='Finland')], revenue

In [32]:
# title of the example movie; the download loop stores this field as "title"
movie.title

'Ariel'

In [33]:
# genres of the example movie (a list of Genre objects with `id` and `name`, see output)
movie.genres

[Genre(id=18, name='Drama'),
 Genre(id=35, name='Comedy'),
 Genre(id=10749, name='Romance'),
 Genre(id=80, name='Crime')]

In [29]:
# average vote of the example movie; the download loop stores this field as "rating"
movie.vote_average

7.082

In [22]:
# plot overview of the example movie; the download loop stores this field as "summary"
movie.overview

'After the coal mine he works at closes and his father commits suicide, a Finnish man leaves for the city to make a living but there, he is framed and imprisoned for various crimes.'

In [27]:
# raw cast list of the example movie; each entry carries the `name` and `order` attributes read by castSubset
movie.credits.cast

[Cast(id=54768, adult=False, gender=2, known_for_department='Acting', name='Turo Pajala', original_name='Turo Pajala', popularity=2.378, profile_path='/b6JdzqTn6UFYf4DouHbbE8Ypk4r.jpg', cast_id=3, character='Taisto Kasurinen', credit_id='52fe420dc3a36847f8000029', order=0, total_episode_count=None, roles=None),
 Cast(id=54769, adult=False, gender=1, known_for_department='Acting', name='Susanna Haavisto', original_name='Susanna Haavisto', popularity=3.112, profile_path='/lhjoqlBFCvshchMAd2lQLWV7z7I.jpg', cast_id=4, character='Irmeli Pihlaja', credit_id='52fe420dc3a36847f800002d', order=1, total_episode_count=None, roles=None),
 Cast(id=4826, adult=False, gender=2, known_for_department='Acting', name='Matti Pellonpää', original_name='Matti Pellonpää', popularity=4.581, profile_path='/1Qzhkkp3wFE2NMGKVyOra9931q2.jpg', cast_id=5, character='Mikkonen', credit_id='52fe420dc3a36847f8000031', order=2, total_episode_count=None, roles=None),
 Cast(id=54770, adult=False, gender=2, known_for_depar

In [26]:
# sanity check of the helper on the example movie: expect the names of the three entries with the lowest `order`
castSubset(movie.credits.cast)

['Turo Pajala', 'Susanna Haavisto', 'Matti Pellonpää']