In [122]:
import pandas as pd
import json

from themoviedb import TMDb
import dataclasses

from requests import HTTPError

In [123]:
# helper functions

def castSubset(cast_data, n = 3):
    """Return the names of the first ``n`` cast members by ascending ``order``.

    Parameters
    ----------
    cast_data : iterable or None
        Cast entries pulled from TMDb credits; every entry must expose the
        attributes ``order`` and ``name``.
    n : int, optional
        Maximum number of names to return. Defaults to 3.

    Returns
    -------
    list of str
        A list (not a set) of names, ordered by ascending ``order``. It is
        shorter than ``n`` when fewer entries are available and empty when
        ``cast_data`` is ``None`` or empty.
    """
    # a movie without cast information yields no names instead of a TypeError
    if cast_data is None:
        return []

    # entries with the smallest `order` come first and are treated as the most
    # relevant ones; sorted() is stable, so ties keep their input order
    sorted_cast = sorted(cast_data, key = lambda member: member.order)

    # max() guards against a negative n silently slicing from the end
    return [member.name for member in sorted_cast[:max(n, 0)]]

def genresList(genres_data):
    """Collect the ``name`` attribute of every entry in ``genres_data``.

    The names are returned as a list, in the same order as the input.
    """
    names = []
    for entry in genres_data:
        names.append(entry.name)
    return names

def languageIso(spoken_languages):
    """Return the ``iso_639_1`` code of every entry in ``spoken_languages``.

    The codes are returned as a list, in the same order as the input.
    """
    return list(map(lambda entry: entry.iso_639_1, spoken_languages))
    
    

In [124]:
# read the API key used to access TMDb
with open("TMDb_key.txt") as my_key:
    # strip surrounding whitespace: a trailing newline in the key file would
    # otherwise become part of the key that is handed to TMDb
    api_key = my_key.read().strip()

In [125]:
# Collect the ids of the movies that are already stored in the local dump, so
# that the download loop can skip them and resume after an earlier run.
pulled_ids = set()

try:
    with open("../datasets/tmdb/movie_db", "r") as movies_file:
        # the dump holds one JSON object per line; iterate over the file
        # lazily instead of loading every line into memory with readlines()
        for line in movies_file:
            if not line.strip():
                # tolerate blank lines instead of crashing inside json.loads
                continue

            pulled_ids.add(json.loads(line)["id"])

except FileNotFoundError:
    # no dump yet (first run): nothing has been pulled so far
    pass

In [None]:
# Create the API client in this cell so the loop does not rely on a later cell
# having been executed first (on a fresh kernel `tmdb` would be undefined here).
tmdb = TMDb(key = api_key)

# Append one JSON line per newly downloaded movie to the local dump.
with open("../datasets/tmdb/movie_db", "a") as outfile:

    with open("../datasets/tmdb/movie_ids_03_06_2024.json", "r") as ids_file:

        # the ids file holds one JSON object per line; stream it instead of
        # loading every line into memory with readlines()
        for line in ids_file:

            # tolerate blank lines instead of crashing inside json.loads
            if not line.strip():
                continue

            tmdb_id = json.loads(line)["id"]

            # skip ids that are already in the dump or were handled earlier in this run
            if tmdb_id in pulled_ids:
                continue

            try:
                # fetch the movie details together with its credits in one API call
                movie_tmdb_data = tmdb.movie(movie_id = tmdb_id).details(append_to_response = "credits")

            except HTTPError:
                # best effort: skip movies the API fails to return; the id is
                # not added to pulled_ids, so a later run tries it again
                continue

            # extract the features of interest from the movie object
            title = movie_tmdb_data.title
            summary = movie_tmdb_data.overview
            genres = genresList(movie_tmdb_data.genres)
            spoken_languages = languageIso(movie_tmdb_data.spoken_languages)
            rating = movie_tmdb_data.vote_average
            cast = castSubset(movie_tmdb_data.credits.cast)

            # dictionary with the relevant features
            movie_ex = {"id": tmdb_id,
                        "title": title,
                        "summary": summary,
                        "genres": genres,
                        "spoken_languages": spoken_languages,
                        "cast": cast,
                        "rating": rating}

            # dump to file, one JSON object per line
            outfile.write(json.dumps(movie_ex) + "\n")

            # remember the id so duplicates in the ids file are not fetched twice
            pulled_ids.add(tmdb_id)
    

In [126]:
# display the set of movie ids currently marked as already pulled
pulled_ids

set()

In [60]:
# Build the TMDb API client from the key read from TMDb_key.txt.
# The `tmdb` name is what the other cells use to query the API.
tmdb = TMDb(key = api_key)

In [111]:
# Fetch a single sample movie (TMDb id 705996) together with its credits,
# then display the returned object to explore its structure.
movie = tmdb.movie(movie_id = 705996).details(append_to_response = "credits")
movie

Movie(id=705996, poster_path='/N0rskx91Eh6aWjvBybeY6epNic.jpg', adult=False, overview='From a mountain peak in South Korea, a man plummets to his death. Did he jump, or was he pushed? When detective Hae-joon arrives on the scene, he begins to suspect the dead man’s wife Seo-rae. But as he digs deeper into the investigation, he finds himself trapped in a web of deception and desire.', release_date=datetime.date(2022, 6, 29), genre_ids=None, original_title='헤어질 결심', original_language='ko', title='Decision to Leave', backdrop_path='/A1bWhTFQKkhF1yhSKWosSyzn2Hp.jpg', popularity=64.412, vote_count=1040, video=False, vote_average=7.4, media_type=<MediaType.movie: 'movie'>, belongs_to_collection=None, budget=0, genres=[Genre(id=53, name='Thriller'), Genre(id=9648, name='Mystery'), Genre(id=10749, name='Romance')], homepage='', imdb_id='tt12477480', production_companies=[Company(id=34725, logo_path='/cRqXC6Dy6AvTGS2wiMt26yGJgZA.png', name='Moho Film', description=None, headquarters=None, homep

In [112]:
# overview text of the sample movie (the field stored as "summary" in the dump)
movie.overview

'From a mountain peak in South Korea, a man plummets to his death. Did he jump, or was he pushed? When detective Hae-joon arrives on the scene, he begins to suspect the dead man’s wife Seo-rae. But as he digs deeper into the investigation, he finds himself trapped in a web of deception and desire.'

In [113]:
# original language code of the sample movie
movie.original_language

'ko'

In [117]:
# spoken languages of the sample movie, as returned by the API (a list of Language objects)
movie.spoken_languages

[Language(english_name='English', iso_639_1='en', name='English'),
 Language(english_name='Mandarin', iso_639_1='zh', name='普通话'),
 Language(english_name='Korean', iso_639_1='ko', name='한국어/조선말')]

In [120]:
# sanity check of the helper on the sample movie: expect the list of iso_639_1 codes
languageIso(movie.spoken_languages)

['en', 'zh', 'ko']

In [92]:
# print the name of every dataclass field available on the sample movie object
movie_fields = dataclasses.fields(movie)
for field in movie_fields:
    print(field.name)

id
poster_path
adult
overview
release_date
genre_ids
original_title
original_language
title
backdrop_path
popularity
vote_count
video
vote_average
media_type
belongs_to_collection
budget
genres
homepage
imdb_id
production_companies
production_countries
revenue
runtime
spoken_languages
status
tagline
alternative_titles
credits
external_ids
images
keywords
recommendations
release_dates
reviews
similar
translations
videos
watch_providers
