In [1]:
import os, time, json
import tmdbsimple as tmdb 
import pandas as pd
from tqdm.notebook import tqdm_notebook

FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['.DS_Store',
 'title.basics.tsv.gz',
 'tmdb_api_results_2001.json',
 'final_tmdb_data_2002.csv.gz',
 'title.ratings.tsv.gz',
 'tmdb_api_results_2002.json',
 'tmdb_api_results_[2001, 2002].json',
 'title-akas-us-only.csv',
 '.ipynb_checkpoints']

In [2]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [3]:
import requests

def imdb_to_tmdb(imdb_id):
    mapping_url = f'https://api.themoviedb.org/3/find/{imdb_id}?api_key=YOUR_API_KEY&external_source=imdb_id'
    response = requests.get(mapping_url)
    data = response.json()

    tmdb_id = data.get('movie_results', [{}])[0].get('id')

    return tmdb_id

def get_movie_info(imdb_id):
    tmdb_id = imdb_to_tmdb(imdb_id)
    
imdb_id_avengers = 'tt0848228'
imdb_id_notebook = 'tt0332280'

movie_info_avengers = get_movie_info(imdb_id_avengers)
movie_info_notebook = get_movie_info(imdb_id_notebook)

print("The Avengers Movie Info:")
print(movie_info_avengers)

print("\nThe Notebook Movie Info:")
print(movie_info_notebook)

The Avengers Movie Info:
None

The Notebook Movie Info:
None


In [4]:
basics = pd.read_csv("Data/title.basics.tsv.gz", sep='\t', low_memory=False)
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10017006,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
10017007,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
10017008,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
10017009,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [5]:
YEARS_TO_GET = [2001, 2002]

In [6]:
errors = [ ]

In [7]:
JSON_FILE = f'{FOLDER}tmdb_api_results_{YEARS_TO_GET}.json'
file_exists = os.path.isfile(JSON_FILE)

In [8]:
if not file_exists:
    print(f"Creating {JSON_FILE} for API results for {YEAR}.")
    with open(JSON_FILE,'w') as f:
        json.dump([{'imdb_id': 0}], f)

In [9]:
df = basics.loc[basics['startYear'].isin(YEARS_TO_GET)].copy()

movie_ids = df['tconst']

In [10]:
previous_df = pd.read_json(JSON_FILE)

In [11]:
YEARS_TO_GET = [2001, 2002]

for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    file_exists = os.path.isfile(JSON_FILE)

    if not file_exists:
        # Print a message indicating the file is being created 
        print(f"Creating {JSON_FILE} for API results for {YEAR}.")
        # Save an empty dict with just "imdb_id" to the new json file.
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)

    # Your code to process existing_data goes here
    with open(JSON_FILE, 'r') as json_file:
        existing_data = json.load(json_file)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
try:
    # Retrieve then data for the movie id
    temp = get_movie_with_rating(movie_ids)  
    # Append/extend results to existing file using a pre-made function
    write_json(temp,JSON_FILE)
    # Short 20 ms sleep to prevent overwhelming server
    time.sleep(0.02)
    
except Exception as e:
    errors.append([movie_ids, e])

In [13]:
print(f"- Total errors: {len(errors)}")

- Total errors: 1


In [None]:
import glob

tmdb_files = sorted(glob.glob("Data/final_tmdb_data*.csv.gz"))
tmdb_files

In [None]:
df = pd.concat([pd.read_csv(f, lineterminator='\n') for f in files] )