In [1]:
import os, time, json
import tmdbsimple as tmdb 
import pandas as pd
from tqdm.notebook import tqdm_notebook

FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['.DS_Store',
 'title.basics.tsv.gz',
 'final_tmdb_data_2002.csv.gz',
 'title.ratings.tsv.gz',
 'tmdb_api_results_[2001, 2002].json',
 'title-akas-us-only.csv',
 'final_tmdb_data_2001.csv.gz',
 '.ipynb_checkpoints',
 'tmdb.csv.gz']

In [2]:
basics = pd.read_csv("Data/title.basics.tsv.gz", sep='\t', low_memory=False)
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10017006,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
10017007,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
10017008,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
10017009,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [3]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [4]:
import requests

def imdb_to_tmdb(imdb_id):
    mapping_url = f'https://api.themoviedb.org/3/find/{imdb_id}?api_key=YOUR_API_KEY&external_source=imdb_id'
    response = requests.get(mapping_url)
    data = response.json()

    tmdb_id = data.get('movie_results', [{}])[0].get('id')

    return tmdb_id

def get_movie_info(imdb_id):
    tmdb_id = imdb_to_tmdb(imdb_id)
    
imdb_id_avengers = 'tt0848228'
imdb_id_notebook = 'tt0332280'

movie_info_avengers = get_movie_info(imdb_id_avengers)
movie_info_notebook = get_movie_info(imdb_id_notebook)

print("The Avengers Movie Info:")
print(movie_info_avengers)

print("\nThe Notebook Movie Info:")
print(movie_info_notebook)

The Avengers Movie Info:
None

The Notebook Movie Info:
None


In [5]:
def get_movie_with_rating(movie_id):
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    # save the .info .releases dictionaries
    movie_info = movie.info()
    
    releases = movie.releases()
    # Loop through countries in releases
    for c in releases['countries']:
        # if the country abbreviation==US
        if c['iso_3166_1' ] =='US':
            ## save a "certification" key in the info dict with the certification
            movie_info['certification'] = c['certification']
    return movie_info

In [6]:
filtered_basics = basics[(basics['startYear'] == '2001') | (basics['startYear'] == '2002')]
filtered_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
33802,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001,\N,20,Short
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,\N,118,"Comedy,Fantasy,Romance"
92620,tt0094718,short,Beavers,Beavers,0,2002,\N,31,"Documentary,Short"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,\N,126,Drama
104785,tt0107194,short,Imagine,Imagine,0,2002,\N,22,"Documentary,Short"
...,...,...,...,...,...,...,...,...,...
10016118,tt9914870,tvEpisode,Episode #1.967,Episode #1.967,0,2001,\N,22,"Drama,Fantasy,Romance"
10016119,tt9914872,tvEpisode,Episode #1.968,Episode #1.968,0,2001,\N,22,"Drama,Fantasy,Romance"
10016122,tt9914878,tvEpisode,Episode #1.969,Episode #1.969,0,2001,\N,22,"Drama,Fantasy,Romance"
10016635,tt9916064,videoGame,AeroWings,Aero Dancing i,0,2001,\N,\N,\N


In [7]:
basics_filtered= 'Data/filtered_basics.csv.gz'
filtered_basics.to_csv(basics_filtered, index=False)

In [8]:
years_2001 = basics[(basics['startYear'] == '2001')]
years_2001

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
33802,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001,\N,20,Short
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,\N,118,"Comedy,Fantasy,Romance"
110969,tt0113545,short,The Killing Seasons,The Killing Seasons,0,2001,\N,9,"Action,Drama,Short"
111356,tt0113946,short,Negocios,Negocios,0,2001,\N,17,Short
111842,tt0114447,movie,The Silent Force,The Silent Force,0,2001,\N,90,Action
...,...,...,...,...,...,...,...,...,...
10016116,tt9914866,tvEpisode,Episode #1.964,Episode #1.964,0,2001,\N,22,"Drama,Fantasy,Romance"
10016118,tt9914870,tvEpisode,Episode #1.967,Episode #1.967,0,2001,\N,22,"Drama,Fantasy,Romance"
10016119,tt9914872,tvEpisode,Episode #1.968,Episode #1.968,0,2001,\N,22,"Drama,Fantasy,Romance"
10016122,tt9914878,tvEpisode,Episode #1.969,Episode #1.969,0,2001,\N,22,"Drama,Fantasy,Romance"


In [9]:
years_2002 = basics[(basics['startYear'] == '2002')]
years_2002

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
92620,tt0094718,short,Beavers,Beavers,0,2002,\N,31,"Documentary,Short"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,\N,126,Drama
104785,tt0107194,short,Imagine,Imagine,0,2002,\N,22,"Documentary,Short"
106703,tt0109173,movie,Auf allen Meeren,Auf allen Meeren,0,2002,\N,95,Documentary
110356,tt0112912,movie,Dune 7,Dune 7,0,2002,\N,97,Adventure
...,...,...,...,...,...,...,...,...,...
10016013,tt9914632,tvEpisode,Episode dated 23 March 2002,Episode dated 23 March 2002,0,2002,\N,\N,Talk-Show
10016014,tt9914634,tvEpisode,Episode dated 30 March 2002,Episode dated 30 March 2002,0,2002,\N,\N,Talk-Show
10016017,tt9914640,tvEpisode,Episode dated 26 January 2002,Episode dated 26 January 2002,0,2002,\N,\N,Talk-Show
10016021,tt9914648,tvEpisode,Episode dated 12 January 2002,Episode dated 12 January 2002,0,2002,\N,\N,Talk-Show


In [10]:
file_path_2001= 'Data/final_tmdb_data_2001.csv.gz'
years_2001.to_csv(file_path_2001, index=False)

file_path_2002= 'Data/final_tmdb_data_2002.csv.gz'
years_2002.to_csv(file_path_2002, index=False)

In [11]:
YEARS_TO_GET = [2001, 2002]
errors = [ ]

for YEAR in tqdm_notebook(years_to_get, desc='YEARS', position=0):
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEARS_TO_GET}.json'
    file_exists = os.path.isfile(JSON_FILE)

    if not file_exists:
        # Print a message indicating the file is being created 
        print(f"Creating {JSON_FILE} for API results for {YEARS_TO_GET}.")
        # Save an empty dict with just "imdb_id" to the new json file.
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)

    # Your code to process existing_data goes here
    with open(JSON_FILE, 'r') as json_file:
        existing_data = json.load(json_file)

NameError: name 'years_to_get' is not defined

In [None]:
JSON_FILE = f'{FOLDER}tmdb_api_results_{years_to_get}.json'
file_exists = os.path.isfile(JSON_FILE)

In [None]:
if not file_exists:
    print(f"Creating {JSON_FILE} for API results for {YEARS_TO_GET}.")
    with open(JSON_FILE,'w') as f:
        json.dump([{'imdb_id': 0}], f)

In [None]:
try:
    # Retrieve then data for the movie id
    temp = get_movie_with_rating(movie_ids)  
    # Append/extend results to existing file using a pre-made function
    write_json(temp,JSON_FILE)
    # Short 20 ms sleep to prevent overwhelming server
    time.sleep(0.02)
    
except Exception as e:
    errors.append([movie_ids, e])

In [None]:
print(f"- Total errors: {len(errors)}")

In [None]:
import glob

tmdb_files = sorted(glob.glob("Data/final_tmdb_data*.csv.gz"))
tmdb_files

In [None]:
df = pd.concat([pd.read_csv(f) for f in tmdb_files] )
df