# Source of Data
The Movie Database (TMDB): https://www.themoviedb.org/

This notebook extracts TMDB data for movies released from 2002 to 2022.

# Import Libraries

In [1]:
import pandas as pd
import os, time,json
import tmdbsimple as tmdb 
from tqdm.notebook import tqdm_notebook
import seaborn as sns
import matplotlib.pyplot as plt

# API Credentials

In [2]:
# Resolve the credentials file relative to the current user's home directory
# rather than hard-coding one machine's absolute path, so the notebook runs
# for anyone who keeps the key at ~/.secret/tmdb_api.json.
CREDENTIALS_PATH = os.path.join(os.path.expanduser("~"), ".secret", "tmdb_api.json")

with open(CREDENTIALS_PATH, 'r') as f:
    login = json.load(f)

## Display the keys of the loaded dict (keys only, so the secret value is not shown)
login.keys()

dict_keys(['api-key'])

In [3]:
# Hand the key read from the credentials file to the tmdbsimple module.
tmdb.API_KEY = login["api-key"]

# Designate a folder

In [4]:
# Directory used for the files this notebook reads and writes.
FOLDER = "Data/"

# Create the directory on first run; nothing happens when it already exists.
os.makedirs(FOLDER, exist_ok=True)

# Show what is currently stored there.
os.listdir(FOLDER)

['.ipynb_checkpoints',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'final_tmdb_data_2002.csv.gz',
 'final_tmdb_data_2003.csv.gz',
 'final_tmdb_data_2004.csv.gz',
 'final_tmdb_data_2005.csv.gz',
 'final_tmdb_data_2006.csv.gz',
 'final_tmdb_data_2007.csv.gz',
 'final_tmdb_data_2008.csv.gz',
 'final_tmdb_data_2009.csv.gz',
 'final_tmdb_data_2010.csv.gz',
 'final_tmdb_data_2011.csv.gz',
 'final_tmdb_data_2012.csv.gz',
 'final_tmdb_data_2013.csv.gz',
 'final_tmdb_data_2014.csv.gz',
 'final_tmdb_data_2015.csv.gz',
 'final_tmdb_data_2016.csv.gz',
 'final_tmdb_data_2017.csv.gz',
 'final_tmdb_data_2018.csv.gz',
 'final_tmdb_data_2019.csv.gz',
 'final_tmdb_data_2020.csv.gz',
 'final_tmdb_data_2021.csv.gz',
 'final_tmdb_data_2022.csv.gz',
 'final_tmdb_data_range(2002, 2023).csv.gz',
 'title_akas_chunk_001.csv.gz',
 'title_akas_chunk_002.csv.gz',
 'title_akas_chunk_003.csv.gz',
 'title_akas_chunk_004.csv.gz',
 'title_akas_chunk_005.csv.gz',
 'title_akas_chunk_006.csv.gz',
 'ti

# Define Functions

## Movie Rating Function

In [5]:
def get_movie_with_rating(movie_id):
    """Fetch a movie's TMDB info dict and attach its US certification.

    Adapted from source = https://github.com/celiao/tmdbsimple

    Parameters
    ----------
    movie_id
        Identifier handed to ``tmdb.Movies`` (this notebook passes IMDb ids
        such as "tt0848228").

    Returns
    -------
    dict
        The dictionary returned by ``movie.info()``. A ``"certification"``
        key is added only when ``movie.releases()`` lists at least one
        country entry whose ``iso_3166_1`` is ``"US"``; otherwise the key is
        left as ``info()`` returned it (normally absent).

    Notes
    -----
    Any exception raised by the underlying tmdbsimple calls propagates to
    the caller.
    """
    # Get the object for the current id
    movie = tmdb.Movies(movie_id)

    # Save the .info and .releases dictionaries
    info = movie.info()
    releases = movie.releases()

    found_us_entry = False
    # Loop through countries in releases
    for country in releases['countries']:
        if country['iso_3166_1'] != 'US':
            continue
        certification = country['certification']
        # The list may hold several US entries (TODO confirm against the API
        # docs). Do not let an entry with an empty certification overwrite a
        # rating already found; an empty value is only stored when it comes
        # from the first US entry seen.
        if certification or not found_us_entry:
            info['certification'] = certification
        found_us_entry = True
    return info

## Append records to json file

In [6]:
def write_json(new_data, filename):
    """Append record(s) to the JSON array stored in an existing file.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Parameters
    ----------
    new_data : list or other JSON-serialisable object
        When both ``new_data`` and the file's content are lists, the items of
        ``new_data`` are added one by one (``extend``). Otherwise ``new_data``
        is added as a single element (``append``).
    filename : str
        Path of a file that already exists and contains valid JSON.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist (the file is opened with ``'r+'``).
    json.JSONDecodeError
        If the current file content is not valid JSON.
    AttributeError
        If the file holds a non-list JSON value that has no ``append``.
    """
    with open(filename, 'r+') as file:
        # First we load the existing data.
        file_data = json.load(file)
        # Use extend for list-into-list, append for everything else.
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind so the updated data overwrites the file from the start.
        file.seek(0)
        # Convert back to json.
        json.dump(file_data, file)
        # Cut off any bytes left over from the previous content; without
        # this, a shorter payload (e.g. the file was pretty-printed before)
        # leaves trailing garbage and the file stops being valid JSON.
        file.truncate()

# Load in the Title Basics data

In [7]:
# Read the title basics table from the shared data folder (FOLDER) instead of
# repeating the "Data/" literal, so moving the data directory needs one change.
basics = pd.read_csv(f"{FOLDER}title_basics.csv.gz")

# Create Required Lists for the Loop

In [13]:
# Years to request from the API: 2002 through 2022 inclusive
# (the stop value 2023 is excluded by range).
YEARS_TO_GET = list(range(2002, 2023))
YEARS_TO_GET

[2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020,
 2021,
 2022]

In [9]:
# Collects a [movie_id, exception] pair for every lookup that fails.
errors = list()

# Testing the "get_movie_with_rating" function

In [10]:
# Try get_movie_with_rating on two ids before starting the full extraction.
test_ids = ["tt0848228", "tt0332280"]
results = []

for movie_id in test_ids:
    try:
        movie_info = get_movie_with_rating(movie_id)
    except Exception as e:
        # Keep going; record which id failed together with the exception.
        errors.append([movie_id, e])
    else:
        results.append(movie_info)

# One row per successfully retrieved movie.
pd.DataFrame(results)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",220000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.marvel.com/movies/the-avengers,24428,tt0848228,en,The Avengers,...,1518815515,143,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Some assembly required.,The Avengers,False,7.708,28053,PG-13
1,False,/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg,,29000000,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",http://www.newline.com/properties/notebookthe....,11036,tt0332280,en,The Notebook,...,115603229,123,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Behind every great love is a great story.,The Notebook,False,7.879,10065,PG-13


In [11]:
# Report how many lookups have failed so far, then display the collected
# [movie_id, exception] pairs themselves.
print(f"- Number of errors: {len(errors)}")
errors

- Number of errors: 0


[]

# Extract TMDB Data for Each Year (with Progress Bars)

In [12]:
# OUTER loop: one pass per year in YEARS_TO_GET
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    # JSON file that accumulates the API results for this year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    # Check if the file exists
    file_exists = os.path.isfile(JSON_FILE)
    # If it does not exist: create it
    if not file_exists:
        # Seed the new file with a one-element list holding a placeholder
        # record, so reading it back always yields an "imdb_id" column for
        # the "already retrieved" filter below.
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)

    # Rows of the title basics table whose startYear is the current year
    df = basics.loc[basics['startYear'] == YEAR].copy()

    # Ids of those titles (the tconst column), used as the API lookup keys
    movie_ids = df['tconst'].copy()

    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)

    # Filter out any ids that are already in the JSON_FILE, so re-running
    # the cell only requests what is still missing
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # INNER loop: one API lookup per remaining id
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            # Retrieve the data for the movie id
            temp = get_movie_with_rating(movie_id)
            # Append/extend results to existing file using a pre-made function
            write_json(temp, JSON_FILE)

        except Exception as e:
            # Best effort: remember the failure and move on to the next id
            errors.append([movie_id, e])

        finally:
            # Short 20 ms sleep to prevent overwhelming the server. It runs
            # in `finally` so failed requests are throttled as well; before,
            # an exception skipped the pause entirely.
            time.sleep(0.02)

    # Export everything collected for this year as a compressed csv
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz",
                         compression="gzip", index=False)



YEARS:   0%|          | 0/21 [00:00<?, ?it/s]

Movies from 2002:   0%|          | 0/1530 [00:00<?, ?it/s]

Movies from 2003:   0%|          | 0/1644 [00:00<?, ?it/s]

Movies from 2004:   0%|          | 0/1859 [00:00<?, ?it/s]

Movies from 2005:   0%|          | 0/2152 [00:00<?, ?it/s]

Movies from 2006:   0%|          | 0/2367 [00:00<?, ?it/s]

Movies from 2007:   0%|          | 0/2518 [00:00<?, ?it/s]

Movies from 2008:   0%|          | 0/2857 [00:00<?, ?it/s]

Movies from 2009:   0%|          | 0/3491 [00:00<?, ?it/s]

Movies from 2010:   0%|          | 0/3802 [00:00<?, ?it/s]

Movies from 2011:   0%|          | 0/4174 [00:00<?, ?it/s]

Movies from 2012:   0%|          | 0/4463 [00:00<?, ?it/s]

Movies from 2013:   0%|          | 0/4665 [00:00<?, ?it/s]

Movies from 2014:   0%|          | 0/4844 [00:00<?, ?it/s]

Movies from 2015:   0%|          | 0/4973 [00:00<?, ?it/s]

Movies from 2016:   0%|          | 0/5179 [00:00<?, ?it/s]

Movies from 2017:   0%|          | 0/5556 [00:00<?, ?it/s]

Movies from 2018:   0%|          | 0/5671 [00:00<?, ?it/s]

Movies from 2019:   0%|          | 0/5759 [00:00<?, ?it/s]

Movies from 2020:   0%|          | 0/4906 [00:00<?, ?it/s]

Movies from 2021:   0%|          | 0/4979 [00:00<?, ?it/s]

Movies from 2022:   0%|          | 0/4159 [00:00<?, ?it/s]

In [14]:
# Start from the previously combined results file in the shared data folder.
tmdb_data_all = pd.read_csv(f"{FOLDER}tmdb_results_combined.csv.gz")

# Remove the placeholder record(s) written when a results file is first
# created (imdb_id 0). Filtering by value instead of dropping "row 0" keeps
# real movies when the file has no placeholder and also removes placeholders
# that are not in the first row.
# NOTE(review): assumes the placeholder is read back as imdb_id "0" - confirm.
tmdb_data_all = tmdb_data_all.loc[tmdb_data_all["imdb_id"].astype(str) != "0"]
tmdb_data_all.head(2)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,


In [15]:
# Number of rows loaded from the combined file.
tmdb_data_all.shape[0]

2508

In [16]:
# Read every per-year file first and concatenate once at the end; calling
# pd.concat inside the loop re-copies the growing frame on each iteration.
yearly_frames = []
running_total = len(tmdb_data_all)

for i in YEARS_TO_GET:
    # Read from the same folder the extraction loop wrote to.
    df = pd.read_csv(f"{FOLDER}final_tmdb_data_{i}.csv.gz")
    # Drop the placeholder record (imdb_id 0) by value rather than by
    # position, so a file without it does not lose a real movie.
    df = df.loc[df["imdb_id"].astype(str) != "0"]
    yearly_frames.append(df)
    running_total += len(df)
    # year, rows added for that year, cumulative row count
    print(i, len(df), running_total)

tmdb_data_all = pd.concat([tmdb_data_all, *yearly_frames], ignore_index=True)

2002 1248 3756
2003 1287 5043
2004 1440 6483
2005 1567 8050
2006 235 8285
2007 1883 10168
2008 2102 12270
2009 2401 14671
2010 583 15254
2011 1884 17138
2012 3248 20386
2013 2664 23050
2014 3692 26742
2015 3744 30486
2016 3996 34482
2017 4368 38850
2018 4078 42928
2019 3863 46791
2020 3872 50663
2021 2519 53182
2022 2973 56155


In [31]:
# Show the first two and the last two rows of the merged table.
display(tmdb_data_all.iloc[:2], tmdb_data_all.iloc[-2:])

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
1,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,


Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
56153,tt9893160,0.0,/jX5XGqJUTzvpta2RjcX6pMZqxk5.jpg,,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 80, 'n...",,606303.0,en,No Way Out,...,0.0,89.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Find what you love and let it kill you.,No Way Out,0.0,4.944,9.0,
56154,tt9904648,0.0,/ibWj2ARVrAabIWDq5vSXoJRZfVV.jpg,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",https://contrastmovie.com/,933557.0,en,The Contrast,...,0.0,82.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Everybody loves... somebody else.,The Contrast,0.0,3.0,1.0,


In [32]:
# Write the merged TMDB API data to a single gzip-compressed csv,
# leaving the dataframe index out of the file.
tmdb_data_all.to_csv(
    "Data/tmdb_data_all.csv.gz",
    index=False,
    compression="gzip",
)