In [1]:
import time

import numpy as np
import pandas as pd

In [2]:
# Location of the title.basics.tsv.gz file
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
# Read the tab-separated file straight from the URL; low_memory=False turns
# off chunked parsing so each column gets one consistent dtype.
basics = pd.read_csv(
    basics_url,
    delimiter="\t",
    low_memory=False,
)

KeyboardInterrupt: 

In [None]:
# Preview the first rows of the loaded basics table
basics.head()

In [None]:
# Location of the title.ratings.tsv.gz file
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
# Read the tab-separated ratings file straight from the URL
ratings = pd.read_csv(
    ratings_url,
    delimiter="\t",
    low_memory=False,
)

In [None]:
# Preview the first rows of the loaded ratings table
ratings.head()

In [None]:
# Location of the title.akas.tsv.gz file
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
# Read the tab-separated akas file straight from the URL
akas = pd.read_csv(
    akas_url,
    delimiter="\t",
    low_memory=False,
)

In [None]:
# Preview the first rows of the loaded akas table
akas.head()

## Replacing `\N` with NaN

In [None]:
# Swap every literal "\N" value in basics for NaN (the frame is modified in place)
basics.replace(to_replace="\\N", value=np.nan, inplace=True)

In [None]:
# Confirm the replacement by previewing the first rows of basics
basics.head()

In [None]:
# Swap every literal "\N" value in ratings for NaN (the frame is modified in place)
ratings.replace(to_replace="\\N", value=np.nan, inplace=True)

In [None]:
# Swap every literal "\N" value in akas for NaN (the frame is modified in place)
akas.replace(to_replace="\\N", value=np.nan, inplace=True)

In [None]:
# Confirm the replacement by previewing the first rows of akas
akas.head()

## Basics Processing

In [None]:
# Preview basics before the filtering steps below
basics.head()

In [None]:
# Remove, in place, every row whose runtimeMinutes is missing
basics.dropna(axis=0, how="any", subset=["runtimeMinutes"], inplace=True)

In [None]:
# Confirm the drop: number of runtimeMinutes values that are still missing
basics["runtimeMinutes"].isna().sum()

In [None]:
# Remove, in place, every row whose genres value is missing
basics.dropna(axis=0, how="any", subset=["genres"], inplace=True)

In [None]:
# Confirm the drop: number of genres values that are still missing
basics["genres"].isna().sum()

In [None]:
# Count how many rows there are of each titleType
basics["titleType"].value_counts()

In [None]:
# Keep only the rows whose titleType is exactly "movie"
basics = basics[basics["titleType"].eq("movie")]

In [None]:
# Confirm the titleType filter by previewing the first rows
basics.head()

In [None]:
# Keep movies with a startYear from 2000 through 2021 (inclusive). 2022 itself
# is excluded, which is what the previous bounds (> "1999" and < "2022") did.
# The years are converted to numbers first so the range check does not depend
# on string ordering; values that cannot be parsed become NaN and are dropped.
basics = basics.loc[
    pd.to_numeric(basics["startYear"], errors="coerce").between(2000, 2021)
]

In [None]:
# Confirm the startYear filter by previewing the first rows
basics.head()

In [None]:
# Flag rows whose genres text contains "documentary" (case-insensitive),
# then keep only the rows that are NOT flagged.
is_doc = basics["genres"].str.contains("documentary", case=False)
basics = basics.loc[~is_doc]

In [None]:
# Confirm the documentary filter by previewing the first rows
basics.head()

## AKAS Processing

In [None]:
# Preview akas before filtering it by region
akas.head()

In [None]:
# Keep only the akas rows whose region is exactly "US"
akas = akas[akas["region"].eq("US")]

In [None]:
# Confirm the region filter by previewing the first rows
akas.head()

## Filtering dataframes

In [None]:
# Build a boolean mask over basics: True where the row's tconst also appears
# as a titleId in the (already US-only) akas dataframe.
keepers = basics["tconst"].isin(akas["titleId"])
# Display the mask
keepers

In [None]:
# Apply the keepers mask so basics retains only the matching rows
basics = basics.loc[keepers]
# Display the filtered frame
basics

In [None]:
# Summarize the columns, dtypes and non-null counts of the filtered basics
basics.info()

In [None]:
# Summarize the columns, dtypes and non-null counts of ratings
ratings.info()

In [None]:
# Summarize the columns, dtypes and non-null counts of the US-only akas
akas.info()

In [None]:
# Create the Data/ output folder with os (no error if it already exists)
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm the folder exists by listing its contents
os.listdir("Data/")

In [None]:
# Write the filtered basics table to a gzip-compressed CSV, without the index
basics.to_csv(
    "Data/title_basics.csv.gz",
    index=False,
    compression="gzip",
)

In [None]:
# Confirm the save by reading the file back; note this rebinds `basics` to
# the reloaded copy.
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

In [None]:
# Write the ratings table to a gzip-compressed CSV, without the index
ratings.to_csv(
    "Data/title_ratings.csv.gz",
    index=False,
    compression="gzip",
)

In [None]:
# Confirm the save by reading the file back; note this rebinds `ratings` to
# the reloaded copy.
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory = False)
ratings.head()

In [None]:
# Write the US-only akas table to a gzip-compressed CSV, without the index
akas.to_csv(
    "Data/title_akas.csv.gz",
    index=False,
    compression="gzip",
)

In [None]:
# Confirm the save by reading the file back; note this rebinds `akas` to
# the reloaded copy.
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory = False)
akas.head()

## Project 3 - Part 2

In [None]:
# Install tmbdsimple 
!pip install tmdbsimple

In [None]:
import json
import os

# Load the TMDB credentials from ~/.secret/tmdb_api.json. The home directory is
# resolved at run time so the path is not tied to one user name.
with open(os.path.expanduser("~/.secret/tmdb_api.json"), "r") as f:
    login = json.load(f)
# Display only the key names of the dict, not the stored values
login.keys()

In [None]:
import tmdbsimple as tmdb
# Set tmdbsimple's module-level API_KEY from the "api-key" entry of login
tmdb.API_KEY = login["api-key"]

In [None]:
# Build a movie object for id 603 with tmdb.Movies
movie = tmdb.Movies(603)

# Call the movie object's .info() method and display what it returns
info = movie.info()
info

In [None]:
# Movie with rating function
def get_movie_with_rating(movie_id):
    """Fetch a movie's info dict and attach its US certification.

    Args:
        movie_id: Identifier passed unchanged to ``tmdb.Movies``.

    Returns:
        The dict returned by ``movie.info()``. A ``"certification"`` key is
        added when the releases data has at least one entry whose
        ``iso_3166_1`` is ``"US"``; otherwise the dict is returned as-is.
        When several US entries exist, the last non-empty certification is
        used, and an empty string is stored only if every US entry is empty.
    """
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    # Save the .info() and .releases() results
    movie_info = movie.info()
    releases = movie.releases()
    # Loop through the countries in releases, looking only at US entries
    for c in releases["countries"]:
        if c["iso_3166_1"] != "US":
            continue
        certification = c["certification"]
        # A later US entry with an empty certification must not wipe out a
        # real certification that an earlier US entry already provided.
        if certification or not movie_info.get("certification"):
            movie_info["certification"] = certification
    return movie_info

In [None]:
# Try the function on one id (passed here as a string) and display the result
get_movie_with_rating("603")

In [None]:
import os 
from tqdm.notebook import tqdm_notebook
# Folder used for the data files; create it if it is missing
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok = True)
# List the files currently in the folder
os.listdir(FOLDER)

In [None]:
def write_json(new_data, filename):
    """Add ``new_data`` to the JSON list stored in ``filename``.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Args:
        new_data: Record(s) to add. When both ``new_data`` and the data
            already in the file are lists, the file's list is extended with
            the items of ``new_data``; otherwise ``new_data`` is appended as
            a single element.
        filename: Path of the JSON file. It is opened in "r+" mode, so it
            must already exist and contain valid JSON.

    Raises:
        TypeError: If the combined data cannot be serialized to JSON. The
            file is left unchanged in that case.
    """
    with open(filename, "r+") as file:
        # First load the existing data
        file_data = json.load(file)
        # Choose extend (list into list) or append (anything else)
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Serialize BEFORE touching the file: if new_data is not JSON
        # serializable the error is raised here and the file stays intact,
        # instead of being left with half-written JSON.
        serialized = json.dumps(file_data)
        # Rewind, overwrite from the start, and cut off anything left over
        file.seek(0)
        file.write(serialized)
        file.truncate()

In [None]:
# Load the basics table saved in part 1 and preview its first rows
basics = pd.read_csv("Data/title_basics.csv.gz")
basics.head()

In [None]:
# startYear values whose movies will be processed by the loop over YEARS_TO_GET
YEARS_TO_GET = [2000,2001]

In [None]:
# Pick the first year in the list and display it
YEAR = YEARS_TO_GET[0]
YEAR

In [None]:
# Start of outer loop: process one startYear at a time
for YEAR in tqdm_notebook(YEARS_TO_GET, desc = "YEARS",
                         position = 0):

    # JSON file that accumulates the API results for this year
    JSON_FILE = f"{FOLDER}tmdb_api_results_{YEAR}.json"
    # Check if the file exists
    file_exists = os.path.isfile(JSON_FILE)

    # If it does not exist, create it holding a list with one placeholder
    # record so that it can be loaded below and has an "imdb_id" column.
    # The placeholder stays in the file and therefore also in the final CSV.
    if not file_exists:
        with open(JSON_FILE,"w") as f:
            json.dump([{"imdb_id":0}],f)

    # Rows of basics for the current year
    df = basics.loc[basics["startYear"]==YEAR].copy()
    # Movie ids (tconst) for those rows
    movie_ids = df["tconst"].copy()

    # Load the results already stored in the JSON file
    previous_df = pd.read_json(JSON_FILE)

    # Filter out any ids that are already in the JSON file
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df["imdb_id"])]

    # (movie_id, error) pairs for ids that could not be retrieved or saved
    failed_ids = []

    # INNER loop: one API lookup per remaining movie id
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                 desc= f"Movies from{YEAR}",
                                 position = 1,
                                 leave = True):
        # Attempt to retrieve the data for the movie id and store it
        try:
            temp = get_movie_with_rating(movie_id) # uses pre-made function
            # Append/extend results to the existing file using pre-made function
            write_json(temp,JSON_FILE)
        # Best effort: a failing id is skipped, but it is recorded so that
        # the problem is visible instead of being silently discarded
        except Exception as e:
            failed_ids.append((movie_id, repr(e)))
        # Short 20 ms sleep to prevent overwhelming the server. It sits outside
        # the try block so an error in the sleep call itself cannot be hidden
        # by the handler above, and so failed requests are throttled as well.
        time.sleep(0.02)

    # Report how many ids were skipped for this year, with a sample error
    if failed_ids:
        print(f"{YEAR}: skipped {len(failed_ids)} of {len(movie_ids_to_get)} "
              f"movie ids; first error: {failed_ids[0][1]}")

    # Save the year's accumulated results as a compressed CSV
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data{YEAR}.csv.gz",compression = "gzip", index = False)