In [2]:
import requests
import shutil
import gzip
from datetime import date
import json
import getpass
from neo4j import GraphDatabase
import time

## Set up connection information

In [3]:
# Read the TMDB API key without echoing it. The explicit prompt text makes
# this prompt distinguishable from the Neo4j password prompt.
api_key = getpass.getpass("TMDB API key: ")

 ································


In [4]:
# Read the Neo4j password without echoing it. The explicit prompt text makes
# this prompt distinguishable from the TMDB API key prompt.
neo4j_password = getpass.getpass("Neo4j password: ")

 ····


In [5]:
# Name of the Neo4j database every session in this notebook is opened on.
neo4j_database_name = "movies"
# User name passed to the driver together with the password entered above.
neo4j_user = "tmdb"

In [6]:
# Single driver for the Neo4j instance at localhost:7687; the functions below
# open their sessions from it.
driver = GraphDatabase.driver("neo4j://localhost:7687", auth=(neo4j_user, neo4j_password))

## Create Neo4j constraints

In [6]:
# One node-key constraint on tmdbId per label. IF NOT EXISTS makes the cell
# safe to run again against a database that already has them.
constraint_statements = [
    "CREATE CONSTRAINT movie_tmdb_id_node_key IF NOT EXISTS FOR (m:Movie) REQUIRE m.tmdbId IS NODE KEY",
    "CREATE CONSTRAINT genre_tmdb_id_node_key IF NOT EXISTS FOR (g:Genre) REQUIRE g.tmdbId IS NODE KEY",
    "CREATE CONSTRAINT production_company_tmdb_id_node_key IF NOT EXISTS FOR (pc:ProductionCompany) REQUIRE pc.tmdbId IS NODE KEY",
    "CREATE CONSTRAINT person_tmdb_id_node_key IF NOT EXISTS FOR (p:Person) REQUIRE p.tmdbId IS NODE KEY",
]
with driver.session(database=neo4j_database_name) as session:
    for statement in constraint_statements:
        session.run(statement)

## Import a movie with credits, translations and watch providers

In [7]:
def call_movie_api(movie_id, api_type):
    """Fetch one TMDB v3 resource for a movie and return the decoded JSON.

    Args:
        movie_id: TMDB movie id; it is converted with ``str()``.
        api_type: ``"movie"`` requests the movie URL itself; any other value
            is appended to the movie URL as a sub-resource (e.g. ``"credits"``).

    Returns:
        The JSON-decoded response body.

    Raises:
        requests.HTTPError: if the response status is 4xx or 5xx.
        requests.RequestException: on connection failure or timeout.
    """
    base_movie_url = "https://api.themoviedb.org/3/movie/"
    api_url = requests.compat.urljoin(base_movie_url, str(movie_id))
    if api_type != "movie":
        # The trailing slash makes urljoin append instead of replacing the id.
        api_url = requests.compat.urljoin(api_url + '/', api_type)
    api_response = requests.get(
        api_url,
        params={'api_key': api_key, 'language': 'en-us'},
        # Without a timeout one stalled connection would block the whole import.
        timeout=30,
    )
    # Surface HTTP errors here instead of handing an error payload to callers
    # that expect movie fields.
    api_response.raise_for_status()
    return api_response.json()

In [8]:
def get_movie_json(movie_id):
    """Collect the TMDB responses for one movie, keyed by API name.

    Returns:
        A dict with the keys ``'movie'`` and ``'credits'``, each holding what
        ``call_movie_api`` returned for that API.
    """
    responses = {}
    for api_name in ('movie', 'credits'):
        responses[api_name] = call_movie_api(movie_id, api_name)
    return responses

In [9]:
def rename_keys(original_dict, rename_dict):
    """Rename keys of ``original_dict`` in place and return the same dict.

    Args:
        original_dict: dict to modify.
        rename_dict: mapping of existing key -> new key. Entries whose
            existing key is absent from ``original_dict`` are ignored.

    Returns:
        ``original_dict`` itself, after renaming.
    """
    for old_key, new_key in rename_dict.items():
        # Test membership, not truthiness: a key holding 0, '', False or None
        # must still be renamed.
        if old_key in original_dict:
            original_dict[new_key] = original_dict.pop(old_key)
    return original_dict

In [10]:
def add_movie_to_graph(movie_json):
    """Write a movie, its genres and its production companies to Neo4j.

    A movie is written only when it is non-adult, has a release date after
    1980-01-01, has original language 'en' and has status 'Released'.
    Anything else is skipped without touching the database.

    Args:
        movie_json: dict whose 'movie' entry holds the movie details.

    Returns:
        True if the movie was written, False if it was skipped.
    """
    movie = movie_json['movie']
    # The release date is compared as a plain string against '1980-01-01'.
    eligible = (movie['adult'] == False
                and movie['release_date'] > '1980-01-01'
                and movie['original_language'] == 'en'
                and movie['status'] == 'Released')
    if not eligible:
        return False

    copied_keys = ('adult', 'budget', 'title', 'release_date', 'revenue',
                   'runtime', 'status', 'tagline', 'video', 'original_language')
    props = rename_keys(
        {key: movie[key] for key in copied_keys},
        {'release_date': 'releaseDate', 'original_language': 'originalLanguage'},
    )
    props['source'] = 'TMDB'

    query_params = {
        'tmdbId': int(movie['id']),
        'props': props,
        'genres': movie.get('genres'),
        'productionCompanies': movie.get('production_companies'),
    }
    with driver.session(database=neo4j_database_name) as session:
        # One statement merges the movie, then each genre and production
        # company together with its relationship to the movie.
        session.run("""
            MERGE (m:Movie {tmdbId:$tmdbId}) SET m+=$props
            FOREACH(genre in $genres | MERGE (g:Genre {tmdbId:genre['id']}) 
            SET g.name = genre['name']
            MERGE (g)<-[:HAS_GENRE]-(m))
            FOREACH(comp in $productionCompanies | MERGE (pc:ProductionCompany {tmdbId:comp['id']})
            SET pc.name = comp['name'], 
            pc.originCountry = comp['origin_country'],
            pc.source = 'TMDB'
            MERGE (pc)-[:PRODUCED]->(m))""",
                    **query_params)
    return True
        

In [11]:
def add_cast_to_graph(movie_json):
    """Attach a movie's cast to its Movie node as ACTED_IN relationships.

    The query starts with a MATCH on the Movie node, so nothing is written
    when the movie is not already in the graph.

    Args:
        movie_json: dict with the movie details under 'movie' and the cast
            list under movie_json['credits']['cast'].

    Raises:
        KeyError: if a cast entry lacks one of the copied fields.
    """
    movie_tmdb_id = int(movie_json['movie']['id'])
    wanted_fields = ('id', 'gender', 'name', 'order', 'credit_id', 'character', 'cast_id')
    members = []
    for entry in movie_json['credits']['cast']:
        # Send only the fields the query reads.
        members.append({field: entry[field] for field in wanted_fields})

    query = """MATCH (m:Movie {tmdbId:$tmdbId})
        UNWIND $castMembers as c
        MERGE (p:Person {tmdbId:c['id']})
        SET
        p.gender = c['gender'],
        p.name = c['name']
        MERGE (p)-[r:ACTED_IN]->(m)
        SET
        r.order = c['order'],
        r.creditId = c['credit_id'],
        r.character = c['character'],
        r.tmdbId = c['cast_id']"""
    with driver.session(database=neo4j_database_name) as session:
        session.run(query, tmdbId=movie_tmdb_id, castMembers=members)

In [12]:
def add_crew_to_graph(movie_json):
    """Attach a movie's crew to its Movie node as WORKED_ON relationships.

    One relationship is kept per crew credit (keyed by ``creditId``), so a
    person with several jobs on the same movie keeps all of them. Merging on
    the bare relationship would let each later credit overwrite the previous
    department and job. The query starts with a MATCH on the Movie node, so
    nothing is written when the movie is not already in the graph.

    Args:
        movie_json: dict with the movie details under 'movie' and the crew
            list under movie_json['credits']['crew'].

    Raises:
        KeyError: if a crew entry lacks one of the copied fields.
    """
    tmdbId = int(movie_json['movie']['id'])
    crew_keys = ['id', 'gender', 'name', 'department', 'credit_id', 'job']
    crew_json = movie_json['credits']['crew']
    # Send only the fields the query reads.
    crew_properties = [{key: member_json[key] for key in crew_keys} for member_json in crew_json]
    with driver.session(database=neo4j_database_name) as session:
        # creditId is part of the MERGE pattern: re-running the import matches
        # the same relationship, while distinct credits stay distinct.
        # NOTE(review): assumes every crew entry has a non-null credit_id,
        # since MERGE rejects null property values - confirm against the data.
        session.run("""MATCH (m:Movie {tmdbId:$tmdbId})
        UNWIND $crewMembers as c
        MERGE (p:Person {tmdbId:c['id']})
        SET
        p.gender = c['gender'],
        p.name = c['name']
        MERGE (p)-[r:WORKED_ON {creditId:c['credit_id']}]->(m)
        SET
        r.department = c['department'],
        r.job = c['job']""",
                    tmdbId=tmdbId,
                    crewMembers=crew_properties)

In [13]:
def process_movie(movie_id):
    """Fetch one movie and write it, with its cast and crew, to the graph.

    Cast and crew are only written when ``add_movie_to_graph`` reports that
    the movie itself was loaded.

    Returns:
        The value returned by ``add_movie_to_graph`` for this movie.
    """
    payload = get_movie_json(movie_id)
    was_loaded = add_movie_to_graph(payload)
    if not was_loaded:
        return was_loaded
    add_cast_to_graph(payload)
    add_crew_to_graph(payload)
    return was_loaded

In [14]:
# The export file name carries today's date formatted as MM_DD_YYYY.
date_string = f"{date.today():%m_%d_%Y}"
url = "http://files.tmdb.org/p/exports/movie_ids_" + date_string + ".json.gz"

In [15]:
def download_file(url):
    """Stream ``url`` into a file in the working directory.

    The local file is named after the last path segment of ``url``.

    Returns:
        The local file name.

    Raises:
        requests.HTTPError: if the response status is 4xx or 5xx.
        requests.RequestException: on connection failure or timeout.
    """
    local_filename = url.split('/')[-1]
    with requests.get(url, stream=True, timeout=60) as r:
        # Fail before writing: otherwise an error page would be saved under
        # the export's name and only break later when it is decompressed.
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            # r.raw is copied as received, in chunks.
            shutil.copyfileobj(r.raw, f)
    return local_filename

In [16]:
# Download today's export; local_file is the name of the saved .json.gz file.
local_file = download_file(url)

In [17]:
def process_file(lines, start, end, prefix):
    """Import the movies listed on ``lines[start:end]`` of a TMDB id export.

    Args:
        lines: sequence of JSON documents (str or bytes), one per movie, each
            with ``id``, ``original_title`` and ``adult`` entries.
        start: index of the first line to process.
        end: index one past the last line to process; it may exceed
            ``len(lines)``.
        prefix: only titles starting with this string are processed; ``""``
            matches every title.

    Returns:
        The number of movies that were loaded into the graph.
    """
    loaded_count = 0
    for line_count, line in enumerate(lines[start:end], start=start):
        line_json = json.loads(line)
        # Adult entries are always skipped. The original
        # `title-match or prefix == "" and not adult` never applied the adult
        # test because `and` binds tighter than `or`.
        if line_json.get('adult') or not line_json['original_title'].startswith(prefix):
            continue

        movie_id = line_json['id']
        try:
            loaded = process_movie(movie_id)
        except Exception:
            # Exception, not a bare except, so Ctrl-C / kernel interrupt can
            # still stop a long import.
            print(f"Failed to process movie {movie_id}. Retrying.")
            time.sleep(3)
            try:
                loaded = process_movie(movie_id)
            except Exception as error:
                # Best effort: report the failure and carry on with the rest.
                print(f"Giving up on movie {movie_id}: {error!r}")
                continue

        if loaded:
            loaded_count += 1
            if loaded_count % 100 == 0:
                print(f"Loaded {loaded_count} more movies up to line {line_count}.")
                print(f"Latest film was {line_json['original_title']} (id {movie_id})")
    return loaded_count

In [18]:
# Load every line of the export into memory, then report how many there are.
with gzip.open(local_file, 'rb') as f:
    lines = f.readlines()
print(len(lines))

754676


In [31]:
# Reload the export and show the movie id stored at index 192000.
with gzip.open(local_file, 'rb') as f:
    lines = f.readlines()
print(json.loads(lines[192000])['id'])

308584


In [32]:
# Reload the export, then import from index 192000 up to 754438 with no
# title filter. The file is closed before the import starts.
with gzip.open(local_file, 'rb') as f:
    lines = f.readlines()
process_file(lines, 192000, 754438, "")

Loaded 100 more movies up to line 192233.
Latest film was People (id 308868)
Loaded 200 more movies up to line 192452.
Latest film was Grainger's World: Yindi: The Last Koala? (id 309135)
Loaded 300 more movies up to line 192613.
Latest film was Take Care (id 309313)
Loaded 400 more movies up to line 192850.
Latest film was Revenge in Olympia (id 309585)
Loaded 500 more movies up to line 193042.
Latest film was Ext. Life (id 309785)
Loaded 600 more movies up to line 193235.
Latest film was America's Greatest Battles (id 309994)
Loaded 700 more movies up to line 193409.
Latest film was Ahead: The Movie (id 310203)
Loaded 800 more movies up to line 193636.
Latest film was Vampz (id 310451)
Loaded 900 more movies up to line 193901.
Latest film was Once a Thief: Family Business (id 310740)
Loaded 1000 more movies up to line 194150.
Latest film was R.E.M.: In View 1988-2003 (The Best of R.E.M.) (id 311018)
Loaded 1100 more movies up to line 194471.
Latest film was The Best of Breckenridge S

In [None]:
# Reload the export, then import from index 30000 with no title filter.
# The end index 1000000 exceeds the 754676 lines reported above, so the
# slice runs to the end of the file.
with gzip.open(local_file, 'rb') as f:
    lines = f.readlines()
process_file(lines, 30000, 1000000, "")