In [1]:
import pandas as pd
import numpy as np
import tmdbsimple as tmdb
import json
import requests
import tqdm
import re
import multiprocessing
import sys

# Helper functions

In [2]:
def load_movie_details(df):
    for idx, row in df.iterrows():
        purl, gl = get_movie_data(row["id"])
        df.loc[idx, "poster_url"] = purl
        df.loc[idx, "genre_id"] = str(gl)
    return df


def get_movie_data(mid: int):
    try:
        m = tmdb.Movies(mid).info()
    except Exception as e:
        print(f"Error loading movie {mid}, Exception: {sys.exc_info()}")
        return None, None

    # Get poster url
    if m['poster_path'] != None:
        p_url = f"{POSTER_URI}{tmdb.Movies(mid).info()['poster_path']}"
    else:
        p_url = None

    # Get genres
    if m['genres'] != None:
        gl = [gid["id"] for gid in m['genres']]
    else:
        gl = None

    return p_url, gl


def parallelize_dataframe(df, func):
    num_cores = multiprocessing.cpu_count()-1  #leave one free to not freeze machine
    num_partitions = num_cores #number of partitions to split dataframe
    df_split = np.array_split(df, num_partitions)
    pool = multiprocessing.Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

# Config

In [3]:
MOVIE_URI = "https://www.themoviedb.org/movie/"
#POSTER_URI = "https://www.themoviedb.org/t/p/w600_and_h900_bestv2/"
POSTER_URI = "https://www.themoviedb.org/t/p/w500/"
LOGO_URI = "https://www.themoviedb.org/t/p/w500/"

FILE_DIR = "./"
DATA_DIR = FILE_DIR + "../data/raw/"
POSTER_DIR = DATA_DIR + "posters/"

# Init

In [4]:
# Open TDB session
tmdb.API_KEY = '3df59d9cab79276062740d50a0fbe11a'
tmdb.REQUESTS_SESSION = requests.Session()

# Display options
pd.set_option('display.max_colwidth', None)

# Preprocessing

## Load all movies from file

In [5]:
# Load movies from file
movies = []
for line in open(DATA_DIR + 'tmdb_movie_ids_03_10_2021.json', 'r'):
    movies.append(json.loads(line))
df = pd.json_normalize(movies)

## Fetch additional movie information from TDB

In [6]:
# Add movie url
df["url"] = df["id"].map(lambda mid: f"{MOVIE_URI}{mid}")

In [8]:
# Add poster url and genres
l_df = []
split_size = 10000
start_count = 30
end_count = (len(df) // split_size) + 1

for i in tqdm.tqdm(range(start_count, end_count)):
    s = i * split_size
    e = ((i + 1) * split_size) - 1
    if e > len(df):
        e = len(df) - 1

    df_tmp = df[s:e++1].copy()
    df_tmp = parallelize_dataframe(df_tmp, load_movie_details)
    l_df.append(df_tmp)
    df_tmp.to_parquet(DATA_DIR + f'df_{s}_{e}.parquet.gzip', compression='gzip')

# Put it together
df = pd.concat(l_df)
df.isna().sum()
df.head()

  0%|          | 0/29 [00:00<?, ?it/s]Process ForkPoolWorker-2:
Process ForkPoolWorker-7:
Process ForkPoolWorker-6:
Process ForkPoolWorker-5:
Process ForkPoolWorker-4:
Process ForkPoolWorker-3:
Process ForkPoolWorker-1:
Process ForkPoolWorker-8:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ths/miniconda3/envs/aida/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ths/miniconda3/envs/aida/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/ths/miniconda3/envs/aida/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()


KeyboardInterrupt: 

  File "/home/ths/miniconda3/envs/aida/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ths/miniconda3/envs/aida/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/ths/miniconda3/envs/aida/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/ths/miniconda3/envs/aida/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ths/miniconda3/envs/aida/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ths/miniconda3/envs/aida/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/ths/miniconda3/envs/aida/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent cal

## Save movie files to dataframe to file

In [None]:
df.to_parquet(DATA_DIR + f'df.parquet.gzip', compression='gzip')

In [None]:
'''
%time
# Create poster url
df = parallelize_dataframe(df, load_movie_details)

display(df.isna().sum())
display(df.head())
'''

In [None]:
df.to_parquet(DATA_DIR + 'df.parquet.gzip', compression='gzip')

In [22]:
# Load Genres from db
mids = [75, 3924, 31975]
for mid in mids:
    mi = tmdb.Movies(mid).info()
    try:
        print(mi['id'])
        print(mi['poster_path'])
        print(",".join([str(gid["id"]) for gid in mi['genres']]))
        print([gid["id"] for gid in mi['genres']])
        display(mi)
    except Exception as e:
        pass


75
/hll4O5vSAfnZDb6JbnP06GPtz7b.jpg
35,14,878
[35, 14, 878]


{'adult': False,
 'backdrop_path': '/ELsTifJ2lu4vsMhoHeZ5EnncHw.jpg',
 'belongs_to_collection': None,
 'budget': 70000000,
 'genres': [{'id': 35, 'name': 'Comedy'},
  {'id': 14, 'name': 'Fantasy'},
  {'id': 878, 'name': 'Science Fiction'}],
 'homepage': 'https://www.warnerbros.com/movies/mars-attacks/',
 'id': 75,
 'imdb_id': 'tt0116996',
 'original_language': 'en',
 'original_title': 'Mars Attacks!',
 'overview': "'We come in peace' is not what those green men from Mars mean when they invade our planet, armed with irresistible weapons and a cruel sense of humor.  This star studded cast must play victim to the alien’s fun and games in this comedy homage to science fiction films of the '50s and '60s.",
 'popularity': 13.994,
 'poster_path': '/hll4O5vSAfnZDb6JbnP06GPtz7b.jpg',
 'production_companies': [{'id': 8601,
   'logo_path': None,
   'name': 'Tim Burton Productions',
   'origin_country': ''},
  {'id': 174,
   'logo_path': '/ky0xOc5OrhzkZ1N6KyUxacfQsCk.png',
   'name': 'Warner Bros.

3924
/o6UMTE2LzQdlKVxRnFECPmtQjsJ.jpg
35
[35]


{'adult': False,
 'backdrop_path': '/dvQj1GBZAZirz1skEEZyWH2ZqQP.jpg',
 'belongs_to_collection': {'id': 177062,
  'name': 'Blondie Collection',
  'poster_path': '/sjF7S5JK4FKr6mz8tt70LUen1hF.jpg',
  'backdrop_path': '/qMr0HqAhfYmIkSVs6W5xuxwEIbR.jpg'},
 'budget': 0,
 'genres': [{'id': 35, 'name': 'Comedy'}],
 'homepage': '',
 'id': 3924,
 'imdb_id': 'tt0029927',
 'original_language': 'en',
 'original_title': 'Blondie',
 'overview': 'Blondie and Dagwood are about to celebrate their fifth wedding anniversary but this happy occasion is marred when the bumbling Dagwood gets himself involved in a scheme that is promising financial ruin for the Bumstead family.',
 'popularity': 3.855,
 'poster_path': '/o6UMTE2LzQdlKVxRnFECPmtQjsJ.jpg',
 'production_companies': [{'id': 5,
   'logo_path': '/71BqEFAF4V3qjjMPCpLuyJFB9A.png',
   'name': 'Columbia Pictures',
   'origin_country': 'US'}],
 'production_countries': [{'iso_3166_1': 'US',
   'name': 'United States of America'}],
 'release_date': '1938-1

31975
None
10751
[10751]


{'adult': False,
 'backdrop_path': None,
 'belongs_to_collection': None,
 'budget': 0,
 'genres': [{'id': 10751, 'name': 'Family'}],
 'homepage': '',
 'id': 31975,
 'imdb_id': 'tt1656746',
 'original_language': 'en',
 'original_title': 'Sesame Street: Elmo Loves You!',
 'overview': 'Elmo is making a very, very super special surprise card for someone Elmo really loves but a gust of wind blows it away! Rosita joins Elmo to chase after the card but they can\'t reach it. Can Super Grover use his heroic skills to save the day? Will Elmo be able to give his surprise card to someone very special? Join Elmo, Rosita and Grover as they learn about love, emotions and friendship in this love-ly tale featuring the new song "Elmo Loves You" and a special music video about signing "I Love You".',
 'popularity': 1.699,
 'poster_path': None,
 'production_companies': [],
 'production_countries': [],
 'release_date': '2010-01-05',
 'revenue': 0,
 'runtime': 46,
 'spoken_languages': [],
 'status': 'Releas

In [None]:
df

In [None]:
# Get all Generes for movies
d_genres = tmdb.Genres().movie_list()
display(d_genres['genres'])
genre_ids = [gid["id"] for gid in d_genres['genres']]

In [None]:
# Get all movies
tmdb.Discover().movie(with_genres=genre_ids)