In [1]:
import json
import multiprocessing
import os
import re

import numpy as np
import pandas as pd
import requests
import tmdbsimple as tmdb
import tqdm

# Helper functions

In [2]:
def load_movie_details(df):
    """Add TMDB poster URL and genre ids to every row of ``df``.

    Calls ``get_movie_data`` once per value of the ``id`` column and stores the
    results in two columns: ``poster_url`` and ``genre_id``. The genre value is
    stringified with ``str``, so a missing genre list is stored as ``"None"``.

    The frame is modified in place and also returned, which lets the function
    be used as the worker passed to ``parallelize_dataframe``.
    """
    # Collect all results first and assign each column once. Writing cell by
    # cell with ``df.loc[idx, col] = ...`` is slow and would overwrite several
    # rows at once if the index contained duplicate labels.
    details = [get_movie_data(mid) for mid in df["id"]]
    df["poster_url"] = [poster_url for poster_url, _ in details]
    df["genre_id"] = [str(genre_ids) for _, genre_ids in details]
    return df


def get_movie_data(mid: int):
    """Fetch the poster URL and genre ids for one TMDB movie.

    Args:
        mid: TMDB movie id.

    Returns:
        Tuple ``(p_url, gl)``. ``p_url`` is ``POSTER_URI`` followed by the
        movie's ``poster_path``, or ``None`` when the movie has no poster.
        ``gl`` is the list of genre ids, or ``None`` when the response carries
        no genre list.
    """
    # One request per movie: the same response serves both poster and genres.
    m = tmdb.Movies(mid).info()

    # Get poster url
    poster_path = m.get('poster_path')
    p_url = f"{POSTER_URI}{poster_path}" if poster_path is not None else None

    # Get genres
    genres = m.get('genres')
    gl = [genre["id"] for genre in genres] if genres is not None else None

    return p_url, gl


def parallelize_dataframe(df, func):
    """Apply ``func`` to row-wise partitions of ``df`` in worker processes.

    Args:
        df: DataFrame to process.
        func: Callable taking a DataFrame partition and returning a DataFrame.
            It is sent to worker processes, so it must be picklable.

    Returns:
        The partition results concatenated in their original order.
    """
    # Leave one core free to not freeze the machine, but never drop below one
    # worker: on a single-core host ``cpu_count() - 1`` would be 0.
    num_cores = max(1, multiprocessing.cpu_count() - 1)
    num_partitions = num_cores  # number of partitions to split dataframe

    # Split row positions rather than the frame itself: ``np.array_split`` on a
    # DataFrame relies on ``DataFrame.swapaxes``, which pandas has deprecated.
    positions = np.array_split(np.arange(len(df)), num_partitions)
    df_split = [df.iloc[part] for part in positions]

    pool = multiprocessing.Pool(num_cores)
    try:
        results = pool.map(func, df_split)
    finally:
        # Release the workers even when a partition raised.
        pool.close()
        pool.join()
    return pd.concat(results)

# Config

In [3]:
# Base URLs on themoviedb.org; ids / image paths are appended to these.
MOVIE_URI = "https://www.themoviedb.org/movie/"
# Alternative poster size: "https://www.themoviedb.org/t/p/w600_and_h900_bestv2/"
POSTER_URI = "https://www.themoviedb.org/t/p/w500/"
LOGO_URI = "https://www.themoviedb.org/t/p/w500/"

# Local directory layout, all relative to the notebook's working directory.
FILE_DIR = "./"
DATA_DIR = f"{FILE_DIR}data/"
POSTER_DIR = f"{DATA_DIR}posters/"

# Init

In [4]:
# Open TMDB session.
# The API key is read from the TMDB_API_KEY environment variable so that the
# credential is not stored in the notebook. Any key that was ever committed
# together with this notebook should be treated as leaked and rotated.
api_key = os.environ.get("TMDB_API_KEY")
if not api_key:
    raise RuntimeError("Set the TMDB_API_KEY environment variable to your TMDB API key")
tmdb.API_KEY = api_key
# Share one HTTP session across all tmdbsimple requests.
tmdb.REQUESTS_SESSION = requests.Session()

# Display options: never truncate cell contents (URLs stay fully visible)
pd.set_option('display.max_colwidth', None)

# Preprocessing

## Load all movies from file

In [5]:
# Load movies from file: one JSON object per line.
movies = []
# The context manager closes the file handle; JSON text is read as UTF-8
# regardless of the platform's default encoding.
with open(DATA_DIR + 'tmdb_movie_ids_03_10_2021.json', 'r', encoding='utf-8') as movie_file:
    for line in movie_file:
        # Skip blank lines (e.g. a trailing newline) instead of failing to parse them.
        if line.strip():
            movies.append(json.loads(line))
df = pd.json_normalize(movies)

## Fetch additional movie information from TMDB

In [6]:
# Add movie url: MOVIE_URI followed by the movie's id
df["url"] = [f"{MOVIE_URI}{mid}" for mid in df["id"]]

In [None]:
# Add poster url and genres.
# The frame is processed in chunks of `split_size` rows. Every finished chunk is
# written to its own parquet file, and a chunk whose file already exists is
# loaded from disk instead of being fetched again, so an interrupted run can be
# resumed. Delete the chunk files to force a fresh download.
split_size = 10000
l_df = []
for s in tqdm.tqdm(range(0, len(df), split_size)):
    # Inclusive position of the last row in this chunk, clamped to the frame.
    e = min(s + split_size, len(df)) - 1
    chunk_path = DATA_DIR + f'df_{s}_{e}.parquet.gzip'

    if os.path.exists(chunk_path):
        df_tmp = pd.read_parquet(chunk_path)
    else:
        df_tmp = parallelize_dataframe(df.iloc[s:e + 1].copy(), load_movie_details)
        df_tmp.to_parquet(chunk_path, compression='gzip')
    l_df.append(df_tmp)

# Put it together (pd.concat raises on an empty list, so keep df as is then)
if l_df:
    df = pd.concat(l_df)
display(df.isna().sum())
df.head()

 42%|████▏     | 25/59 [1:20:38<1:55:00, 202.97s/it]

## Save movie dataframe to file

In [None]:
# Persist the enriched movie frame as a gzip-compressed parquet file
movies_parquet_path = DATA_DIR + 'df.parquet.gzip'
df.to_parquet(movies_parquet_path, compression='gzip')

In [None]:
# Disabled alternative: enrich the whole frame in one parallel pass instead of
# chunk by chunk. The code is wrapped in a string literal, so none of it runs.
# NOTE(review): `%time` on a line of its own presumably times nothing; the cell
# magic `%%time` is likely what was intended — confirm before re-enabling.
'''
%time
# Create poster url
df = parallelize_dataframe(df, load_movie_details)

display(df.isna().sum())
display(df.head())
'''

In [None]:
# Write the current movie frame to data/df.parquet.gzip (gzip-compressed parquet)
df.to_parquet(f"{DATA_DIR}df.parquet.gzip", compression="gzip")

In [None]:
# Spot-check the TMDB response (id, poster path, genre ids) for a few movies
mids = [75, 3924, 31975]
for mid in mids:
    try:
        # The request itself is inside the try block so that one failing id
        # does not abort the remaining checks.
        mi = tmdb.Movies(mid).info()
        print(mi['id'])
        print(mi['poster_path'])
        sample_genre_ids = [gid["id"] for gid in mi['genres']]
        print(",".join(str(gid) for gid in sample_genre_ids))
        print(sample_genre_ids)
        display(mi)
    except Exception as exc:
        # Report the problem instead of silently skipping the movie.
        print(f"Could not inspect movie {mid}: {exc!r}")


In [None]:
# Inspect the combined movie frame
df

In [None]:
# Fetch the TMDB movie genre list, collect its ids, then show the raw entries
d_genres = tmdb.Genres().movie_list()
genre_ids = [entry["id"] for entry in d_genres['genres']]
display(d_genres['genres'])

In [None]:
# Discover movies that belong to at least one of the known genres.
# `with_genres` expects a single string: ids joined with "|" are OR-ed, ids
# joined with "," are AND-ed. A Python list would be sent as repeated query
# parameters instead.
# NOTE: one call returns a single page of results; pass `page=` to get more.
tmdb.Discover().movie(with_genres="|".join(str(gid) for gid in genre_ids))