In [4]:
import pandas as pd
import numpy as np
import tmdbsimple as tmdb
import json
import requests
import tqdm
import re
import multiprocessing
import glob
import fastparquet as fp
import sys
import src.helper.helper as hlp

ModuleNotFoundError: No module named 'src'

# Helper functions

In [2]:
def load_movie_details(df):
    for idx, row in df.iterrows():
        purl, gl = get_movie_data(row["id"])
        df.loc[idx, "poster_url"] = purl
        df.loc[idx, "genre_id"] = str(gl)
    return df


def get_movie_data(mid: int):
    try:
        m = tmdb.Movies(mid).info()
    except Exception as e:
        print(f"Error loading movie {mid}, Exception: {sys.exc_info()}")
        return None, None

    # Get poster url
    if m['poster_path'] != None:
        p_url = f"{POSTER_URI}{tmdb.Movies(mid).info()['poster_path']}"
    else:
        p_url = None

    # Get genres
    if m['genres'] != None:
        gl = [gid["id"] for gid in m['genres']]
    else:
        gl = None

    return p_url, gl


def parallelize_dataframe(df, func):
    num_cores = multiprocessing.cpu_count()-1  #leave one free to not freeze machine
    num_partitions = num_cores #number of partitions to split dataframe
    df_split = np.array_split(df, num_partitions)
    pool = multiprocessing.Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

# Config

In [3]:
MOVIE_URI = "https://www.themoviedb.org/movie/"
#POSTER_URI = "https://www.themoviedb.org/t/p/w600_and_h900_bestv2/"
POSTER_URI = "https://www.themoviedb.org/t/p/w500/"
LOGO_URI = "https://www.themoviedb.org/t/p/w500/"

FILE_DIR = "./"
DATA_DIR = FILE_DIR + "../data/raw/"
POSTER_DIR = DATA_DIR + "posters/"

# Init

In [4]:
# Open TDB session
tmdb.API_KEY = '3df59d9cab79276062740d50a0fbe11a'
tmdb.REQUESTS_SESSION = requests.Session()

# Display options
pd.set_option('display.max_colwidth', None)

# Preprocessing

## Load all movies from file

In [5]:
# Load movies from file
movies = []
for line in open(DATA_DIR + 'tmdb_movie_ids_03_10_2021.json', 'r'):
    movies.append(json.loads(line))
df = pd.json_normalize(movies)

## Fetch additional movie information from TDB

In [6]:
# Add movie url
df["url"] = df["id"].map(lambda mid: f"{MOVIE_URI}{mid}")

In [7]:
# Add poster url and genres
l_df = []
split_size = 10000
start_count = 45
end_count = (len(df) // split_size) + 1

for i in tqdm.tqdm(range(start_count, end_count)):
    s = i * split_size
    e = ((i + 1) * split_size) - 1
    if e > len(df):
        e = len(df) - 1

    df_tmp = df[s:e++1].copy()
    df_tmp = parallelize_dataframe(df_tmp, load_movie_details)
    l_df.append(df_tmp)
    df_tmp.to_parquet(DATA_DIR + f'df_{s}_{e}.parquet.gzip', compression='gzip')

# Put it together
df = pd.concat(l_df)
df.isna().sum()
df.head()

  0%|          | 0/14 [00:00<?, ?it/s]

Error loading movie 631286, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/631286?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f511c796cd0>)


  7%|▋         | 1/14 [02:08<27:51, 128.57s/it]

Error loading movie 653930, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/653930?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f51110cb280>)
Error loading movie 646113, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/646113?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f51110efaf0>)


 29%|██▊       | 4/14 [12:41<30:14, 181.49s/it]

Error loading movie 686849, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/686849?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f5053143e60>)


 43%|████▎     | 6/14 [19:36<25:55, 194.38s/it]

Error loading movie 716622, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/716622?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f5052715f50>)
Error loading movie 716643, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/716643?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f5052718f50>)
Error loading movie 716645, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/716645?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f50527176e0>)
Error loading movie 716647, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/716647?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f5

 50%|█████     | 7/14 [23:06<23:14, 199.24s/it]

Error loading movie 721101, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/721101?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f5051b8b780>)


 64%|██████▍   | 9/14 [29:52<16:44, 200.95s/it]

Error loading movie 750024, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/750024?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f505147e4b0>)


 71%|███████▏  | 10/14 [33:09<13:18, 199.64s/it]

Error loading movie 758843, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/758843?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f50523be730>)


 79%|███████▊  | 11/14 [36:24<09:54, 198.32s/it]

Error loading movie 776424, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/776424?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f505200a960>)


 86%|████████▌ | 12/14 [39:39<06:34, 197.31s/it]

Error loading movie 792737, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/792737?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f5051ccd140>)
Error loading movie 784233, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/784233?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f5051cce690>)
Error loading movie 784244, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/784244?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f5051cce7d0>)
Error loading movie 788856, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/788856?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f5

 93%|█████████▎| 13/14 [42:51<03:15, 195.80s/it]

Error loading movie 794749, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/794749?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f5050fd6dc0>)
Error loading movie 799120, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/799120?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f5050fcb410>)
Error loading movie 803541, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/803541?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f5050f4f140>)
Error loading movie 803560, Exception: (<class 'requests.exceptions.HTTPError'>, HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/803560?api_key=3df59d9cab79276062740d50a0fbe11a'), <traceback object at 0x7f5

100%|██████████| 14/14 [45:22<00:00, 194.47s/it]


Unnamed: 0,adult,id,original_title,popularity,video,url,poster_url,genre_id
450000,False,629166,Puștiul,0.6,False,https://www.themoviedb.org/movie/629166,https://www.themoviedb.org/t/p/w500//dNAgdPkfEqrjNtb8WtenklP129Z.jpg,[18]
450001,False,629167,The Disconnect,0.6,False,https://www.themoviedb.org/movie/629167,,[]
450002,False,629169,Podkova,0.6,False,https://www.themoviedb.org/movie/629169,,[35]
450003,False,629170,An Act of Love,0.6,False,https://www.themoviedb.org/movie/629170,,[]
450004,False,629171,Konec milování,0.6,False,https://www.themoviedb.org/movie/629171,,[18]


## Merge files to one file

In [8]:
# Load all existing files and merge them
l_df = []
l_file = glob.glob(DATA_DIR + "df*.parquet.gzip")
for file_path in l_file:
    pf = fp.ParquetFile(file_path)
    l_df.append(pf.to_pandas())
df = pd.concat(l_df)

# Save into one fle
df.to_parquet(DATA_DIR + f'df.parquet.gzip', compression='gzip')

In [13]:
''' OLD CODE
# Load some moviees from db
mids = [75, 3924, 31975]
for mid in mids:
    mi = tmdb.Movies(mid).info()
    try:
        print(mi['id'])
        print(mi['poster_path'])
        print(",".join([str(gid["id"]) for gid in mi['genres']]))
        print([gid["id"] for gid in mi['genres']])
        display(mi)
    except Exception as e:
        pass

# Get all Generes for movies
d_genres = tmdb.Genres().movie_list()
display(d_genres['genres'])
genre_ids = [gid["id"] for gid in d_genres['genres']]

# Get all movies
tmdb.Discover().movie(with_genres=genre_ids)
'''

' OLD CODE\n# Load some moviees from db\nmids = [75, 3924, 31975]\nfor mid in mids:\n    mi = tmdb.Movies(mid).info()\n    try:\n        print(mi[\'id\'])\n        print(mi[\'poster_path\'])\n        print(",".join([str(gid["id"]) for gid in mi[\'genres\']]))\n        print([gid["id"] for gid in mi[\'genres\']])\n        display(mi)\n    except Exception as e:\n        pass\n\n# Get all Generes for movies\nd_genres = tmdb.Genres().movie_list()\ndisplay(d_genres[\'genres\'])\ngenre_ids = [gid["id"] for gid in d_genres[\'genres\']]\n\n# Get all movies\ntmdb.Discover().movie(with_genres=genre_ids)\n'