# Add movie release date to existing dataframe

In [1]:
import pandas as pd
import numpy as np
import tmdbsimple as tmdb
import json
import requests
import tqdm
import re
#import multiprocessing
import ray
import glob
import fastparquet as fp
import sys
import src.helper.helper as hlp
import src.helper.secret as sec

In /home/ths/miniconda3/envs/aida/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/ths/miniconda3/envs/aida/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/ths/miniconda3/envs/aida/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In /home/ths/miniconda3/envs/aida/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/ths/miniconda3/envs

# Helper functions

In [2]:
def load_movie_details(df):
    for idx, row in df.iterrows():
        p_rd = get_movie_data(row["id"])
        df.loc[idx, "release_date"] = p_rd
    return df

def get_movie_data(mid: int):
    try:
        m = tmdb.Movies(mid).info()
    except Exception as e:
        print(f"Error loading movie {mid}, Exception: {sys.exc_info()}")
        return None, None

    # Get poster url
    if m['release_date'] != None:
        try:
            p_rd = pd.Timestamp(tmdb.Movies(mid).info()['release_date'])
        except Exception as e:
            print(f"No valid Timestamp for movie {mid}, release date set to None")
            p_rd = None
    else:
        print(f"Release date not set for movie {mid}")
        p_rd = None

    return p_rd


def parallelize_dataframe(df, func):
    num_cores = multiprocessing.cpu_count()-1  #leave one free to not freeze machine
    num_partitions = num_cores #number of partitions to split dataframe
    df_split = np.array_split(df, num_partitions)
    pool = multiprocessing.Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

## Example movie details dataset

In [3]:
example_movie_details = {'adult': False,
 'backdrop_path': '/ELsTifJ2lu4vsMhoHeZ5EnncHw.jpg',
 'belongs_to_collection': None,
 'budget': 70000000,
 'genres': [{'id': 35, 'name': 'Comedy'},
  {'id': 14, 'name': 'Fantasy'},
  {'id': 878, 'name': 'Science Fiction'}],
 'homepage': 'https://www.warnerbros.com/movies/mars-attacks/',
 'id': 75,
 'imdb_id': 'tt0116996',
 'original_language': 'en',
 'original_title': 'Mars Attacks!',
 'overview': "'We come in peace' is not what those green men from Mars mean when they invade our planet, armed with irresistible weapons and a cruel sense of humor.  This star studded cast must play victim to the alien’s fun and games in this comedy homage to science fiction films of the '50s and '60s.",
 'popularity': 21.773,
 'poster_path': '/hll4O5vSAfnZDb6JbnP06GPtz7b.jpg',
 'production_companies': [{'id': 8601,
   'logo_path': None,
   'name': 'Tim Burton Productions',
   'origin_country': ''},
  {'id': 174,
   'logo_path': '/ky0xOc5OrhzkZ1N6KyUxacfQsCk.png',
   'name': 'Warner Bros. Pictures',
   'origin_country': 'US'}],
 'production_countries': [{'iso_3166_1': 'US',
   'name': 'United States of America'}],
 'release_date': '1996-12-12',
 'revenue': 101371017,
 'runtime': 106,
 'spoken_languages': [{'english_name': 'English',
   'iso_639_1': 'en',
   'name': 'English'},
  {'english_name': 'French', 'iso_639_1': 'fr', 'name': 'Français'}],
 'status': 'Released',
 'tagline': "Nice planet. We'll take it!",
 'title': 'Mars Attacks!',
 'video': False,
 'vote_average': 6.3,
 'vote_count': 3948}

# Config

In [4]:
MOVIE_URI = "https://www.themoviedb.org/movie/"

FILE_DIR = "./"
DATA_DIR_RAW = FILE_DIR + "../data/raw/"
DATA_DIR_INTERIM = FILE_DIR + "../data/interim/"

VERSION_ID_OUT = "v2"
FILE_PATH_DF_IN = DATA_DIR_INTERIM + "df_cleaned_v1.gzip"
FILE_PATH_DF_OUT = DATA_DIR_INTERIM + f"df_cleaned_{VERSION_ID_OUT}.gzip"

# Init

In [5]:
# Open TDB session
tmdb.API_KEY = sec.TMDB_API_KEY
tmdb.REQUESTS_SESSION = requests.Session()

# Display options
pd.set_option('display.max_colwidth', None)

# Processing

## Load dataframe to add the release date

In [6]:
# Load dataframe to change
df = pd.read_parquet(FILE_PATH_DF_IN)

## Fetch and set release date

In [7]:
# Add release date 
#ray.init()

l_df = []
split_size = 1000
start_count = 0
end_count = (len(df) // split_size) + 1

for i in tqdm.tqdm(range(start_count, end_count)):
    s = i * split_size
    e = ((i + 1) * split_size) - 1
    if e > len(df):
        e = len(df) - 1

    df_tmp = df[s:e++1].copy()
    df_tmp = parallelize_dataframe(df_tmp, load_movie_details)
    l_df.append(df_tmp)
    df_tmp.to_parquet(DATA_DIR_RAW + f'df_{s}_{e}_{VERSION_ID_OUT}.gzip', compression='gzip')

# Put it together
df = pd.concat(l_df)
df.isna().sum()
df.head()

2021-03-15 12:56:17,695	INFO resource_spec.py:231 -- Starting Ray with 13.87 GiB memory available for workers and up to 6.94 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2021-03-15 12:56:18,147	INFO services.py:1193 -- View the Ray dashboard at [1m[32mlocalhost:8266[39m[22m
  0%|          | 0/29 [00:00<?, ?it/s]


AttributeError: 'ray._raylet.ObjectRef' object has no attribute 'to_parquet'

[2m[36m(pid=693695)[0m Error loading movie 537250, Exception: (<class 'tmdbsimple.base.APIKeyError'>, APIKeyError(), <traceback object at 0x7f1e16df6550>)


2021-03-15 12:56:24,649	ERROR worker.py:1074 -- Possible unhandled error from worker: [36mray::__main__.load_movie_details()[39m (pid=693695, ip=192.168.2.114)
  File "python/ray/_raylet.pyx", line 479, in ray._raylet.execute_task
  File "<ipython-input-2-c3439134cb78>", line 5, in load_movie_details
  File "/home/ths/miniconda3/envs/aida/lib/python3.7/site-packages/pandas/core/indexing.py", line 670, in __setitem__
    iloc._setitem_with_indexer(indexer, value)
  File "/home/ths/miniconda3/envs/aida/lib/python3.7/site-packages/pandas/core/indexing.py", line 1601, in _setitem_with_indexer
    self._setitem_with_indexer(new_indexer, value)
  File "/home/ths/miniconda3/envs/aida/lib/python3.7/site-packages/pandas/core/indexing.py", line 1667, in _setitem_with_indexer
    "cannot set using a multi-index "
ValueError: cannot set using a multi-index selection indexer with a different length than the value


## Merge files to one file

In [None]:
# Load all existing files and merge them
l_df = []
l_file = glob.glob(DATA_DIR_RAW + f"df*_{VERSION_ID_OUT}.gzip")
for file_path in l_file:
    pf = fp.ParquetFile(file_path)
    l_df.append(pf.to_pandas())
df = pd.concat(l_df)

# Save into one flea
df.to_parquet(FILE_PATH_DF_OUT, compression='gzip')

In [1]:
import ray
import time
ray.init()

2021-03-15 12:50:49,691	INFO resource_spec.py:231 -- Starting Ray with 14.11 GiB memory available for workers and up to 7.07 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2021-03-15 12:50:50,081	INFO services.py:1193 -- View the Ray dashboard at [1m[32mlocalhost:8266[39m[22m


{'node_ip_address': '192.168.2.114',
 'raylet_ip_address': '192.168.2.114',
 'redis_address': '192.168.2.114:6379',
 'object_store_address': '/tmp/ray/session_2021-03-15_12-50-49_690548_691453/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-03-15_12-50-49_690548_691453/sockets/raylet',
 'webui_url': 'localhost:8266',
 'session_dir': '/tmp/ray/session_2021-03-15_12-50-49_690548_691453'}

In [2]:
@ray.remote
def f(i):
    time.sleep(1)
    return i
futures = [f.remote(i) for i in range(40)]
print(ray.get(futures))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
