# Add movie release date to existing dataframe
Release dates are merged to the already existing file "df_clean_v1.gzip", that means
only release dates for processable movies are fetched.

In [8]:
import pandas as pd
import numpy as np
import tmdbsimple as tmdb
import json
import requests
import tqdm
import re
import multiprocessing
import glob
import fastparquet as fp
import sys
import src.helper.helper as hlp
import src.helper.secret as sec

# Helper functions

In [9]:
def load_movie_details(df):
    for idx, row in df.iterrows():
        p_rd = get_movie_data(row["id"])
        df.loc[idx, "release_date"] = p_rd
    return df

def get_movie_data(mid: int):
    try:
        m = tmdb.Movies(mid).info()
    except Exception as e:
        print(f"Error loading movie {mid}, Exception: {sys.exc_info()}")
        return None, None

    # Get poster url
    if m['release_date'] != None:
        try:
            p_rd = pd.Timestamp(tmdb.Movies(mid).info()['release_date'])
        except Exception as e:
            print(f"No valid Timestamp for movie {mid}, release date set to None")
            p_rd = None
    else:
        print(f"Release date not set for movie {mid}")
        p_rd = None

    return p_rd


def parallelize_dataframe(df, func):
    num_cores = multiprocessing.cpu_count()-1  #leave one free to not freeze machine
    num_partitions = num_cores #number of partitions to split dataframe
    df_split = np.array_split(df, num_partitions)
    pool = multiprocessing.Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

## Example movie details dataset

In [10]:
example_movie_details = {'adult': False,
 'backdrop_path': '/ELsTifJ2lu4vsMhoHeZ5EnncHw.jpg',
 'belongs_to_collection': None,
 'budget': 70000000,
 'genres': [{'id': 35, 'name': 'Comedy'},
  {'id': 14, 'name': 'Fantasy'},
  {'id': 878, 'name': 'Science Fiction'}],
 'homepage': 'https://www.warnerbros.com/movies/mars-attacks/',
 'id': 75,
 'imdb_id': 'tt0116996',
 'original_language': 'en',
 'original_title': 'Mars Attacks!',
 'overview': "'We come in peace' is not what those green men from Mars mean when they invade our planet, armed with irresistible weapons and a cruel sense of humor.  This star studded cast must play victim to the alien’s fun and games in this comedy homage to science fiction films of the '50s and '60s.",
 'popularity': 21.773,
 'poster_path': '/hll4O5vSAfnZDb6JbnP06GPtz7b.jpg',
 'production_companies': [{'id': 8601,
   'logo_path': None,
   'name': 'Tim Burton Productions',
   'origin_country': ''},
  {'id': 174,
   'logo_path': '/ky0xOc5OrhzkZ1N6KyUxacfQsCk.png',
   'name': 'Warner Bros. Pictures',
   'origin_country': 'US'}],
 'production_countries': [{'iso_3166_1': 'US',
   'name': 'United States of America'}],
 'release_date': '1996-12-12',
 'revenue': 101371017,
 'runtime': 106,
 'spoken_languages': [{'english_name': 'English',
   'iso_639_1': 'en',
   'name': 'English'},
  {'english_name': 'French', 'iso_639_1': 'fr', 'name': 'Français'}],
 'status': 'Released',
 'tagline': "Nice planet. We'll take it!",
 'title': 'Mars Attacks!',
 'video': False,
 'vote_average': 6.3,
 'vote_count': 3948}

# Config

In [16]:
MOVIE_URI = "https://www.themoviedb.org/movie/"

FILE_DIR = "./"
DATA_DIR_RAW = FILE_DIR + "../data/raw/"
DATA_DIR_INTERIM = FILE_DIR + "../data/interim/"

VERSION_ID_OUT = "v2"
FILE_PATH_DF_IN = DATA_DIR_INTERIM + "df_cleaned_v1.gzip"
FILE_PATH_DF_OUT = DATA_DIR_INTERIM + f"df_cleaned_{VERSION_ID_OUT}.gzip"

# Init

In [12]:
# Open TDB session
tmdb.API_KEY = sec.TMDB_API_KEY
tmdb.REQUESTS_SESSION = requests.Session()

# Display options
pd.set_option('display.max_colwidth', None)

# Processing

## Load dataframe to add the release date

In [6]:
# Load dataframe to change
df = pd.read_parquet(FILE_PATH_DF_IN)

## Fetch and set release date

In [None]:
# Add release date

l_df = []
split_size = 1000
start_count = 0
end_count = (len(df) // split_size) + 1

for i in tqdm.tqdm(range(start_count, end_count)):
    s = i * split_size
    e = ((i + 1) * split_size) - 1
    if e > len(df):
        e = len(df) - 1

    df_tmp = df[s:e++1].copy()
    df_tmp = parallelize_dataframe(df_tmp, load_movie_details)
    l_df.append(df_tmp)
    df_tmp.to_parquet(DATA_DIR_RAW + f'df_{s}_{e}_{VERSION_ID_OUT}.gzip', compression='gzip')

# Put it together
df = pd.concat(l_df)
df.isna().sum()
df.head()

## Merge files to one file

In [18]:
# Load all existing files and merge them
l_df = []
l_file = glob.glob(DATA_DIR_RAW + f"df_*_{VERSION_ID_OUT}.gzip")
for file_path in l_file:
    pf = fp.ParquetFile(file_path)
    l_df.append(pf.to_pandas())
df = pd.concat(l_df)

# Save into one flea
df.to_parquet(FILE_PATH_DF_OUT, compression='gzip')

In [20]:
df.head()

Unnamed: 0,adult,id,original_title,popularity,video,url,poster_url,genre_id,poster_exists,genre_ids2,genre_id_count,release_date
465871,False,649583,The Grass Ceiling,0.6,False,https://www.themoviedb.org/movie/649583,https://www.themoviedb.org/t/p/w500//koxKUUCOE15D6cd2IW1CBnqfJu1.jpg,[99],True,[Documentary],1,2019-11-16
465872,False,649584,Alleycats,1.029,False,https://www.themoviedb.org/movie/649584,https://www.themoviedb.org/t/p/w500//e7u2toV4ZqKszvVY9I3bUmJMgma.jpg,"[28, 16]",True,"[Action,Animation]",2,2017-11-15
465874,False,649586,Качалка,0.6,False,https://www.themoviedb.org/movie/649586,https://www.themoviedb.org/t/p/w500//b3w7FK12ahYSpAWqzsZJip1WYNf.jpg,[99],True,[Documentary],1,2020-06-17
465875,False,649587,Christy,0.6,False,https://www.themoviedb.org/movie/649587,https://www.themoviedb.org/t/p/w500//p7XaE0Wh7HLaxbtAu1KCWvCGvjl.jpg,[18],True,[Drama],1,2019-11-16
465876,False,649588,Rubinstein in Concert,0.6,True,https://www.themoviedb.org/movie/649588,https://www.themoviedb.org/t/p/w500//6zY6k4PcF5DMzsSTpdQO9XnCFiV.jpg,[10402],True,[Music],1,1973-06-01
