In [161]:
# import dependencies
import requests
import pandas as pd
import numpy as np
from config import api_key
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [162]:
# Import existing movie list
old_movie_list = pd.read_csv('resources/cleaned/movie_list_cleaned.csv')

#Import Add List
movie_list = pd.read_csv('resources/add_movies.csv')

In [163]:
# Create DF to hold tmdb data
tmdb_movie_list = pd.DataFrame()

# Loop through add list

for id in movie_list.movie_id:
    movie = pd.DataFrame.from_dict(requests.get(f'https://api.themoviedb.org/3/movie/{id}?api_key={api_key}&language=en-US').json(), orient='index').transpose()
    
    tmdb_movie_list = tmdb_movie_list.append(movie)


In [164]:
# Merge tmdb_movie_list with movie_lsit
movie_list = movie_list.merge(tmdb_movie_list, how='left', left_on=['movie_id'], right_on=['id'])
movie_list.drop(columns=['adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'original_title', 
                        'poster_path', 'release_date', 'production_companies', 'production_countries', 
                        'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 
                        'homepage', 'imdb_id', 'movie_id'], inplace=True)
movie_list = movie_list.loc[~(movie_list['Start Time'] == 0)]
movie_list.rename(columns={'genres':'genre_ids','id':'movie_id'},inplace=True)
movie_list.reset_index(drop=True, inplace= True)
movie_list.sort_values('Release date', inplace=True)
movie_list.sort_values('Start Time', inplace=True)
movie_list['movie_id'] = movie_list.movie_id.astype(int)


In [165]:
# Create list to transform genres
genre_name = []
genre_id = []

# Get genre names from genre_id column
for list in movie_list.genre_ids:
    holder_name = []
    holder_id = []
    for dict in list:
        holder_name.append(dict['name'])
        holder_id.append(dict['id'])
    genre_name.append(holder_name)
    genre_id.append(holder_id)

movie_list['genres'] = genre_name
movie_list['genre_ids'] = genre_id

In [166]:
# Get genre list from TMDB 
genre = requests.get(f'https://api.themoviedb.org/3/genre/movie/list?api_key={api_key}&language=en-US').json()['genres']
genre_df = pd.DataFrame(genre)

genre_df.set_index(genre_df['id'],inplace = True)
genre_df = genre_df.drop('id',1)

# Make a DF that lists genres for each movie_id
genre_series = []
movie_series = []

for i in range(len(movie_list)-1):
    try:
        genres = movie_list.loc[i, 'genres']
        genres_list = (','.join(genres))
        for genre in genre_df['name']:
            if genre in genres_list:
                id = movie_list.loc[i, 'movie_id']
                movie_series.append(id)
                genre_series.append(genre)
                i=i+1
    except:
        print(f'error at {i}')

movie_genres =  pd.DataFrame({'movie_id':movie_series, 'genre':genre_series})

error at 56


In [167]:
# Code from The-Final-Project_F-PALS was refractored for this project.
# The code below will make an API call based on the movie list to get the leading actors and directors of each film.

# Create list of movie ids that will be used to make API calls for additional information
id_list = movie_list.movie_id.tolist()

# Create blank DFs and lists
actors_df = pd.DataFrame()
directors_df = pd.DataFrame()
studio_df = pd.DataFrame()
actor_movie_id = []
director_movie_id = []
studio_movie_id = []

# Make API calls for movie_id to get the actors, directors, and studios for each film
for movie_id in id_list:
    movie_credits = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={api_key}&language=en-US').json()

    for actor in movie_credits['cast']:
        actors_df = actors_df.append(actor, ignore_index=True)
        actor_movie_id.append(movie_id)

    for director in movie_credits['crew']:
        if director['job'] == "Director":
            directors_df = directors_df.append(director, ignore_index=True)
            director_movie_id.append(movie_id)

    movie_studios = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}&language=en-US').json()
    for studio in movie_studios['production_companies']:
        studio_df = studio_df.append(studio, ignore_index=True)
        studio_movie_id.append(movie_id)

# Clean the new DFs
actors_df["movie_id"] = actor_movie_id
actors = actors_df.rename(columns = {"id": "actor_id",'popularity': 'actor_popularity'})
actors = actors[['name','actor_id','gender','character','actor_popularity','movie_id']]
actors = pd.merge(actors,movie_list[['movie_id', 'title']], on='movie_id', how='left')
actors_clean = actors.drop_duplicates(subset=['name','title'])

directors_df['movie_id'] = director_movie_id
directors = directors_df.rename(columns={'id': 'director_id', 'popularity': 'director_popularity'})
directors = directors[['name','director_id','gender','director_popularity','movie_id']]
directors = pd.merge(directors,movie_list[['movie_id', 'title']], on='movie_id', how='left')
directors_clean = directors.drop_duplicates(subset=['name','title'])

studio_df['movie_id'] = studio_movie_id
studio = studio_df.drop(columns=['logo_path'])
studio = studio.rename(columns={'id': 'studio_id', 'name':'studio_name'})
studio = pd.merge(studio,movie_list[['movie_id', 'title']], on='movie_id', how='left')

studio_group = studio.groupby('studio_name',sort = False)
studio_group = studio_group['title'].agg(lambda column: ", ".join(column))
studio_group = studio_group.reset_index(name='title')
studio_group = studio_group.rename(columns={'title':'title list'})
studio_clean = studio_group.merge(studio,on='studio_name', how='left')

# Add list of actors to movie_list
actors_group = actors_clean.groupby('movie_id',sort = False)
actor_lists = actors_group['name'].agg(lambda column: ", ".join(column))
actor_lists = actor_lists.reset_index(name='name')
movie_list = movie_list.merge(actor_lists,on='movie_id', how='left')

# Add list of movies to actors
actors_clean_drop = actors_clean.drop_duplicates(subset=['name','title'])
movie_group = actors_clean_drop.groupby('name',sort = False)
movie_group = movie_group['title'].agg(lambda column: ", ".join(column))
movie_group = movie_group.reset_index(name='title')
movie_group = movie_group.rename(columns={'title':'title list'})
actors_clean = movie_group.merge(actors_clean,on='name', how='left')

# Add list of movies to directors
directors_clean_drop = directors_clean.drop_duplicates(subset=['name','title'])
movie_group_dir = directors_clean_drop.groupby('name',sort = False)
movie_group_dir = movie_group_dir['title'].agg(lambda column: ", ".join(column))
movie_group_dir = movie_group_dir.reset_index(name='title')
movie_group_dir = movie_group_dir.rename(columns={'title':'title list'})
directors_clean = movie_group_dir.merge(directors_clean,on='name', how='left')

# Count each recurrance of an actor's name and create a DF.
actor_movie_count = actors_clean.drop_duplicates(subset=['name','title'])
actor_movie_count = pd.DataFrame({'count': actor_movie_count.groupby(['name','actor_id'],sort=False).size()}).reset_index()

In [168]:
# Get keywords for each movie, create new DF
keywords = pd.DataFrame()
movie_id_keywords = []

for movie_id in id_list:
    movie = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}/keywords?api_key={api_key}').json()
    for key in movie['keywords']:
        keywords = keywords.append(key, ignore_index=True)
        movie_id_keywords.append(movie_id)

keywords['movie_id'] = movie_id_keywords
keywords['movie_id'] = keywords['movie_id'].astype(int)

keywords = keywords.rename(columns={'name':'keywords'})
keywords = keywords.drop(columns=['id'])

# Add list of keywords to movie_list by movie id.
keyword_group = keywords.groupby('movie_id',sort = False)
keyword_lists = keyword_group['keywords'].agg(lambda column: ", ".join(column))
keyword_lists = keyword_lists.reset_index(name='keywords')

movie_list = movie_list.merge(keyword_lists,on='movie_id', how='left')


In [169]:
# Use geopy to get longitute and latitude of movie location.
geocoder = RateLimiter(Nominatim(user_agent='sean.cary62@gmail.com').geocode, min_delay_seconds=1)
movie_list['Full Location'] = movie_list['Location'].apply(geocoder)

movie_list['latitude'] = movie_list['Full Location'].apply(lambda loc: loc.latitude if loc else None)
movie_list['longitude'] = movie_list['Full Location'].apply(lambda loc: loc.longitude if loc else None)
movie_list.head(10)

Unnamed: 0,Title,Release date,Time Period,Start Time,End Time,Location,genre_ids,movie_id,original_language,overview,...,title,video,vote_average,vote_count,genres,name,keywords,Full Location,latitude,longitude
0,Homo Erectus,2007,50000 BC,-50000,-50000,Europe,[35],14641,en,Ishbo is a caveman living in the prehistoric a...,...,Homo Erectus,False,3.9,25,[Comedy],"Sasha Grey, David Carradine, Carol Alt, Talia ...","national lampoon serie, prehistoric times","(أوروبا, (51.0, 10.0))",51.0,10.0
1,The Tribe,1974,50000 BC,-50000,-50000,Europe,[],338039,en,"In the last Ice Age, a family of Cro-Magnons b...",...,The Tribe,False,5.0,2,[],,,"(أوروبا, (51.0, 10.0))",51.0,10.0
2,The Scorpion King 3: Battle for Redemption,2012,3100 BC,-3100,-3100,Egypt,"[28, 12, 14, 18]",78049,en,Since his triumphant rise to power in the orig...,...,The Scorpion King 3: Battle for Redemption,False,4.5,258,"[Action, Adventure, Fantasy, Drama]","Victor Webster, Bostin Christopher, Temuera Mo...",,"(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
3,Troilus & Cressida,1981,1305 BC,-1305,-1305,Anatolia,[18],119913,en,The bitter Trojan War drags on - the Greeks bl...,...,Troilus & Cressida,False,8.0,1,[Drama],"Anton Lesser, Suzanne Burden, Charles Gray, Be...","trojan war, ancient greece","(Asia Minor, İç Anadolu Bölgesi, Türkiye, (39....",39.057421,32.311238
4,Moses,1995,1300 BC,-1300,-1300,Egypt,[18],2719,en,An ordinary man is called upon by God to do th...,...,Moses,False,6.4,27,[Drama],"Ben Kingsley, Frank Langella, Christopher Lee,...","moses, ark of the covenant, ten commandments, ...","(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
5,Las Troyanas,1963,1300 BC,-1300,-1300,Anatolia,[],533220,es,Adaptation of Euripides: lamentations of the w...,...,Las Troyanas,False,6.0,4,[],"Ofelia Guilmáin, Erna Martha Bauman, Mercedes ...",,"(Asia Minor, İç Anadolu Bölgesi, Türkiye, (39....",39.057421,32.311238
6,The Lion of Thebes,1964,1300 BC,-1300,-1300,Anatolia,"[18, 28, 12]",211139,it,"Fleeing Troy in the wake of its destruction, f...",...,The Lion of Thebes,False,8.8,4,"[Drama, Action, Adventure]","Mark Forest, Yvonne Furneaux, Massimo Serato, ...","sword and sandal, peplum","(Asia Minor, İç Anadolu Bölgesi, Türkiye, (39....",39.057421,32.311238
7,Fury of Achilles,1962,1300 BC,-1300,-1300,Anatolia,"[10752, 12, 36, 18, 10749]",81409,it,"In the tenth year of the Trojan War, tensions ...",...,Fury of Achilles,False,6.7,3,"[War, Adventure, History, Drama, Romance]","Gordon Mitchell, Jacques Bergerac, Mario Petri...",,"(Asia Minor, İç Anadolu Bölgesi, Türkiye, (39....",39.057421,32.311238
8,The Ten Commandments: The Movie,2016,1250 BC,-1250,-1250,Egypt,[18],372519,pt,Follows Moses leading and conducting the Hebre...,...,The Ten Commandments: The Movie,False,7.3,238,[Drama],"Guilherme Winter, Camila Rodrigues, Sergio Mar...",,"(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
9,Troy the Odyssey,2017,1174 BC,-1174,-1174,Troy,"[12, 28]",493416,en,"In this re-telling of Iliad, set in 1174 B.C. ...",...,Troy the Odyssey,False,4.8,16,"[Adventure, Action]","Dylan Vox, Lara Heller, Hachem Hicham, Eoin O'...",,"(Troya'nın Arkeolojik Alanı, 17-56, Tevfikiye,...",39.957374,26.238017


In [170]:
# Combine old movie list with new movie list
movie_list = old_movie_list.append(movie_list)
movie_list = movie_list.drop_duplicates(subset=['movie_id'])
movie_list.sort_values('Release date', inplace=True)
movie_list.sort_values('Start Time', inplace=True)
movie_list.reset_index(drop=True, inplace= True)

In [171]:
# Import old csv files and append new data
actors_clean_old = pd.read_csv('resources/cleaned/actors_cleaned.csv')
actors_clean = actors_clean_old.append(actors_clean)
actors_clean = actors_clean.drop_duplicates(subset=['actor_id','character','movie_id'])
actors_clean.sort_values('name', inplace=True)
actors_clean.reset_index(drop=True, inplace= True)

directors_clean_old = pd.read_csv('resources/cleaned/director_cleaned.csv')
directors_clean = directors_clean_old.append(directors_clean)
directors_clean.drop_duplicates(subset=['director_id','movie_id'])
directors_clean.sort_values('name', inplace=True)
directors_clean.reset_index(drop=True, inplace= True)

studio_clean_old = pd.read_csv('resources/cleaned/studio_cleaned.csv')
studio_clean = studio_clean_old.append(studio_clean)
studio_clean.drop_duplicates(subset=['studio_id', 'movie_id'])
studio_clean.sort_values('studio_name', inplace=True)
studio_clean.reset_index(drop=True, inplace= True)

movie_genres_old = pd.read_csv('resources/cleaned/movie_genres.csv')
movie_genres_clean = movie_genres_old.append(movie_genres)
movie_genres_clean.drop_duplicates(subset=['movie_id','genre'])
movie_genres_clean.sort_values('movie_id', inplace=True)
movie_genres_clean.reset_index(drop=True, inplace= True)

keywords_old = pd.read_csv('resources/cleaned/keywords.csv')
keywords = keywords_old.append(keywords)
keywords.drop_duplicates(subset=['keywords','movie_id'])
keywords.sort_values('movie_id', inplace=True)
keywords.reset_index(drop=True, inplace= True)

# Count each recurrance of an actor's name and create a DF.
actor_movie_count = pd.DataFrame({'count': actors_clean.groupby(['name','actor_id'],sort=False).size()}).reset_index()

In [172]:
# Add csv files to cleaned folder
keywords.to_csv('resources/cleaned/keywords.csv', index=False)
actors_clean.to_csv('resources/cleaned/actors_cleaned.csv', index=False)
directors_clean.to_csv('resources/cleaned/director_cleaned.csv', index=False)
studio.to_csv('resources/cleaned/studio_cleaned.csv', index=False)
actor_movie_count.to_csv('resources/cleaned/actor_movie_count.csv', index=False)
movie_list.to_csv('resources/cleaned/movie_list_cleaned.csv', index=False)
movie_genres.to_csv('resources/cleaned/movie_genres.csv', index=False)