In [18]:
# import dependencies
import requests
import pandas as pd
import numpy as np
from config import api_key
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [19]:
# Load the movie list CSV from resources/cleaned/ into a DataFrame
movie_list = pd.read_csv('resources/cleaned/tmdb_movie_list.csv')

In [20]:
# Summarize the columns, non-null counts and dtypes of the loaded movie list
movie_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 622 entries, 0 to 621
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              622 non-null    object 
 1   Release date       622 non-null    int64  
 2   Time Period        622 non-null    object 
 3   Start Time         622 non-null    int64  
 4   End Time           622 non-null    int64  
 5   Location           621 non-null    object 
 6   genre_ids          622 non-null    object 
 7   movie_id           622 non-null    int64  
 8   original_language  622 non-null    object 
 9   overview           615 non-null    object 
 10  popularity         622 non-null    float64
 11  title              622 non-null    object 
 12  video              622 non-null    bool   
 13  vote_average       622 non-null    float64
 14  vote_count         622 non-null    int64  
 15  genres             622 non-null    object 
dtypes: bool(1), float64(2), int64(5), object(8)

### Create DFs for Actors, Directors, and Studios

In [21]:
# Code from The-Final-Project_F-PALS was refactored for this project.
# For every film in movie_list, call the TMDB API for its credits and details,
# then build the cleaned actor, director and studio tables.

REQUEST_TIMEOUT = 30  # seconds; keeps one stalled TMDB call from hanging the whole loop


def add_title_list(frame, key):
    """Return `frame` with a 'title list' column inserted right after `key`.

    'title list' is the comma-separated 'title' values of all rows of `frame`
    that share the same `key` value. Rows are returned grouped by `key`, in
    order of first appearance.
    """
    title_lists = (
        frame.groupby(key, sort=False)['title']
        .agg(lambda titles: ", ".join(titles))
        .reset_index(name='title list')
    )
    return title_lists.merge(frame, on=key, how='left')


# Create list of movie ids that will be used to make API calls for additional information
id_list = movie_list.movie_id.tolist()

# movie_list repeats some films on several rows (same movie_id), so request each
# id only once and look titles up in a de-duplicated table. Otherwise the credits
# of those films are fetched twice and multiplied again by the title merge.
unique_movie_ids = list(dict.fromkeys(id_list))  # keeps first-seen order
movie_titles = movie_list[['movie_id', 'title']].drop_duplicates()

# Collect plain records and build each DataFrame once after the loop:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame per call.
actor_records = []
director_records = []
studio_records = []

# Make API calls for movie_id to get the actors, directors, and studios for each film
for movie_id in unique_movie_ids:
    movie_credits = requests.get(
        f'https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={api_key}&language=en-US',
        timeout=REQUEST_TIMEOUT,
    ).json()

    for actor in movie_credits['cast']:
        actor_records.append({**actor, 'movie_id': movie_id})

    for crew_member in movie_credits['crew']:
        if crew_member['job'] == "Director":
            director_records.append({**crew_member, 'movie_id': movie_id})

    movie_studios = requests.get(
        f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}&language=en-US',
        timeout=REQUEST_TIMEOUT,
    ).json()
    for company in movie_studios['production_companies']:
        studio_records.append({**company, 'movie_id': movie_id})

actors_df = pd.DataFrame(actor_records)
directors_df = pd.DataFrame(director_records)
studio_df = pd.DataFrame(studio_records)

# Clean the new DFs
actors = actors_df.rename(columns={"id": "actor_id", 'popularity': 'actor_popularity'})
actors = actors[['name', 'actor_id', 'gender', 'character', 'actor_popularity', 'movie_id']]
actors = actors.merge(movie_titles, on='movie_id', how='left')
# One row per (actor name, film title)
actors_clean = actors.drop_duplicates(subset=['name', 'title'])

directors = directors_df.rename(columns={'id': 'director_id', 'popularity': 'director_popularity'})
directors = directors[['name', 'director_id', 'gender', 'director_popularity', 'movie_id']]
directors = directors.merge(movie_titles, on='movie_id', how='left')
directors_clean = directors.drop_duplicates(subset=['name', 'title'])

studio = (
    studio_df.drop(columns=['logo_path'])
    .rename(columns={'id': 'studio_id', 'name': 'studio_name'})
    .merge(movie_titles, on='movie_id', how='left')
)
studio_clean = add_title_list(studio, 'studio_name')

# Add list of actors to movie_list.
# Must run before actors_clean is regrouped by name below, so the names keep
# the order in which the API returned the cast.
actor_lists = (
    actors_clean.groupby('movie_id', sort=False)['name']
    .agg(lambda names: ", ".join(names))
    .reset_index(name='name')
)
# Drop a 'name' column left by a previous run so re-running the cell does not
# produce name_x / name_y.
movie_list = (
    movie_list.drop(columns=['name'], errors='ignore')
    .merge(actor_lists, on='movie_id', how='left')
)

# Add list of movies to actors and to directors
actors_clean = add_title_list(actors_clean, 'name')
directors_clean = add_title_list(directors_clean, 'name')

# Count each recurrence of an actor's name and create a DF
# (one row per name/actor_id pair with the number of films they appear in).
actor_movie_count = (
    actors_clean.drop_duplicates(subset=['name', 'title'])
    .groupby(['name', 'actor_id'], sort=False)
    .size()
    .reset_index(name='count')
)

In [22]:
# Get keywords for each movie, create new DF
# Records are collected in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
keyword_records = []

# id_list comes from movie_list, which repeats some films on several rows;
# dict.fromkeys drops the repeated ids (keeping first-seen order) so a film's
# keywords are not fetched and stored twice.
for movie_id in dict.fromkeys(id_list):
    movie = requests.get(
        f'https://api.themoviedb.org/3/movie/{movie_id}/keywords?api_key={api_key}',
        timeout=30,  # seconds; avoids hanging forever on a stalled request
    ).json()
    for key in movie['keywords']:
        keyword_records.append({**key, 'movie_id': movie_id})

keywords = pd.DataFrame(keyword_records)
keywords['movie_id'] = keywords['movie_id'].astype(int)

# Keep only the keyword text (renamed to 'keywords') and the movie id
keywords = keywords.rename(columns={'name': 'keywords'}).drop(columns=['id'])

# Add list of keywords to movie_list by movie id.
keyword_lists = (
    keywords.groupby('movie_id', sort=False)['keywords']
    .agg(lambda words: ", ".join(words))
    .reset_index(name='keywords')
)

# Drop a 'keywords' column left by a previous run so re-running the cell does
# not produce keywords_x / keywords_y.
movie_list = (
    movie_list.drop(columns=['keywords'], errors='ignore')
    .merge(keyword_lists, on='movie_id', how='left')
)


### Use Geopy to Get Coordinates for Movie Location

In [23]:
# Use geopy to get longitude and latitude of movie location.
# NOTE(review): the user_agent below is a personal e-mail address; confirm it
# is meant to be published with the notebook.
geocoder = RateLimiter(Nominatim(user_agent='sean.cary62@gmail.com').geocode, min_delay_seconds=1)

# Many rows share a Location (e.g. "Europe", "Egypt"); remember each answer so
# every distinct place costs only one rate-limited request.
location_cache = {}


def lookup_location(place):
    """Return the geocoder result for `place`, reusing earlier answers.

    Missing Location values return None without calling the geocoder;
    otherwise the NaN itself would be passed to it as a query.
    """
    if pd.isna(place):
        return None
    if place not in location_cache:
        location_cache[place] = geocoder(place)
    return location_cache[place]


movie_list['Full Location'] = movie_list['Location'].apply(lookup_location)

# getattr yields None for rows where no geocoding result is available
movie_list['latitude'] = movie_list['Full Location'].apply(lambda loc: getattr(loc, 'latitude', None))
movie_list['longitude'] = movie_list['Full Location'].apply(lambda loc: getattr(loc, 'longitude', None))
movie_list.head(10)

Unnamed: 0,Title,Release date,Time Period,Start Time,End Time,Location,genre_ids,movie_id,original_language,overview,...,title,video,vote_average,vote_count,genres,name,keywords,Full Location,latitude,longitude
0,Quest for Fire,1981,"80,000 BC",-80000,-80000,Europe,"[12, 18]",62204,fr,A colossal adventure odyssey that turns back t...,...,Quest for Fire,False,7.1,364,"['Adventure', 'Drama']","Everett McGill, Ron Perlman, Nicholas Kadi, Ra...","fire, based on novel or book, mammoth, stone a...","(Ōu, (51.0, 10.0))",51.0,10.0
1,The Clan of the Cave Bear,1986,"40,000 - 35,000 BC",-40000,-40000,Europe,"[12, 18]",13853,en,"Natural changes have the clans moving. Iza, me...",...,The Clan of the Cave Bear,False,5.1,88,"['Adventure', 'Drama']","Daryl Hannah, Pamela Reed, James Remar, Thomas...","stone age, animal attack, tribe, bear, cavemen...","(Ōu, (51.0, 10.0))",51.0,10.0
2,Alpha,2018,"20,000 BC",-20000,-20000,Europe,"[12, 18]",399360,en,"In the prehistoric past, Keda, a young and ine...",...,Alpha,False,6.4,2112,"['Adventure', 'Drama']","Kodi Smit-McPhee, Jóhannes Haukur Jóhannesson,...","wolf, ice age, human animal relationship, wild...","(Ōu, (51.0, 10.0))",51.0,10.0
3,"10,000 BC",2008,"10,000 BC",-10000,-10000,Europe,"[12, 28, 18, 14]",7840,en,A prehistoric epic that follows a young mammot...,...,"10,000 BC",False,5.4,2406,"['Adventure', 'Action', 'Drama', 'Fantasy']","Steven Strait, Camilla Belle, Cliff Curtis, Na...","hunter, indigenous, lover (female), pyramid, m...","(Ōu, (51.0, 10.0))",51.0,10.0
4,The Scorpion King 2: Rise of a Warrior,2008,2334-2284 BC/ 1792 - c. 1750 BC,-3210,-3210,Egypt,"[28, 12, 14, 878]",13486,en,The heroic tale of young Mathayus and his rele...,...,The Scorpion King 2: Rise of a Warrior,False,4.6,460,"['Action', 'Adventure', 'Fantasy', 'Science Fi...","Michael Copon, Randy Couture, Karen David, Sim...","magic, mercenary, training, greek, sword fight...","(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
5,The Scorpion King,2002,3200-3000 BC,-3200,-3200,Egypt,"[28, 12, 14]",9334,en,"In ancient Egypt, peasant Mathayus is hired to...",...,The Scorpion King,False,5.5,2525,"['Action', 'Adventure', 'Fantasy']","Dwayne Johnson, Kelly Hu, Michael Clarke Dunca...","egypt, temple, magic, prequel, sword fight, ba...","(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
6,The Pharaohs' Woman,1960,3100 BC,-3100,-3100,Egypt,[],330623,en,A beautiful girl and a young physician fall in...,...,The Pharaohs' Woman,False,0.0,0,[],,,"(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
7,Sudan,1945,2558-2532 BC,-2558,-2532,Egypt,"[12, 28, 10749]",137584,en,"A desert pickpocket, his sidekick, and an esca...",...,Sudan,False,6.8,4,"['Adventure', 'Action', 'Romance']","Maria Montez, Jon Hall, Turhan Bey, Andy Devin...","pickpocket, ancient egypt, escaped slave","(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
8,Mohenjo Daro,2016,2500 - 2000 BC,-2500,-2500,Mohenjo-daro,"[12, 18, 36, 10749]",402672,hi,"During the Indus valley civilization, an Indig...",...,Mohenjo Daro,False,6.4,61,"['Adventure', 'Drama', 'History', 'Romance']","Hrithik Roshan, Pooja Hegde, Kabir Bedi, Aruno...","crocodile, india, ancient civilization, bronze...","(Mohenjo-daro VS area, Moenjo Daro Road, سندھ,...",27.321657,68.136593
9,Mohenjo Daro,2016,2500 - 2000 BC,-2500,-2500,Mohenjo-daro,"[12, 18, 36, 10749]",402672,hi,"During the Indus valley civilization, an Indig...",...,Mohenjo Daro,False,6.4,61,"['Adventure', 'Drama', 'History', 'Romance']","Hrithik Roshan, Pooja Hegde, Kabir Bedi, Aruno...","crocodile, india, ancient civilization, bronze...","(Mohenjo-daro VS area, Moenjo Daro Road, سندھ,...",27.321657,68.136593


### Get Actor Recurrence Score (ARS)

#### Count the movies on the timeline that each actor appears in. A movie's ARS is the sum of those counts over its cast; actors who appear in only one movie contribute 0.

In [24]:
# Get count of recurrence for each actor:
# number of distinct titles per (name, actor_id) pair.
actor_movie_count = (
    actors_clean.drop_duplicates(subset=['name', 'title'])
    .groupby(['name', 'actor_id'], sort=False)
    .size()
    .reset_index(name='count')
)
# Actors seen in a single movie must not add to the score, so their count becomes 0.
# The result is assigned back: replace(..., inplace=True) on a selected column
# is a chained in-place edit, which does not update the frame under pandas
# copy-on-write.
actor_movie_count['count'] = actor_movie_count['count'].replace({1: 0})

# Attach each actor's count to their rows in actors_clean.
# A 'count' column left by a previous run is dropped first so the cell can be
# re-run without producing count_x / count_y.
actors_clean = (
    actors_clean.drop(columns=['count'], errors='ignore')
    .merge(actor_movie_count, how='left', on=['actor_id', 'name'])
)

# ARS of a movie = sum of the counts of the actors credited in it
actor_recurrance_score = (
    actors_clean.groupby('movie_id')['count']
    .sum()
    .reset_index(name='ARS')
)

# Add ARS to movie_list DF, then order chronologically.
# A single two-key sort replaces the former "sort by ARS, then by Start Time"
# pair, whose second (unstable) sort left the order of same-year rows undefined;
# ties on Start Time are now broken by highest ARS first.
movie_list = (
    movie_list.drop(columns=['ARS'], errors='ignore')
    .merge(actor_recurrance_score, how='left', on='movie_id')
    .sort_values(by=['Start Time', 'ARS'], ascending=[True, False])
    .reset_index(drop=True)
)

In [25]:
# Write every cleaned table to the cleaned folder as CSV (row index omitted).
# NOTE(review): `studio` is exported here although `studio_clean` is also built
# earlier in the notebook and never written anywhere; confirm which is intended.
csv_exports = [
    (keywords, 'resources/cleaned/keywords.csv'),
    (actors_clean, 'resources/cleaned/actors_cleaned.csv'),
    (directors_clean, 'resources/cleaned/director_cleaned.csv'),
    (studio, 'resources/cleaned/studio_cleaned.csv'),
    (actor_movie_count, 'resources/cleaned/actor_movie_count.csv'),
    (movie_list, 'resources/cleaned/movie_list_cleaned.csv'),
]

for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)