In [4]:
# import dependencies
import requests
import pandas as pd
import numpy as np
from config import api_key
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [5]:
# Load the movie list CSV from the cleaned resources folder into a DataFrame.
movie_list = pd.read_csv('resources/cleaned/tmdb_movie_list.csv')

In [6]:
# Print a summary of movie_list: index range, column names, non-null counts, and dtypes.
movie_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 563 entries, 0 to 562
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              563 non-null    object 
 1   Release date       563 non-null    int64  
 2   Time Period        563 non-null    object 
 3   Start Time         563 non-null    int64  
 4   End Time           563 non-null    int64  
 5   Location           562 non-null    object 
 6   genre_ids          563 non-null    object 
 7   movie_id           563 non-null    int64  
 8   original_language  563 non-null    object 
 9   overview           557 non-null    object 
 10  popularity         563 non-null    float64
 11  title              563 non-null    object 
 12  video              563 non-null    bool   
 13  vote_average       563 non-null    float64
 14  vote_count         563 non-null    int64  
 15  genres             563 non-null    object 
 16  name               559 non

### Create DataFrames for Actors, Directors, and Studios

In [7]:
# Code from The-Final-Project_F-PALS was refactored for this project.
# The code below calls the TMDB API for every movie in the movie list to get the
# cast, the directors, and the production companies (studios) of each film.

# Shared request settings. The API key and language are passed through `params` so
# requests builds the query string, and the timeout keeps a single stalled call
# from hanging the whole cell.
tmdb_movie_url = 'https://api.themoviedb.org/3/movie'
tmdb_params = {'api_key': api_key, 'language': 'en-US'}
request_timeout = 30  # seconds

# Create list of movie ids that will be used to make API calls for additional information
id_list = movie_list.movie_id.tolist()

# Collect the raw API records in plain lists and build each DataFrame once after the
# loop. DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
# Each *_movie_id list runs parallel to its record list (one movie id per record).
actor_records = []
director_records = []
studio_records = []
actor_movie_id = []
director_movie_id = []
studio_movie_id = []

# Make API calls for movie_id to get the actors, directors, and studios for each film.
# A single Session reuses the HTTP connection across all calls.
with requests.Session() as session:
    for movie_id in id_list:
        credits_response = session.get(
            f'{tmdb_movie_url}/{movie_id}/credits',
            params=tmdb_params,
            timeout=request_timeout,
        )
        # Fail with the HTTP error itself rather than a KeyError on 'cast' below.
        credits_response.raise_for_status()
        movie_credits = credits_response.json()

        for actor in movie_credits['cast']:
            actor_records.append(actor)
            actor_movie_id.append(movie_id)

        # The crew list covers every job; keep only the entries credited as Director.
        for director in movie_credits['crew']:
            if director['job'] == "Director":
                director_records.append(director)
                director_movie_id.append(movie_id)

        details_response = session.get(
            f'{tmdb_movie_url}/{movie_id}',
            params=tmdb_params,
            timeout=request_timeout,
        )
        details_response.raise_for_status()
        movie_studios = details_response.json()

        for studio in movie_studios['production_companies']:
            studio_records.append(studio)
            studio_movie_id.append(movie_id)

# One row per record; columns are the union of the keys returned by the API.
actors_df = pd.DataFrame(actor_records)
directors_df = pd.DataFrame(director_records)
studio_df = pd.DataFrame(studio_records)

# Clean the actors DataFrame: tag each cast row with its movie id, keep the columns of
# interest under clearer names, attach the movie title, and keep one row per
# (actor name, title) pair.
actors_df["movie_id"] = actor_movie_id
actor_columns = ['name', 'actor_id', 'gender', 'character', 'actor_popularity', 'movie_id']
actors = (
    actors_df
    .rename(columns={"id": "actor_id", 'popularity': 'actor_popularity'})
    [actor_columns]
    .merge(movie_list[['movie_id', 'title']], on='movie_id', how='left')
)
actors_clean = actors.drop_duplicates(subset=['name', 'title'])

# Clean the directors DataFrame the same way: tag rows with the movie id, keep and
# rename the columns of interest, attach the movie title, and keep one row per
# (director name, title) pair.
directors_df['movie_id'] = director_movie_id
director_columns = ['name', 'director_id', 'gender', 'director_popularity', 'movie_id']
directors = (
    directors_df
    .rename(columns={'id': 'director_id', 'popularity': 'director_popularity'})
    [director_columns]
    .merge(movie_list[['movie_id', 'title']], on='movie_id', how='left')
)
directors_clean = directors.drop_duplicates(subset=['name', 'title'])

# Clean the studio DataFrame: tag rows with the movie id, drop the logo path, rename
# the id/name columns, and attach the movie title.
studio_df['movie_id'] = studio_movie_id
studio = (
    studio_df
    .drop(columns=['logo_path'])
    .rename(columns={'id': 'studio_id', 'name': 'studio_name'})
    .merge(movie_list[['movie_id', 'title']], on='movie_id', how='left')
)

# Build one comma-separated list of titles per studio (first-seen order), then join it
# back onto every studio row as the 'title list' column.
studio_group = (
    studio
    .groupby('studio_name', sort=False)['title']
    .agg(lambda titles: ", ".join(titles))
    .reset_index(name='title list')
)
studio_clean = studio_group.merge(studio, on='studio_name', how='left')

# Add list of actors to movie_list: one comma-separated string of cast names per movie,
# left-joined on movie_id so movies without cast rows are kept.
# NOTE(review): the movie_list.info() output earlier in this notebook already lists a
# `name` column; if it is still present, this merge will produce `name_x`/`name_y`
# rather than a single `name` column -- confirm that is intended.
actors_group = actors_clean.groupby('movie_id', sort=False)
actor_lists = (
    actors_group['name']
    .agg(lambda names: ", ".join(names))
    .reset_index(name='name')
)
movie_list = pd.merge(movie_list, actor_lists, on='movie_id', how='left')

# Add list of movies to actors: collect each actor's titles into one comma-separated
# 'title list' string (grouped by name, first-seen order) and join it onto every row
# of actors_clean.
actors_clean_drop = actors_clean.drop_duplicates(subset=['name', 'title'])
movie_group = (
    actors_clean_drop
    .groupby('name', sort=False)['title']
    .agg(lambda titles: ", ".join(titles))
    .reset_index(name='title list')
)
actors_clean = movie_group.merge(actors_clean, on='name', how='left')

# Add list of movies to directors: collect each director's titles into one
# comma-separated 'title list' string (grouped by name, first-seen order) and join it
# onto every row of directors_clean.
directors_clean_drop = directors_clean.drop_duplicates(subset=['name', 'title'])
movie_group_dir = (
    directors_clean_drop
    .groupby('name', sort=False)['title']
    .agg(lambda titles: ", ".join(titles))
    .reset_index(name='title list')
)
directors_clean = movie_group_dir.merge(directors_clean, on='name', how='left')

# Count each recurrence of an actor's name and create a DF:
# one row per (name, actor_id) with the number of distinct titles in `count`.
actor_movie_count = (
    actors_clean
    .drop_duplicates(subset=['name', 'title'])
    .groupby(['name', 'actor_id'], sort=False)
    .size()
    .reset_index(name='count')
)

In [8]:
# Get keywords for each movie, create new DF.
# Records are collected in a list and the DataFrame is built once after the loop:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies the
# whole frame on every iteration.
keyword_records = []
movie_id_keywords = []  # runs parallel to keyword_records: one movie id per keyword

with requests.Session() as session:
    for movie_id in id_list:
        keyword_response = session.get(
            f'https://api.themoviedb.org/3/movie/{movie_id}/keywords',
            params={'api_key': api_key},
            timeout=30,  # seconds; keeps one stalled call from hanging the cell
        )
        # Fail with the HTTP error itself rather than a KeyError on 'keywords' below.
        keyword_response.raise_for_status()
        movie = keyword_response.json()
        for key in movie['keywords']:
            keyword_records.append(key)
            movie_id_keywords.append(movie_id)

keywords = pd.DataFrame(keyword_records)
keywords['movie_id'] = movie_id_keywords
keywords['movie_id'] = keywords['movie_id'].astype(int)

# Keep only the keyword text (renamed from 'name') alongside the movie id.
keywords = keywords.rename(columns={'name':'keywords'})
keywords = keywords.drop(columns=['id'])

# Add list of keywords to movie_list by movie id:
# one comma-separated string per movie, left-joined so movies without keywords are kept.
keyword_group = keywords.groupby('movie_id',sort = False)
keyword_lists = keyword_group['keywords'].agg(lambda column: ", ".join(column))
keyword_lists = keyword_lists.reset_index(name='keywords')

movie_list = movie_list.merge(keyword_lists,on='movie_id', how='left')


### Use Geopy to Get Coordinates for Movie Location

In [9]:
# Use geopy to get longitude and latitude of movie location.
# The RateLimiter wrapper spaces calls at least one second apart.
geocoder = RateLimiter(Nominatim(user_agent='sean.cary62@gmail.com').geocode, min_delay_seconds=1)

# Results are cached per distinct location string so each place is geocoded once
# instead of once per movie row; every real call has to wait out the rate limit.
geocode_cache = {}


def geocode_location(place):
    """Return the geocoder result for `place`, or None when it is missing or blank.

    Missing values (NaN/None) and empty strings are skipped rather than being
    forwarded to the geocoder as a query. Repeated places are answered from
    `geocode_cache`, including places the geocoder could not resolve (None).
    """
    if pd.isna(place) or not str(place).strip():
        return None
    if place not in geocode_cache:
        geocode_cache[place] = geocoder(place)
    return geocode_cache[place]


movie_list['Full Location'] = movie_list['Location'].apply(geocode_location)

# Rows with no geocoding result get None for both coordinates.
movie_list['latitude'] = movie_list['Full Location'].apply(lambda loc: loc.latitude if loc else None)
movie_list['longitude'] = movie_list['Full Location'].apply(lambda loc: loc.longitude if loc else None)
movie_list.head(10)

KeyboardInterrupt: 

In [None]:
# Add csv files to cleaned folder: each DataFrame is written without its index,
# in the order listed.
# NOTE(review): 'studio_cleaned.csv' is written from `studio`, not `studio_clean`
# (the frame that carries the 'title list' column) -- confirm that is intended.
output_frames = {
    'resources/cleaned/keywords.csv': keywords,
    'resources/cleaned/actors_cleaned.csv': actors_clean,
    'resources/cleaned/director_cleaned.csv': directors_clean,
    'resources/cleaned/studio_cleaned.csv': studio,
    'resources/cleaned/actor_movie_count.csv': actor_movie_count,
    'resources/cleaned/movie_list_cleaned.csv': movie_list,
}
for csv_path, frame in output_frames.items():
    frame.to_csv(csv_path, index=False)