# Import Starter Movie List

In [1]:
# import dependencies
import requests
import pandas as pd
import numpy as np
from config import api_key
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [2]:
# Read in movies from Wikipedia
wiki_movie_list = pd.read_csv('resources/wiki_movie_list.csv')
wiki_movie_list.head()

wiki_movie_list.drop(columns=['Diff', 'Notes on setting'], inplace=True)


### Get Additional Data from TMDB
TMDB is a community built Movie and TV Database.

In [3]:
# Create list of titles to cycle through TMDB API.
title_list = wiki_movie_list.Title
tmdb_movie_list = pd.DataFrame()
new_titles = []

# Transform titles for query

for i in wiki_movie_list.index:
#for i in range(0,30):
    try:
        title = wiki_movie_list['Title'][i]
        year = wiki_movie_list['Release date'][i]

        # Remove anything in parenthesis and brackets
        start_1 = title.find('(')
        end_1 = title.find(')')+1

        start_2 = title.find('[')
        end_2 = title.find(']')+1
        
        sub_1=title[start_1:end_1]
        sub_2=title[start_2:end_2]

        title = title.replace(sub_1,'')
        title = title.replace(sub_2,'')
        new_titles.append(title)

        
       # Replace spaces with '+'
        title = title.replace(' ','+')

        # Query TMDB data base for movies on wiki list
        movie = pd.DataFrame(requests.get(f'https://api.themoviedb.org/3/search/movie?api_key={api_key}&year={year}&query={title}').json()['results'])

        tmdb_movie_list = tmdb_movie_list.append(movie)
    except:
        print(f'error on index#: {i}')
    # Track Progress
    if i % 100 == 0:
        print(f'movies searched: {i}')

# Clean new DF
# tmdb_movie_list = tmdb_movie_list.drop_duplicates(subset=['title','release_date'])
tmdb_movie_list['release_year'] = tmdb_movie_list['release_date'].astype(str).str[0:4].astype(int)

# Combine Wiki DF with TMDB DF
movie_list = wiki_movie_list.merge(tmdb_movie_list, how='left', left_on=['Title','Release date'], right_on=['title','release_year'])
movie_list.drop(columns=['adult', 'backdrop_path', 'original_title', 'poster_path', 'release_date', 'release_year'], inplace=True)
movie_list = movie_list.loc[~(movie_list['Start Time'] == 0)]
movie_list.dropna(subset=['id'], inplace=True)
movie_list.reset_index(drop=True, inplace= True)
movie_list.rename(columns={'id':'movie_id'}, inplace=True)
movie_list.sort_values('Release date', inplace=True)
movie_list.sort_values('Start Time', inplace=True)
movie_list['movie_id'] = movie_list.movie_id.astype(int)
movie_list.head()


movies searched: 0
movies searched: 100
movies searched: 200
movies searched: 300
movies searched: 400
movies searched: 500
movies searched: 600
movies searched: 700
movies searched: 800
movies searched: 900
movies searched: 1000
movies searched: 1100


Unnamed: 0,Title,Release date,Time Period,Start Time,End Time,Location,genre_ids,movie_id,original_language,overview,popularity,title,video,vote_average,vote_count
450,Quest for Fire,1981,"80,000 BC",-80000,-80000,Europe,"[12, 18]",62204,fr,A colossal adventure odyssey that turns back t...,12.813,Quest for Fire,False,7.1,361.0
0,The Clan of the Cave Bear,1986,"40,000 - 35,000 BC",-40000,-40000,Europe,"[12, 18]",13853,en,"Natural changes have the clans moving. Iza, me...",14.848,The Clan of the Cave Bear,False,5.1,87.0
618,Alpha,2018,"20,000 BC",-20000,-20000,Europe,"[12, 18]",399360,en,"In the prehistoric past, Keda, a young and ine...",40.553,Alpha,False,6.4,2099.0
564,"10,000 BC",2008,"10,000 BC",-10000,-10000,Europe,"[12, 28, 18, 14]",7840,en,A prehistoric epic that follows a young mammot...,29.338,"10,000 BC",False,5.4,2391.0
607,Gods of Egypt,2016,3200 BC,-3200,-3200,Egypt,"[28, 12, 14]",205584,en,A common thief joins a mythical god on a quest...,52.594,Gods of Egypt,False,5.6,3372.0


In [None]:
movie_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 622 entries, 450 to 79
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              622 non-null    object 
 1   Release date       622 non-null    int64  
 2   Time Period        622 non-null    object 
 3   Start Time         622 non-null    int64  
 4   End Time           622 non-null    int64  
 5   Location           621 non-null    object 
 6   genre_ids          622 non-null    object 
 7   movie_id           622 non-null    int32  
 8   original_language  622 non-null    object 
 9   overview           622 non-null    object 
 10  popularity         622 non-null    float64
 11  title              622 non-null    object 
 12  video              622 non-null    object 
 13  vote_average       622 non-null    float64
 14  vote_count         622 non-null    float64
dtypes: float64(3), int32(1), int64(3), object(8)
memory usage: 75.3+ KB


### Create DFs for Actors, Directors, and Studios

In [None]:
# Code from The-Final-Project_F-PALS was refractored for this project.
# The code below will make an API call based on the movie list to get the leading actors and directors of each film.

# Create list of movie ids that will be used to make API calls for additional information
id_list = movie_list.movie_id.tolist()

# Create blank DFs and lists
actors_df = pd.DataFrame()
directors_df = pd.DataFrame()
studio_df = pd.DataFrame()
actor_movie_id = []
director_movie_id = []
studio_movie_id = []

# Make API calls for movie_id to get the actors, directors, and studios for each film
for movie_id in id_list:
    movie_credits = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={api_key}&language=en-US').json()

    for actor in movie_credits['cast']:
        actors_df = actors_df.append(actor, ignore_index=True)
        actor_movie_id.append(movie_id)

    for director in movie_credits['crew']:
        if director['job'] == "Director":
            directors_df = directors_df.append(director, ignore_index=True)
            director_movie_id.append(movie_id)

    movie_studios = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}&language=en-US').json()
    for studio in movie_studios['production_companies']:
        studio_df = studio_df.append(studio, ignore_index=True)
        studio_movie_id.append(movie_id)

# Clean the new DFs
actors_df["movie_id"] = actor_movie_id
directors_df['movie_id'] = director_movie_id
actors = actors_df.rename(columns = {"id": "actor_id",'popularity': 'actor_popularity'})
actors = actors[['name','actor_id','gender','character','actor_popularity','movie_id']]
actors_clean = pd.merge(actors,movie_list[['movie_id', 'title']], on='movie_id', how='left')

directors_df['movie_id'] = director_movie_id
directors = directors_df.rename(columns={'id': 'director_id', 'popularity': 'director_popularity'})
directors = directors[['name','director_id','gender','director_popularity','movie_id']]
directors_clean = pd.merge(directors,movie_list[['movie_id', 'title']], on='movie_id', how='left')

studio_df['movie_id'] = studio_movie_id
studio = studio_df.drop(columns=['logo_path'])
studio = studio.rename(columns={'id': 'studio_id', 'name':'studio_name'})
studio = pd.merge(studio,movie_list[['movie_id', 'title']], on='movie_id', how='left')

studio_group = studio.groupby('studio_name',sort = False)
studio_group = studio_group['title'].agg(lambda column: ", ".join(column))
studio_group = studio_group.reset_index(name='title')
studio_group = studio_group.rename(columns={'title':'title list'})
studio_clean = studio_group.merge(studio,on='studio_name', how='left')

# Add list of actors to movie_list
actors_group = actors_clean.groupby('movie_id',sort = False)
actor_lists = actors_group['name'].agg(lambda column: ", ".join(column))
actor_lists = actor_lists.reset_index(name='name')
movie_list = movie_list.merge(actor_lists,on='movie_id', how='left')

# Add list of movies to actors
actors_clean_drop = actors_clean.drop_duplicates(subset=['name','title'])
movie_group = actors_clean_drop.groupby('name',sort = False)
movie_group = movie_group['title'].agg(lambda column: ", ".join(column))
movie_group = movie_group.reset_index(name='title')
movie_group = movie_group.rename(columns={'title':'title list'})
actors_clean = movie_group.merge(actors_clean,on='name', how='left')

# Add list of movies to directors
directors_clean_drop = directors_clean.drop_duplicates(subset=['name','title'])
movie_group_dir = directors_clean_drop.groupby('name',sort = False)
movie_group_dir = movie_group_dir['title'].agg(lambda column: ", ".join(column))
movie_group_dir = movie_group_dir.reset_index(name='title')
movie_group_dir = movie_group_dir.rename(columns={'title':'title list'})
directors_clean = movie_group_dir.merge(directors_clean,on='name', how='left')

# Count each recurrance of an actor's name and create a DF.
actor_movie_count = actors_clean.drop_duplicates(subset=['name','title'])
actor_movie_count = pd.DataFrame({'count': actor_movie_count.groupby(['name','actor_id'],sort=False).size()}).reset_index()

#### Get Keywords for Each Movie

In [None]:
# Get keywords for each movie, create new DF
keywords = pd.DataFrame()
movie_id_keywords = []

for movie_id in id_list:
    movie = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}/keywords?api_key={api_key}').json()
    for key in movie['keywords']:
        keywords = keywords.append(key, ignore_index=True)
        movie_id_keywords.append(movie_id)

keywords['movie_id'] = movie_id_keywords
keywords['movie_id'] = keywords['movie_id'].astype(int)

keywords = keywords.rename(columns={'name':'keywords'})
keywords = keywords.drop(columns=['id'])

# Add list of keywords to movie_list by movie id.
keyword_group = keywords.groupby('movie_id',sort = False)
keyword_lists = keyword_group['keywords'].agg(lambda column: ", ".join(column))
keyword_lists = keyword_lists.reset_index(name='keywords')

movie_list = movie_list.merge(keyword_lists,on='movie_id', how='left')


#### Get Genres for Each Movie

In [4]:

# Get genre list from TMDB 
genre = requests.get(f'https://api.themoviedb.org/3/genre/movie/list?api_key={api_key}&language=en-US').json()['genres']
genre_df = pd.DataFrame(genre)

genre_df.set_index(genre_df['id'],inplace = True)
genre_df = genre_df.drop('id',1)

genre_title = []

# Get genre names from genre_id column
for ls in movie_list.genre_ids:
    holder = []
    for id in ls:
        holder.append(genre_df.loc[id]['name'])
    genre_title.append(holder)

movie_list['genres'] = genre_title

# Make a DF that lists genres for each movie_id
genre_series = []
movie_series = []

for i in range(len(movie_list)-1):
    try:
        genres = movie_list.loc[i, 'genres']
        genres_list = (','.join(genres))
        for genre in genre_df['name']:
            if genre in genres_list:
                id = movie_list.loc[i, 'movie_id']
                movie_series.append(id)
                genre_series.append(genre)
                i=i+1
    except:
        print(f'error at {i}')

movie_genres =  pd.DataFrame({'movie_id':movie_series, 'genre':genre_series})        



error at 622


### Use Geopy to Get Coordinates for Movie Location

In [None]:
# Use geopy to get longitute and latitude of movie location.
geocoder = RateLimiter(Nominatim(user_agent='sean.cary62@gmail.com').geocode, min_delay_seconds=1)
movie_list['Full Location'] = movie_list['Location'].apply(geocoder)

movie_list['latitude'] = movie_list['Full Location'].apply(lambda loc: loc.latitude if loc else None)
movie_list['longitude'] = movie_list['Full Location'].apply(lambda loc: loc.longitude if loc else None)
movie_list.head(10)


Unnamed: 0,Title,Release date,Time Period,Start Time,End Time,Location,genre_ids,movie_id,original_language,overview,...,title,video,vote_average,vote_count,name,keywords,genres,Full Location,latitude,longitude
0,Quest for Fire,1981,"80,000 BC",-80000,-80000,Europe,"[12, 18]",62204,fr,A colossal adventure odyssey that turns back t...,...,Quest for Fire,False,7.1,361.0,"Everett McGill, Ron Perlman, Nicholas Kadi, Ra...","fire, based on novel or book, mammoth, stone a...","[Adventure, Drama]","(أوروبا, (51.0, 10.0))",51.0,10.0
1,The Clan of the Cave Bear,1986,"40,000 - 35,000 BC",-40000,-40000,Europe,"[12, 18]",13853,en,"Natural changes have the clans moving. Iza, me...",...,The Clan of the Cave Bear,False,5.1,87.0,"Daryl Hannah, Pamela Reed, James Remar, Thomas...","stone age, animal attack, tribe, bear, cavemen...","[Adventure, Drama]","(أوروبا, (51.0, 10.0))",51.0,10.0
2,Alpha,2018,"20,000 BC",-20000,-20000,Europe,"[12, 18]",399360,en,"In the prehistoric past, Keda, a young and ine...",...,Alpha,False,6.4,2093.0,"Kodi Smit-McPhee, Jóhannes Haukur Jóhannesson,...","wolf, ice age, human animal relationship, wild...","[Adventure, Drama]","(أوروبا, (51.0, 10.0))",51.0,10.0
3,"10,000 BC",2008,"10,000 BC",-10000,-10000,Europe,"[12, 28, 18, 14]",7840,en,A prehistoric epic that follows a young mammot...,...,"10,000 BC",False,5.4,2389.0,"Steven Strait, Camilla Belle, Cliff Curtis, Na...","hunter, indigenous, lover (female), pyramid, m...","[Adventure, Action, Drama, Fantasy]","(أوروبا, (51.0, 10.0))",51.0,10.0
4,Gods of Egypt,2016,3200 BC,-3200,-3200,Egypt,"[28, 12, 14]",205584,en,A common thief joins a mythical god on a quest...,...,Gods of Egypt,False,5.6,3368.0,"Nikolaj Coster-Waldau, Brenton Thwaites, Gerar...","egypt, underworld, fight, mythology, nile, thi...","[Action, Adventure, Fantasy]","(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
5,The Scorpion King,2002,3200-3000 BC,-3200,-3200,Egypt,"[28, 12, 14]",9334,en,"In ancient Egypt, peasant Mathayus is hired to...",...,The Scorpion King,False,5.5,2503.0,"Dwayne Johnson, Kelly Hu, Michael Clarke Dunca...","egypt, temple, magic, sword fight, battlefield...","[Action, Adventure, Fantasy]","(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
6,The Pharaohs' Woman,1960,3100 BC,-3100,-3100,Egypt,[],330623,en,A beautiful girl and a young physician fall in...,...,The Pharaohs' Woman,False,0.0,0.0,,,[],"(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
7,Cleopatra's Daughter,1960,2589-2566 BC,-2589,-2566,Egypt,"[12, 18, 10749]",257797,it,The beautiful young Sushila is forced into a p...,...,Cleopatra's Daughter,False,5.2,3.0,"Debra Paget, Debra Paget, Ettore Manni, Ettore...","dancing, pharao, ancient egypt, pharaoh, danci...","[Adventure, Drama, Romance]","(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
8,Cleopatra's Daughter,1960,2589-2566 BC,-2589,-2566,Egypt,"[12, 18, 10749]",257797,it,The beautiful young Sushila is forced into a p...,...,Cleopatra's Daughter,False,5.2,3.0,"Debra Paget, Debra Paget, Ettore Manni, Ettore...","dancing, pharao, ancient egypt, pharaoh, danci...","[Adventure, Drama, Romance]","(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
9,Sudan,1945,2558-2532 BC,-2558,-2532,Egypt,"[12, 28, 10749]",137584,en,"A desert pickpocket, his sidekick, and an esca...",...,Sudan,False,6.8,4.0,"Maria Montez, Jon Hall, Turhan Bey, Andy Devin...","pickpocket, ancient egypt, escaped slave","[Adventure, Action, Romance]","(مصر, (26.2540493, 29.2675469))",26.254049,29.267547


In [None]:
# Add csv files to cleaned folder
keywords.to_csv('resources/cleaned/keywords.csv', index=False)
actors_clean.to_csv('resources/cleaned/actors_cleaned.csv', index=False)
directors_clean.to_csv('resources/cleaned/director_cleaned.csv', index=False)
studio.to_csv('resources/cleaned/studio_cleaned.csv', index=False)
actor_movie_count.to_csv('resources/cleaned/actor_movie_count.csv', index=False)
movie_list.to_csv('resources/cleaned/movie_list_cleaned.csv', index=False)
movie_genres.to_csv('resources/cleaned/movie_genres.csv', index=False)