In [7]:
# import dependencies
import requests
import pandas as pd
from config import api_key
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [8]:
wiki_movie_list = pd.read_csv('resources/wiki_movie_list.csv')
wiki_movie_list.head()

Unnamed: 0,Title,Release date,Time Period,Start Time,End Time,Diff,Notes on setting,Location
0,Beatrice Cenci,1909,1577-1599,1577,1599,22,About the life events of�Beatrice Cenci.,Rome
1,A Tale of Two Cities,1911,1755-1792,1755,1792,37,England and�France�prior and during the�French...,France
2,K?chiyama to Nao-zamurai,1916,0,0,0,0,Based on the play of the same name.,Japan
3,Cleopatra,1917,48-30 BC,-48,-30,18,Egypt�and�Rome�� follows her relationships fir...,Egypt
4,Teodora,1921,500-548,500,548,48,Byzantine�empress�Theodora,Istanbul


In [9]:
# Create list of titles to cycle through TMDB API.
title_list = wiki_movie_list.Title
tmdb_movie_list = pd.DataFrame()
new_titles = []

# Transform titles for query
for ind in wiki_movie_list.index:
    try:
        title = wiki_movie_list['Title'][ind]
        year = wiki_movie_list['Release date'][ind]

        # Remove anything in parenthesis and brackets for database query
        start_1 = title.find('(')
        end_1 = title.find(')')+1

        start_2 = title.find('[')
        end_2 = title.find(']')+1
        
        sub_1=title[start_1:end_1]
        sub_2=title[start_2:end_2]

        title = title.replace(sub_1,'')
        title = title.replace(sub_2,'')
        new_titles.append(title)

        # Replace spaces with '+' for query
        title = title.replace(' ','+')

        #Query TMDB data base for movies on wiki list
        movie = pd.DataFrame(requests.get(f'https://api.themoviedb.org/3/search/movie?api_key={api_key}&year={year}&query={title}').json()['results'])

        tmdb_movie_list = tmdb_movie_list.append(movie)
    except:
        print(f'error on index#: {ind}')

    if ind % 100 == 0:
        print(f'movies searched: {ind}')

tmdb_movie_list = tmdb_movie_list.drop_duplicates(subset=['title','release_date'])

tmdb_movie_list['release_year'] = tmdb_movie_list['release_date'].astype(str).str[0:4].astype(int)

movie_list = wiki_movie_list.merge(tmdb_movie_list, how='left', left_on=['Title','Release date'], right_on=['title','release_year'])
movie_list = movie_list.drop(columns=['Diff', 'adult', 'backdrop_path', 'original_title', 'poster_path', 'release_date', 'release_year'])
movie_list.dropna(subset=['id','Start Time'], inplace=True)



movies searched: 0
movies searched: 100
movies searched: 200
movies searched: 300
movies searched: 400
movies searched: 500
movies searched: 600
movies searched: 700
movies searched: 800
movies searched: 900
movies searched: 1000
movies searched: 1100


In [20]:
movie_list = movie_list.rename(columns={'id':'movie_id'})
# movie_list['Start Time'].astype(str).astype(int)

movie_list = movie_list.sort_values('Start Time')

# Code from The-Final-Project_F-PALS, Popular_Crew.ipynb, refractoring for this project.
# The code below will make an API call based on the movie list to get the leading actors and directors of each film.

# Create list of movie ids that will be used to make API calls for additional information
id_list = movie_list.movie_id.tolist()

actors = pd.DataFrame()
directors = pd.DataFrame()
studio = pd.DataFrame()
actor_movie_id = []
director_movie_id = []
studio_movie_id = []

for movie_id in id_list:
    movie = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={api_key}&language=en-US').json()

    for actor in movie['cast']:
        actors = actors.append(actor, ignore_index=True)
        actor_movie_id.append(movie_id)

    for director in movie['crew']:
        if director['job'] == "Director":
            directors = directors.append(director, ignore_index=True)
            director_movie_id.append(movie_id)

for movie_id in id_list:
    movieDetail = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}&language=en-US').json()
    for movie in movieDetail['production_companies']:
        studio = studio.append(movie, ignore_index=True)
        studio_movie_id.append(movie_id)


actors["movie_id"] = actor_movie_id
directors['movie_id'] = director_movie_id
actors = actors.rename(columns = {"id": "actor_id",'popularity': 'actor_popularity'})
actors = actors[['name','actor_id','gender','character','actor_popularity','movie_id']]
actors_clean = pd.merge(actors,movie_list[['movie_id', 'title']], on='movie_id', how='left')

directors['movie_id'] = director_movie_id
directors = directors.rename(columns={'id': 'director_id', 'popularity': 'director_popularity'})
directors = directors[['name','director_id','gender','director_popularity','movie_id']]
directors_clean = pd.merge(actors,movie_list[['movie_id', 'title']], on='movie_id', how='left')

studio['movie_id'] = studio_movie_id
studio = studio.drop(columns=['logo_path'])
studio = studio.rename(columns={'id': 'studio_id', 'name':'studio_name'})
studio = pd.merge(studio,movie_list[['movie_id', 'title']], on='movie_id', how='left')

studio_group = studio.groupby('studio_name',sort = False)
studio_group = studio_group['title'].agg(lambda column: ", ".join(column))
studio_group = studio_group.reset_index(name='title')
studio_group = studio_group.rename(columns={'title':'title list'})
studio = studio_group.merge(studio,on='studio_name', how='right')

# Add list of actors to movie_list:
# actor_count = actors_clean.name.value_counts()
actors_group = actors_clean.groupby('movie_id',sort = False)
actor_lists = actors_group['name'].agg(lambda column: ", ".join(column))
actor_lists = actor_lists.reset_index(name='name')

movie_list = movie_list.merge(actor_lists,on='movie_id', how='left')

# Add list of movies to actors:
movie_group = actors_clean.groupby('name',sort = False)
movie_group = movie_group['title'].agg(lambda column: ", ".join(column))
movie_group = movie_group.reset_index(name='title')
movie_group = movie_group.rename(columns={'title':'title list'})
actors_clean = movie_group.merge(actors_clean,on='name', how='right')


In [11]:
id_list = movie_list.movie_id.tolist()
studio = pd.DataFrame()
studio_movie_id = []

for movie_id in id_list:
    movieDetail = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}&language=en-US').json()
    for movie in movieDetail['production_companies']:
        studio = studio.append(movie, ignore_index=True)
        studio_movie_id.append(movie_id)


studio['movie_id'] = studio_movie_id
studio = studio.drop(columns=['logo_path'])
studio = studio.rename(columns={'id': 'studio_id', 'name':'studio_name'})
studio = pd.merge(studio,movie_list[['movie_id', 'title']], on='movie_id', how='left')

studio_group = studio.groupby('studio_name',sort = False)
studio_group = studio_group['title'].agg(lambda column: ", ".join(column))
studio_group = studio_group.reset_index(name='title')
studio_group = studio_group.rename(columns={'title':'title list'})
studio = studio_group.merge(studio,on='studio_name', how='right')

In [12]:
# Count each recurrance of an actor's name and create a 
actor_movie_count = pd.DataFrame({'count': actors_clean.groupby(['name','actor_id'],sort=False).size()}).reset_index()


In [25]:
# Get keywords for each movie.
keywords = pd.DataFrame()
movie_id_keywords = []

for movie_id in id_list:
    movie = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}/keywords?api_key={api_key}').json()
    for key in movie['keywords']:
        keywords = keywords.append(key, ignore_index=True)
        movie_id_keywords.append(movie_id)

keywords['movie_id'] = movie_id_keywords

keywords = keywords.drop(columns=['id'])

keyword_group = keywords.groupby('movie_id',sort = False)
keyword_lists = keyword_group['keywords'].agg(lambda column: ", ".join(column))
keyword_lists = keyword_lists.reset_index(name='keywords')

movie_list = movie_list.merge(keyword_lists,on='movie_id', how='left')

movie_list.head()



Unnamed: 0,Title,Release date,Time Period,Start Time,End Time,Notes on setting,Location,genre_ids,movie_id,original_language,...,title,video,vote_average,vote_count,name_x,Full Location,latitude,longitude,name_y,name
0,Quest for Fire,1981,"80,000 BC",-80000,-80000,"The story is set in Paleolithic Europe, with i...",Europe,"[12, 18]",62204.0,fr,...,Quest for Fire,False,7.1,358.0,"Everett McGill, Ron Perlman, Nicholas Kadi, Ra...","(أوروبا, (51.0, 10.0))",51.0,10.0,"Everett McGill, Ron Perlman, Nicholas Kadi, Ra...","fire, based on novel or book, mammoth, stone a..."
1,The Clan of the Cave Bear,1986,"40,000 - 35,000 BC",-40000,-35000,In times of�Neanderthal extinction,Europe,"[12, 18]",13853.0,en,...,The Clan of the Cave Bear,False,5.1,87.0,"Daryl Hannah, Pamela Reed, James Remar, Thomas...","(أوروبا, (51.0, 10.0))",51.0,10.0,"Daryl Hannah, Pamela Reed, James Remar, Thomas...","stone age, animal attack, tribe, bear, cavemen..."
2,Conan the Destroyer,1984,"32,000 - 10,000 BC",-32000,-10000,,Middle East,"[12, 14, 28]",9610.0,en,...,Conan the Destroyer,False,6.1,1115.0,"Arnold Schwarzenegger, Grace Jones, Wilt Chamb...","(Middle East, Baltimore, Maryland, United Stat...",39.301416,-76.588848,"Arnold Schwarzenegger, Grace Jones, Wilt Chamb...","gladiator, swordplay, fight, sword, magic, war..."
3,Conan the Barbarian,1982,"32,000 - 10,000 BC",-32000,-10000,"Occurs in the pseudo-historical ""Hyborian Age""...",Middle East,"[12, 14, 28]",9387.0,en,...,Conan the Barbarian,False,6.8,1835.0,"Arnold Schwarzenegger, James Earl Jones, Max v...","(Middle East, Baltimore, Maryland, United Stat...",39.301416,-76.588848,"Arnold Schwarzenegger, James Earl Jones, Max v...","gladiator, repayment, fight, mythology, magic,..."
4,Alpha,2018,"20,000 BC",-20000,-20000,Dog domestication,Europe,"[12, 18]",399360.0,en,...,Alpha,False,6.4,2082.0,"Kodi Smit-McPhee, Jóhannes Haukur Jóhannesson,...","(أوروبا, (51.0, 10.0))",51.0,10.0,"Kodi Smit-McPhee, Jóhannes Haukur Jóhannesson,...","wolf, ice age, human animal relationship, wild..."


In [14]:
# Use geopy to get longitute and latitude of movie location.
geocoder = RateLimiter(Nominatim(user_agent='sean.cary62@gmail.com').geocode, min_delay_seconds=1)
movie_list['Full Location'] = movie_list['Location'].apply(geocoder)

movie_list['latitude'] = movie_list['Full Location'].apply(lambda loc: loc.latitude if loc else None)
movie_list['longitude'] = movie_list['Full Location'].apply(lambda loc: loc.longitude if loc else None)
movie_list.head(10)


Unnamed: 0,Title,Release date,Time Period,Start Time,End Time,Notes on setting,Location,genre_ids,movie_id,original_language,overview,popularity,title,video,vote_average,vote_count,name,Full Location,latitude,longitude
0,Quest for Fire,1981,"80,000 BC",-80000,-80000,"The story is set in Paleolithic Europe, with i...",Europe,"[12, 18]",62204.0,fr,A colossal adventure odyssey that turns back t...,11.189,Quest for Fire,False,7.1,358.0,"Everett McGill, Ron Perlman, Nicholas Kadi, Ra...","(أوروبا, (51.0, 10.0))",51.0,10.0
1,The Clan of the Cave Bear,1986,"40,000 - 35,000 BC",-40000,-35000,In times of�Neanderthal extinction,Europe,"[12, 18]",13853.0,en,"Natural changes have the clans moving. Iza, me...",7.461,The Clan of the Cave Bear,False,5.1,87.0,"Daryl Hannah, Pamela Reed, James Remar, Thomas...","(أوروبا, (51.0, 10.0))",51.0,10.0
2,Conan the Destroyer,1984,"32,000 - 10,000 BC",-32000,-10000,,Middle East,"[12, 14, 28]",9610.0,en,Conan is commissioned by the evil queen Tarami...,21.004,Conan the Destroyer,False,6.1,1115.0,"Arnold Schwarzenegger, Grace Jones, Wilt Chamb...","(Middle East, Baltimore, Maryland, United Stat...",39.301416,-76.588848
3,Conan the Barbarian,1982,"32,000 - 10,000 BC",-32000,-10000,"Occurs in the pseudo-historical ""Hyborian Age""...",Middle East,"[12, 14, 28]",9387.0,en,A film adaptation of the classic sword and sor...,28.174,Conan the Barbarian,False,6.8,1835.0,"Arnold Schwarzenegger, James Earl Jones, Max v...","(Middle East, Baltimore, Maryland, United Stat...",39.301416,-76.588848
4,Alpha,2018,"20,000 BC",-20000,-20000,Dog domestication,Europe,"[12, 18]",399360.0,en,"In the prehistoric past, Keda, a young and ine...",44.48,Alpha,False,6.4,2082.0,"Kodi Smit-McPhee, Jóhannes Haukur Jóhannesson,...","(أوروبا, (51.0, 10.0))",51.0,10.0
5,"10,000 BC",2008,"10,000 BC",-10000,-10000,Set in the prehistoric era (12 000 years ago) ...,Europe,"[12, 28, 18, 14]",7840.0,en,A prehistoric epic that follows a young mammot...,26.677,"10,000 BC",False,5.4,2373.0,"Steven Strait, Camilla Belle, Cliff Curtis, Na...","(أوروبا, (51.0, 10.0))",51.0,10.0
6,The Bible: In the Beginning...,1966,approx. 3761 BCE - 1644 BCE (according to 00e ...,-3761,-1644,It recounts the first 22 chapters of the bibli...,Egypt,"[18, 10751]",2525.0,en,Extravagant production of the first part of th...,26.917,The Bible: In the Beginning...,False,7.2,173.0,"Michael Parks, Ulla Bergryd, Richard Harris, F...","(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
7,Gods of Egypt,2016,3200 BC,-3200,-3200,A fantasy action film based on the ancient Egy...,Egypt,"[28, 12, 14]",205584.0,en,A common thief joins a mythical god on a quest...,48.57,Gods of Egypt,False,5.6,3350.0,"Nikolaj Coster-Waldau, Brenton Thwaites, Gerar...","(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
8,The Scorpion King,2002,3200-3000 BC,-3200,-3000,A fantasy action film based on historical king...,Egypt,"[28, 12, 14]",9334.0,en,"In ancient Egypt, peasant Mathayus is hired to...",31.102,The Scorpion King,False,5.5,2485.0,"Dwayne Johnson, Kelly Hu, Michael Clarke Dunca...","(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
9,The Pharaohs' Woman,1960,3100 BC,-3100,-3100,A beautiful girl and a young physician fall in...,Egypt,[],330623.0,en,A beautiful girl and a young physician fall in...,1.341,The Pharaohs' Woman,False,0.0,0.0,,"(مصر, (26.2540493, 29.2675469))",26.254049,29.267547


In [15]:
# Add csv files to cleaned folder
actors_clean.to_csv('resources/cleaned/actors_cleaned.csv', index=False)
directors_clean.to_csv('resources/cleaned/director_cleaned.csv', index=False)
studio.to_csv('resources/cleaned/studio_cleaned.csv', index=False)
actor_movie_count.to_csv('resources/cleaned/actor_movie_count.csv', index=False)

PermissionError: [Errno 13] Permission denied: 'resources/cleaned/actors_cleaned.csv'