# Import Starter Movie List

In [203]:
# import dependencies
import requests
import pandas as pd
from config import api_key
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [204]:
# Read in movies from Wikipedia
wiki_movie_list = pd.read_csv('resources/wiki_movie_list.csv')
wiki_movie_list.head()

wiki_movie_list.drop(columns=['Time Period', 'Diff', 'Notes on setting'])

Unnamed: 0,Title,Release date,Time Period,Start Time,End Time,Diff,Notes on setting,Location
0,Beatrice Cenci,1909,1577-1599,1577,1599,22,About the life events of�Beatrice Cenci.,Rome
1,A Tale of Two Cities,1911,1755-1792,1755,1792,37,England and�France�prior and during the�French...,France
2,K?chiyama to Nao-zamurai,1916,0,0,0,0,Based on the play of the same name.,Japan
3,Cleopatra,1917,48-30 BC,-48,-30,18,Egypt�and�Rome�� follows her relationships fir...,Egypt
4,Teodora,1921,500-548,500,548,48,Byzantine�empress�Theodora,Istanbul


### Get Additional Data from TMDB
TMDB is a community built Movie and TV Database.

In [205]:
# Create list of titles to cycle through TMDB API.
title_list = wiki_movie_list.Title
tmdb_movie_list = pd.DataFrame()
new_titles = []

# Transform titles for query
for ind in wiki_movie_list.index:
    try:
        title = wiki_movie_list['Title'][ind]
        year = wiki_movie_list['Release date'][ind]

        # Remove anything in parenthesis and brackets
        start_1 = title.find('(')
        end_1 = title.find(')')+1

        start_2 = title.find('[')
        end_2 = title.find(']')+1
        
        sub_1=title[start_1:end_1]
        sub_2=title[start_2:end_2]

        title = title.replace(sub_1,'')
        title = title.replace(sub_2,'')
        new_titles.append(title)

        # Replace spaces with '+'
        title = title.replace(' ','+')

        #Query TMDB data base for movies on wiki list
        movie = pd.DataFrame(requests.get(f'https://api.themoviedb.org/3/search/movie?api_key={api_key}&year={year}&query={title}').json()['results'])

        tmdb_movie_list = tmdb_movie_list.append(movie)
    except:
        print(f'error on index#: {ind}')
    # Track Progress
    if ind % 100 == 0:
        print(f'movies searched: {ind}')

# Clean new DF
tmdb_movie_list = tmdb_movie_list.drop_duplicates(subset=['title','release_date'])
tmdb_movie_list['release_year'] = tmdb_movie_list['release_date'].astype(str).str[0:4].astype(int)

# Combine Wiki DF with TMDB DF
movie_list = wiki_movie_list.merge(tmdb_movie_list, how='left', left_on=['Title','Release date'], right_on=['title','release_year'])
movie_list.drop(columns=['Diff', 'adult', 'backdrop_path', 'original_title', 'poster_path', 'release_date', 'release_year'], inplace=True)
movie_list.dropna(subset=['id','Start Time'], inplace=True)
movie_list.reset_index(drop=True, inplace= True)
movie_list.rename(columns={'id':'movie_id'}, inplace=True)
movie_list.sort_values('Start Time', inplace=True)
movie_list['movie_id'] = movie_list.movie_id.astype(int)


movies searched: 0
movies searched: 100
movies searched: 200
movies searched: 300
movies searched: 400
movies searched: 500
movies searched: 600
movies searched: 700
movies searched: 800
movies searched: 900
movies searched: 1000
movies searched: 1100


### Create DFs for Actors, Directors, and Studios

In [207]:
# Code from The-Final-Project_F-PALS was refractored for this project.
# The code below will make an API call based on the movie list to get the leading actors and directors of each film.

# Create list of movie ids that will be used to make API calls for additional information
id_list = movie_list.movie_id.tolist()

# Create blank DFs and lists
actors_df = pd.DataFrame()
directors_df = pd.DataFrame()
studio_df = pd.DataFrame()
actor_movie_id = []
director_movie_id = []
studio_movie_id = []

# Make API calls for movie_id to get the actors, directors, and studios for each film
for movie_id in id_list:
    movie_credits = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={api_key}&language=en-US').json()

    for actor in movie_credits['cast']:
        actors_df = actors_df.append(actor, ignore_index=True)
        actor_movie_id.append(movie_id)

    for director in movie_credits['crew']:
        if director['job'] == "Director":
            directors_df = directors_df.append(director, ignore_index=True)
            director_movie_id.append(movie_id)

    movie_studios = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}&language=en-US').json()
    for studio in movie_studios['production_companies']:
        studio_df = studio_df.append(studio, ignore_index=True)
        studio_movie_id.append(movie_id)

# Clean the new DFs
actors_df["movie_id"] = actor_movie_id
directors_df['movie_id'] = director_movie_id
actors = actors_df.rename(columns = {"id": "actor_id",'popularity': 'actor_popularity'})
actors = actors[['name','actor_id','gender','character','actor_popularity','movie_id']]
actors_clean = pd.merge(actors,movie_list[['movie_id', 'title']], on='movie_id', how='left')

directors_df['movie_id'] = director_movie_id
directors = directors_df.rename(columns={'id': 'director_id', 'popularity': 'director_popularity'})
directors = directors[['name','director_id','gender','director_popularity','movie_id']]
directors_clean = pd.merge(actors,movie_list[['movie_id', 'title']], on='movie_id', how='left')

studio_df['movie_id'] = studio_movie_id
studio = studio_df.drop(columns=['logo_path'])
studio = studio.rename(columns={'id': 'studio_id', 'name':'studio_name'})
studio = pd.merge(studio,movie_list[['movie_id', 'title']], on='movie_id', how='left')

studio_group = studio.groupby('studio_name',sort = False)
studio_group = studio_group['title'].agg(lambda column: ", ".join(column))
studio_group = studio_group.reset_index(name='title')
studio_group = studio_group.rename(columns={'title':'title list'})
studio_clean = studio_group.merge(studio,on='studio_name', how='right')

# Add list of actors to movie_list
actors_group = actors_clean.groupby('movie_id',sort = False)
actor_lists = actors_group['name'].agg(lambda column: ", ".join(column))
actor_lists = actor_lists.reset_index(name='name')

movie_list = movie_list.merge(actor_lists,on='movie_id', how='left')

# Add list of movies to actors
movie_group = actors_clean.groupby('name',sort = False)
movie_group = movie_group['title'].agg(lambda column: ", ".join(column))
movie_group = movie_group.reset_index(name='title')
movie_group = movie_group.rename(columns={'title':'title list'})
actors_clean = movie_group.merge(actors_clean,on='name', how='right')

# Count each recurrance of an actor's name and create a DF.
actor_movie_count = pd.DataFrame({'count': actors_clean.groupby(['name','actor_id'],sort=False).size()}).reset_index()



#### Get Keywords for Each Movie

In [208]:
# Get keywords for each movie, create new DF
keywords = pd.DataFrame()
movie_id_keywords = []

for movie_id in id_list:
    movie = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}/keywords?api_key={api_key}').json()
    for key in movie['keywords']:
        keywords = keywords.append(key, ignore_index=True)
        movie_id_keywords.append(movie_id)

keywords['movie_id'] = movie_id_keywords
keywords['movie_id'] = keywords['movie_id'].astype(int)

keywords = keywords.rename(columns={'name':'keywords'})
keywords = keywords.drop(columns=['id'])

# Add list of keywords to movie_list by movie id.
keyword_group = keywords.groupby('movie_id',sort = False)
keyword_lists = keyword_group['keywords'].agg(lambda column: ", ".join(column))
keyword_lists = keyword_lists.reset_index(name='keywords')

movie_list = movie_list.merge(keyword_lists,on='movie_id', how='left')


#### Get Genres for Each Movie

In [209]:

# Get genre list from TMDB 
genre = requests.get(f'https://api.themoviedb.org/3/genre/movie/list?api_key={api_key}&language=en-US').json()['genres']
genre_df = pd.DataFrame(genre)

genre_df.set_index(genre_df['id'],inplace = True)
genre_df = genre_df.drop('id',1)

genre_title = []

# Get genre names from genre_id column
for ls in movie_list.genre_ids:
    holder = []
    for id in ls:
        holder.append(genre_df.loc[id]['name'])
    genre_title.append(holder)

movie_list['genres'] = genre_title

# Make a DF that lists genres for each movie_id
genre_series = []
movie_series = []

for i in range(len(movie_list)-1):
    try:
        genres = movie_list.loc[i, 'genres']
        genres_list = (','.join(genres))
        for genre in genre_df['name']:
            if genre in genres_list:
                id = movie_list.loc[i, 'movie_id']
                movie_series.append(id)
                genre_series.append(genre)
                i=i+1
    except:
        print(f'error at {i}')

movie_genres =  pd.DataFrame({'movie_id':movie_series, 'genre':genre_series})        



error at 729
error at 729


### Use Geopy to Get Coordinates for Movie Location

In [210]:
# Use geopy to get longitute and latitude of movie location.
geocoder = RateLimiter(Nominatim(user_agent='sean.cary62@gmail.com').geocode, min_delay_seconds=1)
movie_list['Full Location'] = movie_list['Location'].apply(geocoder)

movie_list['latitude'] = movie_list['Full Location'].apply(lambda loc: loc.latitude if loc else None)
movie_list['longitude'] = movie_list['Full Location'].apply(lambda loc: loc.longitude if loc else None)
movie_list.head(10)


Unnamed: 0,Title,Release date,Time Period,Start Time,End Time,Notes on setting,Location,genre_ids,movie_id,original_language,...,title,video,vote_average,vote_count,name,keywords,genres,Full Location,latitude,longitude
0,Quest for Fire,1981,"80,000 BC",-80000,-80000,"The story is set in Paleolithic Europe, with i...",Europe,"[12, 18]",62204,fr,...,Quest for Fire,False,7.1,359.0,"Everett McGill, Ron Perlman, Nicholas Kadi, Ra...","fire, based on novel or book, mammoth, stone a...","[Adventure, Drama]","(أوروبا, (51.0, 10.0))",51.0,10.0
1,The Clan of the Cave Bear,1986,"40,000 - 35,000 BC",-40000,-35000,In times of�Neanderthal extinction,Europe,"[12, 18]",13853,en,...,The Clan of the Cave Bear,False,5.1,87.0,"Daryl Hannah, Pamela Reed, James Remar, Thomas...","stone age, animal attack, tribe, bear, cavemen...","[Adventure, Drama]","(أوروبا, (51.0, 10.0))",51.0,10.0
2,Conan the Barbarian,1982,"32,000 - 10,000 BC",-32000,-10000,"Occurs in the pseudo-historical ""Hyborian Age""...",Middle East,"[12, 14, 28]",9387,en,...,Conan the Barbarian,False,6.8,1836.0,"Arnold Schwarzenegger, James Earl Jones, Max v...","gladiator, repayment, fight, mythology, magic,...","[Adventure, Fantasy, Action]","(Middle East, Baltimore, Maryland, United Stat...",39.301416,-76.588848
3,Conan the Destroyer,1984,"32,000 - 10,000 BC",-32000,-10000,,Middle East,"[12, 14, 28]",9610,en,...,Conan the Destroyer,False,6.1,1115.0,"Arnold Schwarzenegger, Grace Jones, Wilt Chamb...","gladiator, swordplay, fight, sword, magic, war...","[Adventure, Fantasy, Action]","(Middle East, Baltimore, Maryland, United Stat...",39.301416,-76.588848
4,Alpha,2018,"20,000 BC",-20000,-20000,Dog domestication,Europe,"[12, 18]",399360,en,...,Alpha,False,6.4,2083.0,"Kodi Smit-McPhee, Jóhannes Haukur Jóhannesson,...","wolf, ice age, human animal relationship, wild...","[Adventure, Drama]","(أوروبا, (51.0, 10.0))",51.0,10.0
5,"10,000 BC",2008,"10,000 BC",-10000,-10000,Set in the prehistoric era (12 000 years ago) ...,Europe,"[12, 28, 18, 14]",7840,en,...,"10,000 BC",False,5.4,2374.0,"Steven Strait, Camilla Belle, Cliff Curtis, Na...","hunter, indigenous, lover (female), pyramid, m...","[Adventure, Action, Drama, Fantasy]","(أوروبا, (51.0, 10.0))",51.0,10.0
6,The Bible: In the Beginning...,1966,approx. 3761 BCE - 1644 BCE (according to 00e ...,-3761,-1644,It recounts the first 22 chapters of the bibli...,Egypt,"[18, 10751]",2525,en,...,The Bible: In the Beginning...,False,7.2,173.0,"Michael Parks, Ulla Bergryd, Richard Harris, F...","genesis, bible, tower of babel, epic, flood, o...","[Drama, Family]","(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
7,The Scorpion King,2002,3200-3000 BC,-3200,-3000,A fantasy action film based on historical king...,Egypt,"[28, 12, 14]",9334,en,...,The Scorpion King,False,5.5,2486.0,"Dwayne Johnson, Kelly Hu, Michael Clarke Dunca...","egypt, temple, magic, sword fight, battlefield...","[Action, Adventure, Fantasy]","(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
8,Gods of Egypt,2016,3200 BC,-3200,-3200,A fantasy action film based on the ancient Egy...,Egypt,"[28, 12, 14]",205584,en,...,Gods of Egypt,False,5.6,3355.0,"Nikolaj Coster-Waldau, Brenton Thwaites, Gerar...","egypt, underworld, fight, mythology, nile, thi...","[Action, Adventure, Fantasy]","(مصر, (26.2540493, 29.2675469))",26.254049,29.267547
9,The Pharaohs' Woman,1960,3100 BC,-3100,-3100,A beautiful girl and a young physician fall in...,Egypt,[],330623,en,...,The Pharaohs' Woman,False,0.0,0.0,,,[],"(مصر, (26.2540493, 29.2675469))",26.254049,29.267547


In [211]:
# Add csv files to cleaned folder
keywords.to_csv('resources/cleaned/keywords.csv', index=False)
actors_clean.to_csv('resources/cleaned/actors_cleaned.csv', index=False)
directors_clean.to_csv('resources/cleaned/director_cleaned.csv', index=False)
studio.to_csv('resources/cleaned/studio_cleaned.csv', index=False)
actor_movie_count.to_csv('resources/cleaned/actor_movie_count.csv', index=False)
movie_list.to_csv('resources/cleaned/movie_list_cleaned.csv', index=False)
movie_genres.to_csv('resources/cleaned/movie_genres.csv', index=False)