In [12]:
# import dependencies
import requests
import pandas as pd
from config import api_key

In [15]:
# Load the wiki movie list CSV and preview its first five rows.
wiki_movie_list = pd.read_csv(filepath_or_buffer='resources/wiki_movie_list.csv')
wiki_movie_list.head(n=5)

Unnamed: 0,Title,Release date,Time Period,Start Time,End Time,Diff,Notes on setting,Location
0,Beatrice Cenci,1909,1577-1599,1577,1599,22,About the life events of�Beatrice Cenci.,Rome
1,A Tale of Two Cities,1911,1755-1792,1755,1792,37,England and�France�prior and during the�French...,France
2,K?chiyama to Nao-zamurai,1916,0,0,0,0,Based on the play of the same name.,Japan
3,Cleopatra,1917,48-30 BC,-48,-30,18,Egypt�and�Rome�� follows her relationships fir...,Egypt
4,Teodora,1921,500-548,500,548,48,Byzantine�empress�Theodora,Istanbul


In [53]:
# Search the TMDB API for every title in wiki_movie_list, then merge the
# matches back onto the wiki rows (matching on title and release year).
title_list = wiki_movie_list.Title
new_titles = []
# One DataFrame of search hits per title; concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
search_results = []

# Transform titles for query
for ind in wiki_movie_list.index:
    try:
        title = wiki_movie_list['Title'][ind]
        year = wiki_movie_list['Release date'][ind]

        # Remove the first "(...)" span and the first "[...]" span for the
        # database query. When a delimiter is missing, find() returns -1 and
        # the resulting slice is empty, so the title is left unchanged.
        start_1 = title.find('(')
        end_1 = title.find(')') + 1

        start_2 = title.find('[')
        end_2 = title.find(']') + 1

        sub_1 = title[start_1:end_1]
        sub_2 = title[start_2:end_2]

        title = title.replace(sub_1, '')
        title = title.replace(sub_2, '')
        new_titles.append(title)

        # Query TMDB for movies on the wiki list. Passing `params` lets
        # requests URL-encode the title (spaces, '&', '#', ...) instead of
        # splicing it raw into the query string.
        response = requests.get(
            'https://api.themoviedb.org/3/search/movie',
            params={'api_key': api_key, 'year': str(year), 'query': title},
            timeout=30,
        )
        response.raise_for_status()
        movie = pd.DataFrame(response.json()['results'])

        if not movie.empty:
            search_results.append(movie)
    except Exception as exc:
        # Best-effort: skip this title and keep going. Only the exception
        # type is printed because requests error messages can contain the
        # full URL, including the API key.
        print(f'error on index#: {ind} ({type(exc).__name__})')

    if ind % 100 == 0:
        print(f'movies searched: {ind}')

tmdb_movie_list = pd.concat(search_results) if search_results else pd.DataFrame()

# Derive an integer release year from the leading four characters of
# release_date. Hits with an empty or unparsable date are dropped here;
# they could not match the year-based merge key below anyway.
tmdb_movie_list = (
    tmdb_movie_list
    .drop_duplicates(subset=['title', 'release_date'])
    .assign(
        release_year=lambda hits: pd.to_numeric(
            hits['release_date'].astype(str).str[0:4], errors='coerce'
        )
    )
    .dropna(subset=['release_year'])
    .astype({'release_year': int})
)

# NOTE(review): the merge keys on the raw wiki 'Title', not the cleaned title
# stored in new_titles, so titles that contained "(...)" or "[...]" probably
# never match a TMDB row - confirm whether that is intended.
movie_list = wiki_movie_list.merge(
    tmdb_movie_list,
    how='left',
    left_on=['Title', 'Release date'],
    right_on=['title', 'release_year'],
)
movie_list = movie_list.drop(columns=['Diff', 'adult', 'backdrop_path', 'original_title', 'poster_path', 'release_date', 'release_year'])
# Keep only wiki rows that found a TMDB match and have a start time.
movie_list = movie_list.dropna(subset=['id', 'Start Time'])



movies searched: 0
movies searched: 100
movies searched: 200
movies searched: 300
movies searched: 400
movies searched: 500
movies searched: 600
movies searched: 700
movies searched: 800
movies searched: 900
movies searched: 1000
movies searched: 1100


Unnamed: 0,Title,Release date,Time Period,Start Time,End Time,Notes on setting,Location,genre_ids,id,original_language,overview,popularity,title,video,vote_average,vote_count
1,A Tale of Two Cities,1911,1755-1792,1755,1792,England and�France�prior and during the�French...,France,[],233095.0,en,A condensed silent film version of the Charles...,1.4,A Tale of Two Cities,False,6.0,1.0
3,Cleopatra,1917,48-30 BC,-48,-30,Egypt�and�Rome�� follows her relationships fir...,Egypt,"[18, 36]",39950.0,en,"The story of Cleopatra, the fabulous queen of ...",1.513,Cleopatra,False,5.5,2.0
6,Richard the Lion-Hearted,1923,1190,1190,1190,based on�Sir Walter Scott's�The Talisman�and s...,Levant,"[18, 12, 36]",727288.0,en,Wallace Beery repeats his role of King Richard...,0.627,Richard the Lion-Hearted,False,0.0,0.0
12,The Viking,1928,1000,1000,1000,Leif Ericson�and the expedition to North America,North America,"[28, 12, 36]",164453.0,en,In this historical adventure based on traditio...,1.162,The Viking,False,6.2,6.0
14,Wara Wara,1930,1535,1535,1535,Romance set at the time of the�Spanish conques...,Bolivia,"[18, 10749]",396280.0,ay,"Set in the 16th century, a peaceful Inca commu...",0.6,Wara Wara,False,5.6,8.0


In [55]:
movie_list = movie_list.rename(columns={'id': 'movie_id'})
movie_list = movie_list.sort_values('Start Time')

# Code from The-Final-Project_F-PALS, Popular_Crew.ipynb, refactored for this project.
# The code below makes one TMDB credits call per movie in movie_list to collect
# the cast members and the crew members whose job is "Director".

# Create list of movie ids that will be used to make API calls for additional information
id_list = movie_list.movie_id.tolist()

# Rows are collected in plain lists and turned into DataFrames once after the
# loop (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
actor_records = []
director_records = []
id_for_movie = []
director_movie_id = []

for movie_id in id_list:
    # movie_id is a float after the earlier left merge (e.g. 233095.0);
    # cast it so the URL path carries a plain integer id.
    response = requests.get(
        f'https://api.themoviedb.org/3/movie/{int(movie_id)}/credits',
        params={'api_key': api_key, 'language': 'en-US'},
        timeout=30,
    )
    movie = response.json()

    if 'cast' not in movie or 'crew' not in movie:
        # The payload has no credits (e.g. an API error body); skip this movie.
        print(f'no credits returned for movie id: {movie_id}')
        continue

    for actor in movie['cast']:
        actor_records.append(actor)
        id_for_movie.append(movie_id)

    for director in movie['crew']:
        if director['job'] == "Director":
            director_records.append(director)
            director_movie_id.append(movie_id)

actors = pd.DataFrame(actor_records)
directors = pd.DataFrame(director_records)

# Tag every credit row with the movie it came from (lists are row-aligned).
actors["movie_id"] = id_for_movie
directors['movie_id'] = director_movie_id

actors = actors.rename(columns={"id": "actor_id", 'popularity': 'actor_popularity'})
actors = actors[['name', 'actor_id', 'gender', 'character', 'actor_popularity', 'movie_id']]
actors_clean = pd.merge(actors, movie_list[['movie_id', 'title']], on='movie_id', how='left')

directors = directors.rename(columns={'id': 'director_id', 'popularity': 'director_popularity'})
director_clean = directors[['name', 'director_id', 'gender', 'director_popularity', 'movie_id']]

# Add list of actors to movie_list: one comma-separated 'name' string per movie
actor_count = actors_clean.name.value_counts()
actors_group = actors_clean.groupby('movie_id', sort=False)
actor_lists = actors_group['name'].agg(lambda column: ", ".join(column))
actor_lists = actor_lists.reset_index(name='name')

movie_list = movie_list.merge(actor_lists, on='movie_id', how='left')

# Add list of movies to actors: one comma-separated 'title list' string per actor name
movie_group = actors_clean.groupby('name', sort=False)
movie_group = movie_group['title'].agg(lambda column: ", ".join(column))
movie_group = movie_group.reset_index(name='title')
movie_group = movie_group.rename(columns={'title': 'title list'})
actors_clean = movie_group.merge(actors_clean, on='name', how='right')


In [56]:
# Fetch the TMDB details of every movie in movie_list and collect its
# production companies, one row per (movie, company) pair.
id_list = movie_list.movie_id.tolist()
# Rows are collected in a list and turned into a DataFrame once after the
# loop (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
studio_records = []
id_for_movie = []

for movie_id in id_list:
    # movie_id is a float after the earlier left merge; cast it so the URL
    # path carries a plain integer id.
    response = requests.get(
        f'https://api.themoviedb.org/3/movie/{int(movie_id)}',
        params={'api_key': api_key, 'language': 'en-US'},
        timeout=30,
    )
    movieDetail = response.json()

    if 'production_companies' not in movieDetail:
        # The payload has no company list (e.g. an API error body); skip it.
        print(f'no production companies returned for movie id: {movie_id}')
        continue

    for company in movieDetail['production_companies']:
        studio_records.append(company)
        id_for_movie.append(movie_id)


studio = pd.DataFrame(studio_records)
# Tag every company row with the movie it came from (lists are row-aligned).
studio['movie_id'] = id_for_movie
studio = studio.drop(columns=['logo_path'])
studio = studio.rename(columns={'id': 'studio_id', 'name': 'studio_name'})
studio = pd.merge(studio, movie_list[['movie_id', 'title']], on='movie_id', how='left')

# Add a comma-separated 'title list' string per studio name to every studio row
studio_group = studio.groupby('studio_name', sort=False)
studio_group = studio_group['title'].agg(lambda column: ", ".join(column))
studio_group = studio_group.reset_index(name='title')
studio_group = studio_group.rename(columns={'title': 'title list'})
studio = studio_group.merge(studio, on='studio_name', how='right')

In [57]:
# Write the actor, director and studio tables to the cleaned folder (no index column)
for frame, csv_path in (
    (actors_clean, 'resources/cleaned/actors_cleaned.csv'),
    (director_clean, 'resources/cleaned/director_cleaned.csv'),
    (studio, 'resources/cleaned/studio_cleaned.csv'),
):
    frame.to_csv(csv_path, index=False)

In [58]:
# Count the rows of actors_clean for each (name, actor_id) pair, keeping
# first-seen order, and save the counts alongside the other cleaned files.
actor_movie_count = (
    actors_clean
    .groupby(['name', 'actor_id'], sort=False)
    .size()
    .reset_index(name='count')
)
actor_movie_count.to_csv(path_or_buf='resources/cleaned/actor_movie_count.csv', index=False)