In [6]:
# import dependencies
import requests
import pandas as pd
import numpy as np
from config import api_key
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [7]:
# Read in movies from Wikipedia
wiki_movie_list = pd.read_csv('resources/wiki_movie_list.csv')
wiki_movie_list.head()

wiki_movie_list.drop(columns=['Diff', 'Notes on setting'], inplace=True)

### Get Additional Data from TMDB
TMDB is a community built Movie and TV Database.

In [8]:
# Create list of titles to cycle through TMDB API.
title_list = wiki_movie_list.Title
tmdb_movie_list = pd.DataFrame()
new_titles = []

# Transform titles for query

for i in wiki_movie_list.index:
#for i in range(0,30):
    try:
        title = wiki_movie_list['Title'][i]
        year = wiki_movie_list['Release date'][i]

        # Remove anything in parenthesis and brackets
        start_1 = title.find('(')
        end_1 = title.find(')')+1

        start_2 = title.find('[')
        end_2 = title.find(']')+1
        
        sub_1=title[start_1:end_1]
        sub_2=title[start_2:end_2]

        title = title.replace(sub_1,'')
        title = title.replace(sub_2,'')
        new_titles.append(title)

        
       # Replace spaces with '+'
        title = title.replace(' ','+')

        # Query TMDB data base for movies on wiki list
        movie = pd.DataFrame(requests.get(f'https://api.themoviedb.org/3/search/movie?api_key={api_key}&year={year}&query={title}').json()['results'])

        tmdb_movie_list = tmdb_movie_list.append(movie)
    except:
        print(f'error on index#: {i}')
    # Track Progress
    if i % 100 == 0:
        print(f'movies searched: {i}')

# Clean new DF
# tmdb_movie_list = tmdb_movie_list.drop_duplicates(subset=['title','release_date'])
tmdb_movie_list['release_year'] = tmdb_movie_list['release_date'].astype(str).str[0:4].astype(int)

# Combine Wiki DF with TMDB DF
movie_list = wiki_movie_list.merge(tmdb_movie_list, how='left', left_on=['Title','Release date'], right_on=['title','release_year'])
movie_list.drop(columns=['adult', 'backdrop_path', 'original_title', 'poster_path', 'release_date', 'release_year'], inplace=True)
movie_list = movie_list.loc[~(movie_list['Start Time'] == 0)]
movie_list.dropna(subset=['id'], inplace=True)
movie_list.reset_index(drop=True, inplace= True)
movie_list.rename(columns={'id':'movie_id'}, inplace=True)
movie_list.sort_values('Release date', inplace=True)
movie_list.sort_values('Start Time', inplace=True)
movie_list['movie_id'] = movie_list.movie_id.astype(int)
movie_list.head()


movies searched: 0
movies searched: 100
movies searched: 200
movies searched: 300
movies searched: 400
movies searched: 500
movies searched: 600
movies searched: 700
movies searched: 800
movies searched: 900
movies searched: 1000
movies searched: 1100


Unnamed: 0,Title,Release date,Time Period,Start Time,End Time,Location,genre_ids,movie_id,original_language,overview,popularity,title,video,vote_average,vote_count
450,Quest for Fire,1981,"80,000 BC",-80000,-80000,Europe,"[12, 18]",62204,fr,A colossal adventure odyssey that turns back t...,9.704,Quest for Fire,False,7.1,363.0
0,The Clan of the Cave Bear,1986,"40,000 - 35,000 BC",-40000,-40000,Europe,"[12, 18]",13853,en,"Natural changes have the clans moving. Iza, me...",7.897,The Clan of the Cave Bear,False,5.1,88.0
618,Alpha,2018,"20,000 BC",-20000,-20000,Europe,"[12, 18]",399360,en,"In the prehistoric past, Keda, a young and ine...",37.783,Alpha,False,6.4,2105.0
564,"10,000 BC",2008,"10,000 BC",-10000,-10000,Europe,"[12, 28, 18, 14]",7840,en,A prehistoric epic that follows a young mammot...,25.276,"10,000 BC",False,5.4,2396.0
607,Gods of Egypt,2016,3200 BC,-3200,-3200,Egypt,"[28, 12, 14]",205584,en,A common thief joins a mythical god on a quest...,51.141,Gods of Egypt,False,5.6,3378.0


In [9]:
# Get genre list from TMDB 
genre = requests.get(f'https://api.themoviedb.org/3/genre/movie/list?api_key={api_key}&language=en-US').json()['genres']
genre_df = pd.DataFrame(genre)

genre_df.set_index(genre_df['id'],inplace = True)
genre_df = genre_df.drop('id',1)

genre_title = []

# Get genre names from genre_id column
for ls in movie_list.genre_ids:
    holder = []
    for id in ls:
        holder.append(genre_df.loc[id]['name'])
    genre_title.append(holder)

movie_list['genres'] = genre_title

# Make a DF that lists genres for each movie_id
genre_series = []
movie_series = []

for i in range(len(movie_list)-1):
    try:
        genres = movie_list.loc[i, 'genres']
        genres_list = (','.join(genres))
        for genre in genre_df['name']:
            if genre in genres_list:
                id = movie_list.loc[i, 'movie_id']
                movie_series.append(id)
                genre_series.append(genre)
                i=i+1
    except:
        print(f'error at {i}')

movie_genres =  pd.DataFrame({'movie_id': movie_series, 'genre': genre_series})

error at 622


In [10]:
movie_list.to_csv('resources/cleaned/tmdb_movie_list.csv', index=False)
movie_genres.to_csv('resources/cleaned/movie_genres.csv', index=False)