In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
import requests
from pprint import pprint
import matplotlib.pyplot as plt
import time

# Access key 
from config import api_key

In [2]:
# Define API base url and targetted years of movies data
url1 = "https://api.themoviedb.org/3/discover/movie?"
year_data = [2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018]

In [3]:
# We should call two different url's under this API to pull the full movie data information 
#     1. To Retrieve Movie ID list 
#     2. Using Movie ID list pull the important factors information that defines movie success like(Revenue, Budget..etc.)
# Calling below url to pull movie id information,Check for response and print the response whether the data is available or not
# Retrieving data for 12 years and 10 pages of information for each year. This API gives 20 results per page. 

movies_id_data = []
page_num = 1

for year in year_data:
    
    print("Retrieving Data for : " + str(year) )
    
    for page_num in range(1,11):
        response = requests.get(f"{url1}api_key={api_key}&primary_release_year={year}&page={page_num}")
        print(response.url)
        movies = response.json()
       #movies_data.append(response.json())
        page_num = page_num + 1
               
        for movie in movies['results']:
            movies_id_data.append(movie['id'])
            
# Sleep time to call each year data is 5 secs
    time.sleep(5)


Retrieving Data for : 2007
https://api.themoviedb.org/3/discover/movie?api_key=f63b434c2eae10c869016441dc039f7b&primary_release_year=2007&page=1
https://api.themoviedb.org/3/discover/movie?api_key=f63b434c2eae10c869016441dc039f7b&primary_release_year=2007&page=2
https://api.themoviedb.org/3/discover/movie?api_key=f63b434c2eae10c869016441dc039f7b&primary_release_year=2007&page=3
https://api.themoviedb.org/3/discover/movie?api_key=f63b434c2eae10c869016441dc039f7b&primary_release_year=2007&page=4
https://api.themoviedb.org/3/discover/movie?api_key=f63b434c2eae10c869016441dc039f7b&primary_release_year=2007&page=5
https://api.themoviedb.org/3/discover/movie?api_key=f63b434c2eae10c869016441dc039f7b&primary_release_year=2007&page=6
https://api.themoviedb.org/3/discover/movie?api_key=f63b434c2eae10c869016441dc039f7b&primary_release_year=2007&page=7
https://api.themoviedb.org/3/discover/movie?api_key=f63b434c2eae10c869016441dc039f7b&primary_release_year=2007&page=8
https://api.themoviedb.org/3/

https://api.themoviedb.org/3/discover/movie?api_key=f63b434c2eae10c869016441dc039f7b&primary_release_year=2013&page=9
https://api.themoviedb.org/3/discover/movie?api_key=f63b434c2eae10c869016441dc039f7b&primary_release_year=2013&page=10
Retrieving Data for : 2014
https://api.themoviedb.org/3/discover/movie?api_key=f63b434c2eae10c869016441dc039f7b&primary_release_year=2014&page=1
https://api.themoviedb.org/3/discover/movie?api_key=f63b434c2eae10c869016441dc039f7b&primary_release_year=2014&page=2
https://api.themoviedb.org/3/discover/movie?api_key=f63b434c2eae10c869016441dc039f7b&primary_release_year=2014&page=3
https://api.themoviedb.org/3/discover/movie?api_key=f63b434c2eae10c869016441dc039f7b&primary_release_year=2014&page=4
https://api.themoviedb.org/3/discover/movie?api_key=f63b434c2eae10c869016441dc039f7b&primary_release_year=2014&page=5
https://api.themoviedb.org/3/discover/movie?api_key=f63b434c2eae10c869016441dc039f7b&primary_release_year=2014&page=6
https://api.themoviedb.org/3

In [4]:
#Total movie id list pulled from API
print(len(movies_id_data))

2400


In [5]:
# Using movie id list, retrieving full movie details from the TMDb API
url2 = "https://api.themoviedb.org/3/movie/"
movies_details = []

for mve_id in movies_id_data:
    
    response2 = requests.get(f"{url2}{mve_id}?api_key={api_key}")
    #print(response2.url)
    movies_details.append(response2.json())
        
#pprint(movies_details)

In [6]:
# Pull the required data from the reponse received
movie_id1 = []
movie_title = []
budget = []
revenue = []
release_date = []
genre = []
production_comp = []
overview = []

def write_null(v1): 
         
    if v1 == movie_id1:
        movie_id1.append("NaN")
    if v1 == movie_title:
        movie_title.append("NaN")
    if v1 == budget:
        budget.append("NaN")
    if v1 == revenue:
        revenue.append("NaN")
    if v1 == genre:
        genre.append("NaN")
    if v1 == overview:
        overview.append("NaN")
    if v1 == production_comp:
        production_comp.append("NaN")
    if v1 == release_date:
        release_date.append("NaN")
       


for detail in movies_details:
      
    try:
        movie_id1.append(detail["id"])
    except:
        write_null(movie_id1)
    try:
        movie_title.append(detail["title"])
    except:
        write_null(movie_title)
    try:
        budget.append(detail["budget"])
    except:
        write_null(budget)
    try:
        revenue.append(detail["revenue"])
    except:
        write_null(revenue)
    try:
        genre.append(detail["genres"][0]["name"])
    except:
        write_null(genre)
    try:
        overview.append(detail["overview"])
    except:
        write_null(overview)  
    try:
        production_comp.append(detail["production_companies"][0]["name"])
    except:
        write_null(production_comp)
    try:
        release_date.append(detail["release_date"])
    except:
        write_null(release_date)


In [7]:
# Check all data list have same lenght which will be used before creating dataframe
print(len(movie_id1))
print(len(movie_title))
print(len(budget))
print(len(revenue))
print(len(genre))
print(len(overview))
print(len(release_date))

2400
2400
2400
2400
2400
2400
2400


In [8]:
# Built the data frame 
movie_df_frame = pd.DataFrame({"Movie ID":movie_id1,"Title": movie_title, "Budget":budget,"Overview":overview,
                               "Revenue": revenue, "Genre": genre, "Production Company": production_comp, "Release Date": release_date},
                                 columns = ["Title", "Movie ID","Genre","Production Company","Budget", "Revenue", "Overview", "Release Date"])
movie_df_frame.to_csv("santhosh.csv", index =False)
movie_df_frame.head()


Unnamed: 0,Title,Movie ID,Genre,Production Company,Budget,Revenue,Overview,Release Date
0,Pirates of the Caribbean: At World's End,285,Adventure,Jerry Bruckheimer Films,300000000,961000000,"Captain Barbossa, long believed to be dead, ha...",2007-05-19
1,Harry Potter and the Order of the Phoenix,675,Adventure,Warner Bros. Pictures,150000000,938212738,Returning for his fifth year of study at Hogwa...,2007-06-28
2,Russian Lolita,329103,Romance,Eros Movie,0,0,"The action of a controversial novel ""Lolita"", ...",2007-01-03
3,Spider-Man 3,559,Fantasy,Marvel Enterprises,258000000,890871626,The seemingly invincible Spider-Man goes up ag...,2007-05-01
4,Live Free or Die Hard,1571,Action,Dune Entertainment,110000000,383531464,"John McClane is back and badder than ever, and...",2007-06-20


In [9]:
# Cleaning the data by removing all the Nan values and zeroes on Revenue and Budget.
clean_movie_df = movie_df_frame.loc[(movie_df_frame["Title"] != 'NaN') & (movie_df_frame["Movie ID"] != 'NaN') &  (movie_df_frame["Genre"] != 'NaN') &  (movie_df_frame["Production Company"] != 'NaN') & (movie_df_frame["Budget"] != 'NaN') & (movie_df_frame["Revenue"] != 'NaN') & (movie_df_frame["Overview"] != 'NaN') & (movie_df_frame["Release Date"] != 'NaN')]
clean_movie_df.to_csv("santhosh clean.csv", index = False)
clean_movie_df.head()

Unnamed: 0,Title,Movie ID,Genre,Production Company,Budget,Revenue,Overview,Release Date
0,Pirates of the Caribbean: At World's End,285,Adventure,Jerry Bruckheimer Films,300000000,961000000,"Captain Barbossa, long believed to be dead, ha...",2007-05-19
1,Harry Potter and the Order of the Phoenix,675,Adventure,Warner Bros. Pictures,150000000,938212738,Returning for his fifth year of study at Hogwa...,2007-06-28
2,Russian Lolita,329103,Romance,Eros Movie,0,0,"The action of a controversial novel ""Lolita"", ...",2007-01-03
3,Spider-Man 3,559,Fantasy,Marvel Enterprises,258000000,890871626,The seemingly invincible Spider-Man goes up ag...,2007-05-01
4,Live Free or Die Hard,1571,Action,Dune Entertainment,110000000,383531464,"John McClane is back and badder than ever, and...",2007-06-20


In [10]:
clean_movie_df = clean_movie_df[clean_movie_df.Budget!=0]
clean_movie_df = clean_movie_df[clean_movie_df.Revenue!=0]
clean_movie_df.to_csv("clean movie data.csv", index = False)
clean_movie_df.head()

Unnamed: 0,Title,Movie ID,Genre,Production Company,Budget,Revenue,Overview,Release Date
0,Pirates of the Caribbean: At World's End,285,Adventure,Jerry Bruckheimer Films,300000000,961000000,"Captain Barbossa, long believed to be dead, ha...",2007-05-19
1,Harry Potter and the Order of the Phoenix,675,Adventure,Warner Bros. Pictures,150000000,938212738,Returning for his fifth year of study at Hogwa...,2007-06-28
3,Spider-Man 3,559,Fantasy,Marvel Enterprises,258000000,890871626,The seemingly invincible Spider-Man goes up ag...,2007-05-01
4,Live Free or Die Hard,1571,Action,Dune Entertainment,110000000,383531464,"John McClane is back and badder than ever, and...",2007-06-20
5,28 Weeks Later,1562,Horror,DNA Films,15000000,64238440,"In this chilling sequel to 28 Days Later, the ...",2007-04-26
