### Import Required Libraries and Set Up Environment Variables

In [89]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json

In [90]:
# Set environment variables from the .env in the local environment
load_dotenv()

nyt_api_key = os.getenv("NYT_API_KEY")
tmdb_api_key = os.getenv("TMDB_API_KEY")
nyt_api_key


'tWshVCIQ1FRLQJ3sWWg1COM9yFl9btEm'

### Access the New York Times API

In [91]:
# Set the base URL
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'

# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"

# Build URL
query_url = (
    f"{url}api-key={nyt_api_key}&begin_date={begin_date}&end_date={end_date}"
    + f'&fq={filter_query}&sort={sort}&fl={field_list}'
)
query_url
#https://api.nytimes.com/svc/movies/v2/reviews/search.json?query=godfather&api-key=yourkey

'https://api.nytimes.com/svc/search/v2/articlesearch.json?api-key=tWshVCIQ1FRLQJ3sWWg1COM9yFl9btEm&begin_date=20130101&end_date=20230531&fq=section_name:"Movies" AND type_of_material:"Review" AND headline:"love"&sort=newest&fl=headline,web_url,snippet,source,keywords,pub_date,byline,word_count'

In [92]:
# Create an empty list to store the reviews
movie_reviews = []



# loop through pages 0-19
for page in range(0, 20): 
    # create query with a page number
    # API results show 10 articles at a time
    page_query_url = f"{query_url}&page={page}"    

    
    # Make a "GET" request and retrieve the JSON
    user_reviews = requests.get(page_query_url).json()
    
    # Add a twelve second interval between queries to stay within API query limits
    time.sleep(12)
    
        # Try and save the reviews to the reviews_list
    try:
        # loop through the reviews["response"]["docs"] and append each review to the list
        for review in user_reviews["response"]["docs"]:
            movie_reviews.append(review)
        # Print the page that was just retrieved
        print(f"Checked page {page}")
    except:
        # Print the page number that had no results then break from the loop
        print(f"No results. Ended at page {page}.")
        break

Checked page 0


KeyboardInterrupt: 

In [None]:
# Preview the first 5 results in JSON format
print(json.dumps(movie_reviews[:5], indent=4))

# Use json.dumps with argument indent=4 to format data


In [None]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()
movie_df = pd.json_normalize(movie_reviews)
movie_df

In [None]:
# Extract the title from the "headline.main" column and
# save it to a new column "title"
# Assuming you have a DataFrame named news_articles_df with a column 'headline.main'
movie_df['title'] = movie_df['headline.main'].apply(lambda x: x.split(" - ")[0])
movie_df


# Title is between unicode characters \u2018 and \u2019. 
# End string should include " Review" to avoid cutting title early


In [None]:
# Extract 'name' and 'value' from items in "keywords" column
def extract_keywords(keyword_list):
    extracted_keywords = ""
    for item in keyword_list:
        # Extract 'name' and 'value'
        keyword = f"{item['name']}: {item['value']};" 
        # Append the keyword item to the extracted_keywords list
        extracted_keywords += keyword
    return extracted_keywords

# Fix the "keywords" column by converting cells from a list to a string


In [None]:
# Create a list from the "title" column using to_list()
title_list = movie_df['title'].to_list()
title_list

# These titles will be used in the query for The Movie Database


### Access The Movie Database API

In [None]:
# Prepare The Movie Database query
url = "https://api.themoviedb.org/3/search/movie?query="
tmdb_key_string = "&api_key=" + tmdb_api_key

In [94]:
# Create an empty list to store the results
titles = []

# Create a request counter to sleep the requests after a multiple

import time

class RequestCounter:
    def __init__(self, max_requests, sleep_time):
        self.max_requests = max_requests
        self.sleep_time = sleep_time
        self.requests_made = 0

    def make_request(self):
        if self.requests_made < self.max_requests:
            print(f"Making request number: {self.requests_made}")
            self.requests_made += 1
            time.sleep(self.sleep_time)
        else:
            print("Maximum number of requests reached.")

# Create a RequestCounter object with max_requests=5 and sleep_time=3
request_counter = RequestCounter(max_requests=5, sleep_time=3)

# Make requests using the RequestCounter object
for _ in range(50):
    request_counter.make_request()
    
# of 50 requests


# Loop through the titles

    # Check if we need to sleep before making a request


    # Add 1 to the request counter

import time

titles = ["Title1", "Title2", "Title3", ...]  # Your list of movie titles

request_counter = 0

# Start a loop to iterate through each title in the titles list
for title in titles:
    # Check if the request counter is a multiple of 50
    if request_counter % 50 == 0:
        time.sleep(1)  # Introduce a 1-second delay
        print("Sleeping for 1 second...")
#Make the request to The Movie Database
    #response = requests.get(query_url)
    response_data = requests.get(query_url)

    response_data = response_data.json()
    response_data
    # Print the JSON results
    print(json.dumps(response_data, indent=4))

    # Make your API request here using the current title
    query_url
    # After making the request, increment the request counter by 1
    request_counter += 1

    # Add your code here to handle the API request for the current title




    # Perform a "GET" request for The Movie Database
    # Send a GET request to the API endpoint
    response_data = requests.get(query_url)

    # Check the status code of the response
    if response_data.status_code == 200:
        # Request was successful
        print('GET request successful')
    # Process the response data as needed
        data = response_data.json()
    else:
    # Request was not successful
        print('GET request failed with status code:', response_data.status_code)
    
#response_data  = requests.get(query_url)

    response_data = response_data.json()

    # Print the JSON results
    print(json.dumps(response_data, indent=4))
    response_data
    
    # Include a try clause to search for the full movie details.
try:
    response_data = requests.get(query_url)
    
    # Check the status code of the response
    if response_data.status_code == 200:
        # Request was successful
        print('GET request successful')
    # Process the response data as needed
        data = response_data.json()
    else:
    # Request was not successful
        print('GET request failed with status code:', response_data.status_code) 
except:
    print('Movie details found for ', {response_data})
    
    
    
    # Use the except clause to print out a statement if a movie
    # is not found.

        # Get movie id
    movie_id = data["results"][0]["id"]
    movie_id

        # Make a request for a the full movie details
    query_url_1 = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb_key_string}"


        # Execute "GET" request with url
    response_data = requests.get(query_url_1).json()

        
        # Extract the genre names into a list
    #genre_names = []
    #for x in genre:  
    genre_names = [genre['name'] for genre in response_data]['genres']


    #genre_names

        # Extract the spoken_languages' English name into a list
    spoken_languages = [language["english_name"] for language in response_data['spoken_languages']]

        # Extract the production_countries' name into a list

    production_countries = [country['name'] for country in response_data['production_countries']]
        # Add the relevant data to a dictionary and
        # append it to the tmdb_movies_list list

        
        # Print out the title that was found



Making request number: 0
Making request number: 1
Making request number: 2
Making request number: 3
Making request number: 4
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum number of requests reached.
Maximum num

In [100]:
response_data.json()

{'fault': {'faultstring': 'Rate limit quota violation. Quota limit  exceeded. Identifier : 44b1f848-6949-4b0c-a751-88e0bbf4737c',
  'detail': {'errorcode': 'policies.ratelimit.QuotaViolation'}}}

In [None]:
response_data = requests.get(query_url).json()
response_data 

In [None]:
# Preview the first 5 results in JSON format

# Use json.dumps with argument indent=4 to format data

print(json.dumps(movie_reviews[:5], indent=4))


In [None]:
# Convert the results to a DataFrame


### Merge and Clean the Data for Export

In [None]:
# Merge the New York Times reviews and TMDB DataFrames on title
merged_df = pd.merge(tmdb_df, reviews_df, on="title")
merged_df

In [None]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing
columns_to_fix = ["genre", "spoken_languages", "production_countries"]

# Create a list of characters to remove
remove_charact = ["[","]", "'"]

# Loop through the list of columns to fix

for x in columns_to_fix:
    

    # Convert the column to type 'str'
    merged_df[x] = merged_df[x].astype('str')

    # Loop through characters to remove
    for r in remove_charact:
        merged_df[r] = merged_df[r].str.replace(r, "", regex = False)
        

# Display the fixed DataFrame

merged_df.head()

In [None]:
# Drop "byline.person" column
merged_df = merged_df.drop(columns = 'byline.person')

In [None]:
# Delete duplicate rows and reset index
merged_df_new = merged_df.drop_duplicates().reset_index(drop = True)
merged_df_new.head()

In [None]:
# Export data to CSV without the index
merged_df_new.to_csv("output/collected_data.csv", index=False)