### Import Required Libraries and Set Up Environment Variables

In [19]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json

In [20]:
# Set environment variables from the .env in the local environment
load_dotenv('api.env')

nyt_api_key = os.getenv("NYT_API_KEY")
tmdb_api_key = os.getenv("TMDB_API_KEY")

### Access the New York Times API

In [22]:
# Set the base URL
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"
# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'
# Use a sort filter, sort by newest
sort = "newest"
# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"
# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"
# Build URL
query_url = (
    f"{url}api-key={nyt_api_key}&begin_date={begin_date}&end_date={end_date}&fq={filter_query}&sort={sort}&fl={field_list}")

In [23]:
# Make a "GET" request and retrieve the JSON
requests.get(url)
# response = requests.get(query_url)
# reviews = response.json()
# print(json.dumps(reviews, indent=4))

<Response [401]>

In [None]:
# # Base URL for the New York Times Article Search API
# url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# # Construct the filter query
# filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'
# sort = "newest"
# field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"
# begin_date = "20130101"
# end_date = "20230531"





In [None]:
# create a list to store the reviews
reviews_list = ([])  
# Attempt to retrieve 200 reviews by iterating through 20 pages starting at 0 (10 results per page)
for page in range(20):  # NYT API limits results to 10 per page, 20 pages = 200 results
    offset = page * 10
    full_url = f"{url}api-key={nyt_api_key}&q=love&fq={filter_query}&begin_date={begin_date}&end_date={end_date}&sort={sort}&fl={field_list}&page={page}"
    response = requests.get(full_url)
    time.sleep(12)  #  rate limits per documentation

    try:
        reviews = response.json()
        if reviews["response"]["docs"]:
            reviews_list.extend(reviews["response"]["docs"])
            print(f"Page {page + 1} processed, offset {offset}")
        else:
            print(f"No more results found at page {page + 1}, stopping.")
            break
    except Exception as e:
        print(f"An error occurred on page {page + 1} with offset {offset}: {e}")
        break

In [None]:
# # Create an empty list to store the reviews
# reviews_list = []
# # loop through pages 0-19
# total_pages = 5 
# for page_number in range(0, total_pages, 5):
#     offset_value = page_number * 5
#     # query_url = f"{url}api-key={nyt_api_key}&begin_date={begin_date}&end_date={end_date}&fq={filter_query}&sort={sort}&offset={offset_value}"
#     query_url = f"{url}api-key={nyt_api_key}&begin_date={begin_date}&end_date={end_date}&fq={filter_query}&sort={sort}&fl={field_list}"

#     # query_url = (
#     #     f"{url}api-key={nyt_api_key}&byline:{reviews}&sort{sort}&fl={field_list}&page={page_number}"
#     # )
#     # Make a "GET" request and retrieve the JSON
#     response = requests.get(query_url)
#     # Add a twelve second interval between queries to stay within API query limits
#     time.sleep(12)
#     # Try and save the reviews to the reviews_list
#     # loop through the reviews["response"]["docs"] and append each review to the list
#     try:
#         # Loop through the "docs"
#         for doc in reviews["response"]["docs"]:
#             # Save byline.original, headline.main, snippet,
#             # and web_url
#             reviews_list.append(
#             {
#                 "byline": doc["byline"]["original"],
#                 "headline": doc["headline"]["main"],
#                 "snippet": doc["snippet"],
#             }
#         )
#         if response.status_code == 200:  # Check for successful response
#             data = response.json()
#         reviews = data["response"]["docs"]

#         if reviews:  # Check if we have any reviews on this page
#             print(
#                 f"\nReviews found on Page {page_number + 1}:"
#             )  # Page numbers start from 0
#             for review in reviews:
#                 print(f"Headline: {review['headline']}")
#                 print(f"Web URL: {review['web_url']}")
#         else:
#             print(f"No reviews found on Page {page_number + 1}")
# else:
#         print(f"Error fetching page {page_number + 1}: {response.status_code}")
#     #         # Print the page that was just retrieved.
#     #     print(f"Found reviews on the following pages:{page_number}")
#     # # Print the page number that had no results then break from the loop
#     # except:
#     #     print(
#     #     f"No reviews found for the following pages: {page_number}"
#     # )  
        
#     break

    

In [None]:
# Preview the first 5 results in JSON format
first_five = reviews_list[:5]
# Use json.dumps with argument indent=4 to format data
json_data = json.dumps(first_five, indent=4)
print(json_data)

In [None]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()
reviews_df = pd.json_normalize(reviews_list)

# Optionally, preview the first few rows of the DataFrame
#print(reviews_df.head(3))
reviews_df 

In [None]:
# Extract the title from the "headline.main" column and
# save it to a new column "title"
# Title is between unicode characters \u2018 and \u2019.
reviews_df["title"] = reviews_df["headline.main"].apply(
    lambda st: st[st.find("\u2018") + 1 : st.find("\u2019 Review")]
)
def extract_keywords(keywords_list):
    return ", ".join([keyword["value"] for keyword in keywords_list])
reviews_df["keywords"] = reviews_df["keywords"].apply(extract_keywords)

titles = reviews_df["title"].tolist()
# Title is between unicode characters \u2018 and \u2019.
# End string should include " Review" to avoid cutting title early
reviews_df["title"] = reviews_df["headline.main"].apply(
    lambda st: st[st.find("‘") + 1 : st.find("’ Review")]
)

# End string should include " Review" to avoid cutting title early
# lambda st: st[st.find("\u2018") + 1 : st.find("\u2019 Review")]
# Extract the title from the "headline.main" column and save it to a new column "title"


# Define the extract_keywords function
def extract_keywords(keywords_list):
    return ", ".join([keyword["value"] for keyword in keywords_list])


# Apply the extract_keywords function to the "keywords" column
reviews_df["keywords"] = reviews_df["keywords"].apply(extract_keywords)

In [None]:
# Extract 'name' and 'value' from items in "keywords" column
def extract_keywords(keyword_list):
    extracted_keywords = ""
    for item in keyword_list:
        # Extract 'name' and 'value'
        keyword = f"{item['name']}: {item['value']};" 
        # Append the keyword item to the extracted_keywords list
        extracted_keywords += keyword
    return extracted_keywords

# Fix the "keywords" column by converting cells from a list to a string


In [None]:
# Create a list from the "title" column using to_list()
# These titles will be used in the query for The Movie Database


### Access The Movie Database API

In [None]:
# Prepare The Movie Database query
url = "https://api.themoviedb.org/3/search/movie?query="
tmdb_key_string = "&api_key=" + tmdb_api_key


<Response [401]>

In [None]:
# Create an empty list to store the results


# Create a request counter to sleep the requests after a multiple
# of 50 requests


# Loop through the titles

    # Check if we need to sleep before making a request


    # Add 1 to the request counter

    
    # Perform a "GET" request for The Movie Database


    # Include a try clause to search for the full movie details.
    # Use the except clause to print out a statement if a movie
    # is not found.

        # Get movie id


        # Make a request for a the full movie details


        # Execute "GET" request with url

        
        # Extract the genre names into a list


        # Extract the spoken_languages' English name into a list


        # Extract the production_countries' name into a list


        # Add the relevant data to a dictionary and
        # append it to the tmdb_movies_list list

        
        # Print out the title that was found



In [None]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data


In [None]:
# Convert the results to a DataFrame


### Merge and Clean the Data for Export

In [None]:
# Merge the New York Times reviews and TMDB DataFrames on title


In [None]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing


# Create a list of characters to remove


# Loop through the list of columns to fix

    # Convert the column to type 'str'


    # Loop through characters to remove


# Display the fixed DataFrame


In [None]:
# Drop "byline.person" column


In [None]:
# Delete duplicate rows and reset index


In [None]:
# Export data to CSV without the index
