### Import Required Libraries and Set Up Environment Variables

In [2]:
# Import Dependencies for the script
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json

In [3]:
# Set environment variables from the .env in the local environment
load_dotenv()
nyt_api_key = os.getenv("NYT_API_KEY")
tmdb_api_key = os.getenv("TMDB_API_KEY")

### Access the New York Times API
#### Build url and Filter


In [6]:
# # Set the base URL
# url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# # Filter for movie reviews with "love" in the headline
# # section_name should be "Movies"
# # type_of_material should be "Review"
# filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'

# # Use a sort filter, sort by newest
# sort = "newest"

# # Select the following fields to return:
# # headline, web_url, snippet, source, keywords, pub_date, byline, word_count
# field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# # Search for reviews published between a begin and end date
# begin_date = "20130101"
# end_date = "20230531"

# # Build URL
# build_url = f"{url}api-key={nyt_api_key}&q={filter_query}&sort={sort}&begin_date={begin_date}&end_date={end_date}&fl={field_list}"  
# Set the base URL
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'

# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"

# Build URL
query_url = (
    f"{url}api-key={nyt_api_key}&begin_date={begin_date}&end_date={end_date}"
    + f'&fq={filter_query}&sort={sort}&fl={field_list}'
)

In [7]:
response = requests.get(query_url)
response

<Response [200]>

In [8]:
field_list = response.json()
field_list

{'status': 'OK',
 'copyright': 'Copyright (c) 2024 The New York Times Company. All Rights Reserved.',
 'response': {'docs': [{'web_url': 'https://www.nytimes.com/2023/05/25/movies/the-attachment-diaries-review.html',
    'snippet': 'A gynecologist and her patient form a horrifyingly twisted connection in this batty, bloody Argentine melodrama.',
    'source': 'The New York Times',
    'headline': {'main': '‘The Attachment Diaries’ Review: Love, Sick',
     'kicker': None,
     'content_kicker': None,
     'print_headline': 'The Attachment Diaries',
     'name': None,
     'seo': None,
     'sub': None},
    'keywords': [{'name': 'subject',
      'value': 'Movies',
      'rank': 1,
      'major': 'N'},
     {'name': 'creative_works',
      'value': 'The Attachment Diaries (Movie)',
      'rank': 2,
      'major': 'N'},
     {'name': 'persons',
      'value': 'Diment, Valentin Javier',
      'rank': 3,
      'major': 'N'}],
    'pub_date': '2023-05-25T11:00:03+0000',
    'byline': {'orig

In [9]:
reviews = response.json()
reviews

{'status': 'OK',
 'copyright': 'Copyright (c) 2024 The New York Times Company. All Rights Reserved.',
 'response': {'docs': [{'web_url': 'https://www.nytimes.com/2023/05/25/movies/the-attachment-diaries-review.html',
    'snippet': 'A gynecologist and her patient form a horrifyingly twisted connection in this batty, bloody Argentine melodrama.',
    'source': 'The New York Times',
    'headline': {'main': '‘The Attachment Diaries’ Review: Love, Sick',
     'kicker': None,
     'content_kicker': None,
     'print_headline': 'The Attachment Diaries',
     'name': None,
     'seo': None,
     'sub': None},
    'keywords': [{'name': 'subject',
      'value': 'Movies',
      'rank': 1,
      'major': 'N'},
     {'name': 'creative_works',
      'value': 'The Attachment Diaries (Movie)',
      'rank': 2,
      'major': 'N'},
     {'name': 'persons',
      'value': 'Diment, Valentin Javier',
      'rank': 3,
      'major': 'N'}],
    'pub_date': '2023-05-25T11:00:03+0000',
    'byline': {'orig

In [10]:
# Create an empty list to store the reviews
reviews_list = []
# loop through pages 0-19
for page_number in range(2):
    # Show the page_number when the page is queried 
    print("Processing page", page_number)
    # create query with a page number
    # API results show 10 articles at a time
    articles_per_page = 10  # Number of articles per page
    api_key = nyt_api_key # Your API key from the NY Times API
# Construct the query URL with placeholders for page number and limit
    #query = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?api-key={api_key}&q=love&fq=section_name&page={page_number}&limit={articles_per_page}"
    query_page = f"{query_url}&page={page_number}"
    # Make a "GET" request and retrieve the JSON
    # url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"
    # Make a "GET" request and retrieve the JSON response
    response = requests.get(query_page)
    response_json = response.json()
    # Check if the request was successful (status code 200)
    try:  
    # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Parse the JSON response
            json_data = response.json()
            # Now you can work with the JSON data
            # Try and save the reviews to the reviews_list
            for review in json_data["response"]["docs"]:
                reviews_list.append(review)
            # Print the page that was just retrieved
            print("Retrieved page:", page_number)
            # Print the page number that had no results then break from the loop
            # if not json_data["response"]["docs"]:
            #     print("No results for page number:", page_number)
        else:
                # If the request was not successful, print an error message
            print("Error:", response.status_code)
        # Add a twelve-second interval between queries to stay within API query limits
        time.sleep(12)
    except:
        print("Error in retrieving page", page_number)
        

Processing page 0
Retrieved page: 0
Processing page 1
Retrieved page: 1


In [18]:
writers_v1 = reviews.df('blyline.original').drop.duplicates()
print(writers_v1)

AttributeError: 'dict' object has no attribute 'df'

In [None]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
print(json.dumps(reviews_list[:5], indent=4))


[
    {
        "web_url": "https://www.nytimes.com/2023/05/25/movies/the-attachment-diaries-review.html",
        "snippet": "A gynecologist and her patient form a horrifyingly twisted connection in this batty, bloody Argentine melodrama.",
        "source": "The New York Times",
        "headline": {
            "main": "\u2018The Attachment Diaries\u2019 Review: Love, Sick",
            "kicker": null,
            "content_kicker": null,
            "print_headline": "The Attachment Diaries",
            "name": null,
            "seo": null,
            "sub": null
        },
        "keywords": [
            {
                "name": "subject",
                "value": "Movies",
                "rank": 1,
                "major": "N"
            },
            {
                "name": "creative_works",
                "value": "The Attachment Diaries (Movie)",
                "rank": 2,
                "major": "N"
            },
            {
                "name": "persons",
 

In [None]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()
df_reviews_list= pd.json_normalize(reviews_list)
df_reviews_list.head()


Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization
0,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-25T11:00:03+0000,295,"‘The Attachment Diaries’ Review: Love, Sick",,,The Attachment Diaries,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",
1,https://www.nytimes.com/2023/05/04/movies/what...,Two childhood friends navigate cultural differ...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-04T17:16:45+0000,287,Review: ‘What’s Love Got to Do With It?’ Proba...,,,What’s Love Got to Do With It?,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",
2,https://www.nytimes.com/2023/05/04/movies/you-...,Religion comes between two girls falling in lo...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-04T11:00:08+0000,294,‘You Can Live Forever’ Review: Do You Love Me ...,,,You Can Live Forever,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",
3,https://www.nytimes.com/2023/04/21/movies/a-to...,Rachael Leigh Cook stars in this bland rom-com...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-04-21T07:03:25+0000,276,‘A Tourist’s Guide to Love’ Review: A Wearying...,,,A Tourist’s Guide to Love,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",
4,https://www.nytimes.com/2023/04/20/movies/othe...,A radiant Virginie Efira stars as a Parisian t...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-04-20T15:35:13+0000,801,‘Other People’s Children’ Review: True Romance,Critic’s pick,,Intoxicating Love With a Sobering Turn,,,,By Manohla Dargis,"[{'firstname': 'Manohla', 'middlename': None, ...",


In [None]:
#convert 

In [None]:
# Extract the title from the "headline.main" column and
# save it to a new column "title"
df_reviews_list["title"] = df_reviews_list["headline.main"].apply(lambda st: st[st.find("\u2018")+1:st.find("\u2019 Review")])
# Title is between unicode characters \u2018 and \u2019. 
# End string should include " Review" to avoid cutting title early
df_reviews_list.head()


Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization,title
0,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-25T11:00:03+0000,295,"‘The Attachment Diaries’ Review: Love, Sick",,,The Attachment Diaries,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,The Attachment Diaries
1,https://www.nytimes.com/2023/05/04/movies/what...,Two childhood friends navigate cultural differ...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-04T17:16:45+0000,287,Review: ‘What’s Love Got to Do With It?’ Proba...,,,What’s Love Got to Do With It?,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,What’s Love Got to Do With It?’ Probably a Lo
2,https://www.nytimes.com/2023/05/04/movies/you-...,Religion comes between two girls falling in lo...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-04T11:00:08+0000,294,‘You Can Live Forever’ Review: Do You Love Me ...,,,You Can Live Forever,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",,You Can Live Forever
3,https://www.nytimes.com/2023/04/21/movies/a-to...,Rachael Leigh Cook stars in this bland rom-com...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-04-21T07:03:25+0000,276,‘A Tourist’s Guide to Love’ Review: A Wearying...,,,A Tourist’s Guide to Love,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",,A Tourist’s Guide to Love
4,https://www.nytimes.com/2023/04/20/movies/othe...,A radiant Virginie Efira stars as a Parisian t...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-04-20T15:35:13+0000,801,‘Other People’s Children’ Review: True Romance,Critic’s pick,,Intoxicating Love With a Sobering Turn,,,,By Manohla Dargis,"[{'firstname': 'Manohla', 'middlename': None, ...",,Other People’s Children


In [None]:
# Extract 'name' and 'value' from items in "keywords" column
def extract_keywords(keyword_list):
    extracted_keywords = ""
    for item in keyword_list:
        # Extract 'name' and 'value'
        keyword = f"{item['name']}: {item['value']};" 
        # Append the keyword item to the extracted_keywords list
        extracted_keywords += keyword
    return extracted_keywords

# Fix the "keywords" column by converting cells from a list to a string
    df["keywords"] = df["keywords"].apply(extract_keywords)
    extract_keywords

In [None]:
# Create a list from the "title" column using to_list()
title_list = df_reviews_list["title"].to_list()
# These titles will be used in the query for The Movie Database


### Access The Movie Database API

In [20]:
# Prepare The Movie Database query
url = "https://api.themoviedb.org/3/search/movie?query="
tmdb_key_string = "&api_key=" + tmdb_api_key

In [21]:
# Create an empty list to store the results
results = []
# Create a request counter to sleep the requests after a multiple
# of 50 requests
# Initialize a request counter
request_counter = 0
# Loop through the pages
for page_number in range(20):
    # Construct the API request URL for the current page
    api_url = f"{url}page={page_number + 1}{tmdb_key_string}"
    # Make the request to the Movie Database API
    response = requests.get(api_url)
    # Increment the request counter
    request_counter += 1
    # Check if the request counter is a multiple of 50
    if request_counter % 50 == 0:
        # If it's a multiple of 50, sleep for a specified interval
        time.sleep(10)  # Sleep for 10 seconds, for example
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the JSON response
        response_json = response.json()
        # Process the response and extract relevant data
        # For example, you might extract movie titles, release dates, etc.
        # Append the extracted data to the results list
        results.extend(response_json['results'])
    else:
        # If the request was not successful, print an error message
        print("Error:", response.status_code)
    # Make the API request here
    # Include a try clause to search for the full movie details.
    # Your code for making API requests and processing responses goes here
    # Build URL
    query_url = (
    f"{url}api-key={tmdb_api_key}&begin_date={begin_date}&end_date={end_date}"
    + f'&fq={filter_query}&sort={sort}&fl={field_list}')
    # Example: Print the current page number
    print("Processing page:", page_number)
# Make the request and process the response
# Define the query parameter for the movie search
# Construct the full URL for the search request
full_url = f"{url}api_key={tmdb_api_key}&query={query}"
try:
    # Make the API request to search for movies
    response = requests.get(full_url)
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the JSON response
        search_results = response.json()
        # Process the search results and extract relevant information
        for movie in search_results["results"]:
            # For each movie in the search results, you might want to fetch additional details
            # Make additional requests to fetch full movie details, cast, etc.
            movie_id = movie["id"]
            # Example: Make a request to fetch full movie details
            movie_details_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb_api_key}"
            movie_details_response = requests.get(movie_details_url)
            if movie_details_response.status_code == 200:
                movie_details = movie_details_response.json()
                # Process and extract relevant information from movie_details
        # Your code for processing search results and extracting relevant information goes here
    else:
        # If the request was not successful, print an error message
        print("Error:", response.status_code)
except Exception as e:
    # If an exception occurs during the request, print the error message
    print("Error:", e)
# Loop through the titles
for title in title_list:
    # Check if we need to sleep before making a request
    if request_counter > 0 and request_counter % 50 == 0:
        print("Pausing script to avoid exceeding rate limit...")
        time.sleep(10)  # Sleep for 10 seconds, for example
    # Add 1 to the request counter
    request_counter += 1 
    # Perform a "GET" request for The Movie Database
    api_url = f"https://api.themoviedb.org/3/search/movie?api_key={tmdb_api_key}&query={title}"
    response = requests.get(api_url)
    display(response.json())
    # Check if the request was successful (status code 200) and Process the response as needed
    # print(response.json())
    # Include a try clause to search for the full movie details and extract the relevant data
try:
    # Make the API request to search for movies
    # Use the except clause to print out a statement or error if a movie is not found.
    full_movie_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb_api_key}"
    full_movie_response = requests.get(full_movie_url)
 # Check if the request was successful
    if full_movie_response.status_code == 200:
        # Parse the JSON response
        full_movie_details = full_movie_response.json()
        # Process and extract relevant information from full movie details
        # Example: Print movie details
        print(full_movie_details)
    else:
        # If the request was not successful, print an error message
        print("Error:", full_movie_response.status_code)
except Exception as e:
    # If an exception occurs during the request, print the error message
    print("Error:", e)
    # Use the except clause to print out a statement if a movie is not found
    print("Movie not found.")
        # Get movie id
    movie_id = response.json()["data"]["movie_id"]
        # Make a request for a the full movie details
    full_movie_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb_api_key}"
        # Execute "GET" request with url
    full_movie_response = requests.get(full_movie_url)
        # Extract the genre names into a list
    genres = [genre["name"] for genre in full_movie_response.json()["genres"]]
        # Extract the spoken_languages' English name into a list
    spoken_languages = [language["english_name"] for language in full_movie_response.json()["spoken_languages"]]
        # Extract the production_countries' name into a list
    production_countries = [country["name"] for country in full_movie_response.json()["production_countries"]]
        # Add the relevant data to a dictionary and append it to the tmdb_movies_list list
        
        # Print out the title that was found
    print(f"Title found: {title}")



Processing page: 0
Processing page: 1
Processing page: 2
Processing page: 3
Processing page: 4
Processing page: 5
Processing page: 6
Processing page: 7
Processing page: 8
Processing page: 9
Processing page: 10
Processing page: 11
Processing page: 12
Processing page: 13
Processing page: 14
Processing page: 15
Processing page: 16
Processing page: 17
Processing page: 18
Processing page: 19


NameError: name 'query' is not defined

In [None]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data


In [None]:
# Convert the results to a DataFrame


### Merge and Clean the Data for Export

In [None]:
# Merge the New York Times reviews and TMDB DataFrames on title


In [None]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing


# Create a list of characters to remove


# Loop through the list of columns to fix

    # Convert the column to type 'str'


    # Loop through characters to remove


# Display the fixed DataFrame


In [None]:
# Drop "byline.person" column


In [None]:
# Delete duplicate rows and reset index


In [None]:
# Export data to CSV without the index
