In [7]:
import json
import os
import time

import requests

# Your NYT API key, read from the NYT_API_KEY environment variable so the
# secret is never stored in (or shared with) the notebook itself.
# NOTE(review): a key was previously hard-coded here; treat it as exposed and rotate it.
API_KEY = os.environ.get('NYT_API_KEY', '')
if not API_KEY:
    # Warn instead of raising so the rest of the cell can still be defined and inspected.
    print("WARNING: NYT_API_KEY is not set; NYT API requests will not be authorized.")

# Base URL for the NYT Article Search API
BASE_URL = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'

# Search parameters (dates use the YYYYMMDD format)
query = 'Taylor Swift'
begin_date = '20210101'  # Start date: January 1, 2021
end_date = '20250425'    # End date: April 25, 2025

def _article_from_doc(doc):
    """Flatten one Article Search document into the fields this notebook keeps."""
    # `or {}` / `or []` also cover fields that are present but null, so a single
    # sparse document cannot abort a run that has already spent minutes waiting.
    headline = doc.get('headline') or {}
    return {
        'headline': headline.get('main', ''),
        'snippet': doc.get('snippet', ''),
        'abstract': doc.get('abstract', ''),
        'web_url': doc.get('web_url', ''),
        'pub_date': doc.get('pub_date', ''),
        'section_name': doc.get('section_name', ''),
        'keywords': [kw['value'] for kw in doc.get('keywords') or []]
    }


# Function to fetch articles
def fetch_articles(query, begin_date, end_date, pages=10, delay=12, timeout=30):
    """Fetch NYT Article Search results for ``query`` within a date range.

    Result pages ``0 .. pages-1`` are requested one at a time. Fetching stops
    early as soon as a page comes back without documents. HTTP errors and
    network failures are reported with ``print`` and that page is skipped.

    Args:
        query: Search text sent as the ``q`` parameter.
        begin_date: Start date as a ``YYYYMMDD`` string.
        end_date: End date as a ``YYYYMMDD`` string.
        pages: Maximum number of result pages to request.
        delay: Seconds to pause between two consecutive requests.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``headline``, ``snippet``, ``abstract``,
        ``web_url``, ``pub_date``, ``section_name`` and ``keywords``.
    """
    all_articles = []

    for page in range(pages):
        if page > 0:
            # Be nice to NYT servers — pause between requests, but not after the last one.
            # NOTE(review): NYT's developer FAQ is understood to allow 5 requests/minute,
            # i.e. one every 12 s — confirm the current limit before lowering `delay`.
            time.sleep(delay)

        params = {
            'q': query,
            'begin_date': begin_date,
            'end_date': end_date,
            'api-key': API_KEY,
            'page': page  # NYT API paginates results, 10 per page
        }

        try:
            # Without a timeout a stalled connection would block the cell forever.
            response = requests.get(BASE_URL, params=params, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Error: {exc}")
            continue

        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            continue

        data = response.json()
        docs = (data.get('response') or {}).get('docs') or []
        if not docs:
            # An empty page means the results are exhausted; further pages
            # would only cost requests and waiting time.
            break

        all_articles.extend(_article_from_doc(doc) for doc in docs)

    return all_articles

# Run the "Taylor Swift" search over the configured date range
articles = fetch_articles(query, begin_date, end_date, pages=10)  # raise `pages` for more results

# Persist the collected articles as pretty-printed JSON
with open('taylor_swift_articles.json', 'w', encoding='utf-8') as f:
    serialized = json.dumps(articles, indent=4)
    f.write(serialized)

print(f"Saved {len(articles)} articles mentioning Taylor Swift.")


Saved 100 articles mentioning Taylor Swift.


In [14]:
import requests
import json
import time
import os

# Your NYT API key, read from the NYT_API_KEY environment variable so the
# secret is never stored in (or shared with) the notebook itself.
# NOTE(review): a key was previously hard-coded here; treat it as exposed and rotate it.
API_KEY = os.environ.get('NYT_API_KEY', '')
if not API_KEY:
    # Warn instead of raising so the rest of the cell can still be defined and inspected.
    print("WARNING: NYT_API_KEY is not set; NYT API requests will not be authorized.")

# Base URL for the NYT Article Search API
BASE_URL = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'

# Search parameters (last 6 months, dates use the YYYYMMDD format)
begin_date = '20241025'  # October 25, 2024
end_date = '20250425'    # April 25, 2025

# Artist list: each entry is used both as the search query and as the
# name that the fetched articles' keywords are filtered on.
artists = [
    "Kanye West", "Sabrina Carpenter", "Kendrick Lamar", "SZA",
    "Drake", "Chappell Roan", "Post Malone", "Lady Gaga",
    "Benson Boone", "Billie Eilish", "The Weeknd", "Gracie Abrams",
    "Teddy Swims", "Taylor Swift", "Bad Bunny", "Ariana Grande",
    "Travis Scott", "Rihanna", "Ed Sheeran", "Beyoncé",
    "Bruno Mars", "Charli XCX"
]

def _artist_keyword_forms(artist):
    """Return the case-folded keyword spellings accepted as a match for ``artist``."""
    parts = artist.split()
    forms = set()
    if parts:
        # The name as given covers one-word names ("Drake", "SZA") and stage
        # names that are not inverted.
        forms.add(' '.join(parts).casefold())
    if len(parts) >= 2:
        # "Last, First [Middle ...]" form, e.g. "Abrams, Gracie". All leading
        # words are kept so names with three or more words are not truncated.
        forms.add(f"{parts[-1]}, {' '.join(parts[:-1])}".casefold())
    return forms


def _keyword_matches(keyword, forms):
    """Tell whether ``keyword`` names the artist described by ``forms``.

    The comparison ignores case and a trailing parenthetical qualifier.
    """
    # NOTE(review): assumes NYT may append qualifiers such as "(Rapper)" to
    # person keywords — confirm against real API responses.
    whole = keyword.strip().casefold()
    base = keyword.split('(', 1)[0].strip().casefold()
    return whole in forms or base in forms


def _doc_to_article(doc, keywords):
    """Flatten one Article Search document into the fields this notebook keeps."""
    headline = doc.get('headline') or {}
    return {
        'headline': headline.get('main', ''),
        'snippet': doc.get('snippet', ''),
        'abstract': doc.get('abstract', ''),
        'web_url': doc.get('web_url', ''),
        'pub_date': doc.get('pub_date', ''),
        'section_name': doc.get('section_name', ''),
        'keywords': keywords
    }


# Function to fetch articles
def fetch_articles(query, artist, begin_date, end_date, pages=10,
                   delay=12, timeout=30, max_retries=5, retry_wait=60):
    """Fetch NYT articles for ``query`` and keep those tagged with ``artist``.

    Result pages ``0 .. pages-1`` are requested one at a time. An article is
    kept only when one of its keyword values matches the artist name, either
    as given or as "Last, First". Fetching stops at the first empty page, on
    any HTTP or network error, or when a page is still rate limited after
    ``max_retries`` retries; whatever was collected so far is returned.

    Args:
        query: Search text sent as the ``q`` parameter.
        artist: Artist name used to filter the articles' keywords.
        begin_date: Start date as a ``YYYYMMDD`` string.
        end_date: End date as a ``YYYYMMDD`` string.
        pages: Maximum number of result pages to request.
        delay: Seconds to pause after every answered request.
        timeout: Seconds to wait for each HTTP response before giving up.
        max_retries: How many times a rate-limited (HTTP 429) page is retried.
        retry_wait: Seconds to wait before retrying a rate-limited page.

    Returns:
        A list of dicts with the keys ``headline``, ``snippet``, ``abstract``,
        ``web_url``, ``pub_date``, ``section_name`` and ``keywords``.
    """
    all_articles = []
    wanted_forms = _artist_keyword_forms(artist)
    attempts = max(0, max_retries) + 1

    for page in range(pages):
        params = {
            'q': query,
            'begin_date': begin_date,
            'end_date': end_date,
            'api-key': API_KEY,
            'page': page
        }

        # Bounded retry: an unbounded loop never ends when the rate limit
        # does not clear (e.g. an exhausted quota).
        for attempt in range(attempts):
            try:
                # Without a timeout a stalled connection would block the cell forever.
                response = requests.get(BASE_URL, params=params, timeout=timeout)
            except requests.RequestException as exc:
                print(f"Error: {exc}")
                return all_articles
            if response.status_code != 429:
                break
            if attempt < attempts - 1:
                print(f"Rate limited. Sleeping for {retry_wait} seconds...")
                time.sleep(retry_wait)  # wait before retry

        if response.status_code == 429:
            print(f"Still rate limited after {attempts - 1} retries for query '{query}'. Giving up.")
            return all_articles

        # Pause after every answered request, including the last one for this
        # query, so the next request (possibly for another artist) is not sent
        # back-to-back.
        # NOTE(review): NYT's developer FAQ is understood to allow 5 requests/minute,
        # i.e. one every 12 s — confirm the current limit before lowering `delay`.
        time.sleep(delay)

        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            return all_articles

        data = response.json()
        docs = (data.get('response') or {}).get('docs') or []
        if not docs:
            print(f"No articles found on page {page} for query '{query}'. Stopping further requests.")
            return all_articles

        for doc in docs:
            keywords = [kw['value'] for kw in doc.get('keywords') or []]

            # Filter: keep only articles tagged with the artist; skip unrelated ones
            if not any(_keyword_matches(kw, wanted_forms) for kw in keywords):
                continue

            all_articles.append(_doc_to_article(doc, keywords))

    return all_articles

# Make sure the output folder for the per-artist JSON files exists
os.makedirs('artist_articles', exist_ok=True)

# Fetch and save the articles of every artist in turn
for artist in artists:
    print(f"Fetching articles for {artist}...")
    articles = fetch_articles(artist, artist, begin_date, end_date, pages=10)

    # Build a file-friendly name, e.g. "Taylor Swift" -> "taylor_swift"
    safe_artist_name = "_".join(artist.lower().split(" "))
    filename = f'artist_articles/{safe_artist_name}.json'

    # The file is written even when nothing was found (it then holds an empty list)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(json.dumps(articles, indent=4))

    if articles:
        print(f"Saved {len(articles)} articles for {artist}.\n")
    else:
        print(f"WARNING: No articles found for {artist}!\n")

print("All artist articles collected successfully.")


Fetching articles for Morgan Wallen...
No articles found on page 2 for query 'Morgan Wallen'. Stopping further requests.
Saved 8 articles for Morgan Wallen.

Fetching articles for Sabrina Carpenter...
Rate limited. Sleeping for 60 seconds...


KeyboardInterrupt: 