In [1]:
import requests
import json
import time
import os

# NewsAPI.org API key, read from the environment so the secret never lives in
# the notebook. Set it before starting the kernel, e.g.
#   export NEWS_API_KEY=<your key>
# The key that used to be hardcoded here has been exposed and should be revoked.
NEWS_API_KEY = os.environ.get('NEWS_API_KEY', '')
if not NEWS_API_KEY:
    print("WARNING: NEWS_API_KEY is not set; NewsAPI requests will be rejected.")

# Base URL for NewsAPI
BASE_URL = 'https://newsapi.org/v2/everything'

# Search parameters (last 6 months)
# NOTE: when this cell was run, NewsAPI answered HTTP 426 for this range
# because the account's plan only allowed roughly one month of history.
# Narrow from_date or upgrade the plan before re-running.
from_date = '2024-10-25'  # October 25, 2024
to_date = '2025-04-25'    # April 25, 2025

# Artist list: one NewsAPI search is issued per name.
artists = [
    "Kanye West", "Sabrina Carpenter", "Kendrick Lamar", "SZA",
    "Drake", "Chappell Roan", "Post Malone", "Lady Gaga",
    "Benson Boone", "Billie Eilish", "The Weeknd", "Gracie Abrams",
    "Teddy Swims", "Taylor Swift", "Bad Bunny", "Ariana Grande",
    "Travis Scott", "Rihanna", "Ed Sheeran", "Beyoncé",
    "Bruno Mars", "Charli XCX"]

# Function to fetch articles
def fetch_newsapi_articles(artist, from_date, to_date, pages=5, max_retries=3, timeout=30):
    """Fetch English NewsAPI articles that match ``artist`` within a date range.

    Pages through the ``/v2/everything`` endpoint, 100 articles per page,
    until ``pages`` pages were read, the API reports no further results, or a
    request fails.

    Args:
        artist: Search string sent as the ``q`` parameter.
        from_date: Oldest publication date to include (``YYYY-MM-DD``).
        to_date: Newest publication date to include (``YYYY-MM-DD``).
        pages: Maximum number of result pages to request.
        max_retries: How many consecutive HTTP 429 responses to wait out
            (60 seconds each) before giving up on this artist.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A list of dicts with the keys ``source``, ``author``, ``title``,
        ``description``, ``url``, ``publishedAt`` and ``content``. Fields that
        are missing or null in the API response are stored as ``''``. On an
        error the articles collected so far are returned.
    """
    page_size = 100  # 100 is the max allowed per page
    all_articles = []
    page = 1
    rate_limit_hits = 0

    while page <= pages:
        params = {
            'q': artist,
            'from': from_date,
            'to': to_date,
            'language': 'en',
            'sortBy': 'relevancy',  # You can also try 'popularity' or 'publishedAt'
            'pageSize': page_size,
            'page': page,
            'apiKey': NEWS_API_KEY
        }

        try:
            # Without a timeout a stalled connection would block the cell forever.
            response = requests.get(BASE_URL, params=params, timeout=timeout)
        except requests.RequestException as exc:
            # One network failure should not abort the whole multi-artist run.
            print(f"Request failed for {artist}: {exc}")
            break

        if response.status_code == 429:
            # Bounded retry: an exhausted quota would otherwise loop forever.
            if rate_limit_hits >= max_retries:
                print(f"Still rate limited after {max_retries} retries. Giving up on {artist}.")
                break
            rate_limit_hits += 1
            print("Rate limited by NewsAPI. Sleeping for 60 seconds...")
            time.sleep(60)
            continue

        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            break

        rate_limit_hits = 0  # only consecutive 429s count toward the limit
        data = response.json()
        articles = data.get('articles') or []

        if not articles:
            print(f"No more articles found for {artist}. Stopping.")
            break

        for item in articles:
            # `or` fallbacks cover both absent keys and explicit JSON nulls.
            source = item.get('source') or {}
            all_articles.append({
                'source': source.get('name') or '',
                'author': item.get('author') or '',
                'title': item.get('title') or '',
                'description': item.get('description') or '',
                'url': item.get('url') or '',
                'publishedAt': item.get('publishedAt') or '',
                'content': item.get('content') or ''
            })

        total_results = data.get('totalResults', 0)
        if page * page_size >= total_results:
            break  # All results fetched
        page += 1
        time.sleep(2)  # polite wait

    return all_articles

# Create a folder to save NewsAPI artist articles
os.makedirs('newsapi_artist_articles', exist_ok=True)

# Artists whose fetch came back empty (request error or no matches). Tracked so
# the closing message reports what actually happened instead of always
# claiming success.
newsapi_empty_artists = []

# Loop through all artists and fetch/save articles
for artist in artists:
    print(f"Fetching NewsAPI articles for {artist}...")
    articles = fetch_newsapi_articles(artist, from_date, to_date, pages=5)

    # Save to a JSON file (written even when empty, so every artist has a file)
    safe_artist_name = artist.lower().replace(" ", "_")
    filename = f'newsapi_artist_articles/{safe_artist_name}.json'
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=4)

    if not articles:
        newsapi_empty_artists.append(artist)
        print(f"WARNING: No articles found for {artist}!\n")
    else:
        print(f"Saved {len(articles)} articles for {artist}.\n")

if newsapi_empty_artists:
    print(
        f"Finished, but no articles were collected for "
        f"{len(newsapi_empty_artists)} of {len(artists)} artists: "
        + ", ".join(newsapi_empty_artists)
    )
else:
    print("All NewsAPI artist articles collected successfully.")


Fetching NewsAPI articles for Kanye West...
Error 426: {"status":"error","code":"parameterInvalid","message":"You are trying to request results too far in the past. Your plan permits you to request articles as far back as 2025-03-24, but you have requested 2024-10-25. You may need to upgrade to a paid plan."}

Fetching NewsAPI articles for Sabrina Carpenter...
Error 426: {"status":"error","code":"parameterInvalid","message":"You are trying to request results too far in the past. Your plan permits you to request articles as far back as 2025-03-24, but you have requested 2024-10-25. You may need to upgrade to a paid plan."}

Fetching NewsAPI articles for Kendrick Lamar...
Error 426: {"status":"error","code":"parameterInvalid","message":"You are trying to request results too far in the past. Your plan permits you to request articles as far back as 2025-03-24, but you have requested 2024-10-25. You may need to upgrade to a paid plan."}

Fetching NewsAPI articles for SZA...
Error 426: {"sta

In [4]:
import requests
import json
import time
import os

# GDELT 2.0 DOC API endpoint.
# NOTE: this rebinds BASE_URL, which the NewsAPI cell also defines; after this
# cell runs, any code reading BASE_URL talks to GDELT.
BASE_URL = 'https://api.gdeltproject.org/api/v2/doc/doc'

# Search window as YYYY-MM-DD; fetch_gdelt_articles expands these to
# start-of-day / end-of-day timestamps.
start_date, end_date = '2025-03-01', '2025-04-25'

# Artists to search for, one GDELT query per name.
artists = [
    "Kanye West",
    "Sabrina Carpenter",
    "Kendrick Lamar",
    "SZA",
    "Drake",
    "Chappell Roan",
    "Post Malone",
    "Lady Gaga",
    "Benson Boone",
    "Billie Eilish",
    "The Weeknd",
    "Gracie Abrams",
    "Teddy Swims",
    "Taylor Swift",
    "Bad Bunny",
    "Ariana Grande",
    "Travis Scott",
    "Rihanna",
    "Ed Sheeran",
    "Beyoncé",
    "Bruno Mars",
    "Charli XCX",
]

# Function to fetch English-only articles
def fetch_gdelt_articles(artist, timeout=30):
    """Fetch English-language GDELT articles that match ``artist``.

    Issues a single ``artlist`` query (up to 250 records, newest first) for
    the window given by the module-level ``start_date`` / ``end_date`` and
    keeps only records whose ``language`` field is English.

    Args:
        artist: Search string sent as the ``query`` parameter.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list of dicts with the keys ``seendate``, ``url``, ``title``,
        ``sourcecountry``, ``language`` and ``domain``. The list is empty when
        the request fails, the response is not JSON, or nothing matches;
        failures are printed rather than raised.
    """
    params = {
        'query': artist,
        'mode': 'artlist',
        'format': 'json',
        'maxrecords': 250,
        'sort': 'DateDesc',
        # YYYY-MM-DD -> YYYYMMDDHHMMSS, covering the whole first and last day.
        'startdatetime': start_date.replace('-', '') + '000000',
        'enddatetime': end_date.replace('-', '') + '235959'
    }

    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(BASE_URL, params=params, timeout=timeout)
    except requests.RequestException as e:
        print(f"Exception occurred: {e}")
        return []

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        return []

    try:
        data = response.json()
    except ValueError:
        # The body was not valid JSON; show its start so the cause is visible.
        print(f"Non-JSON response for {artist}: {response.text[:200]}")
        return []

    records = data.get('articles') if isinstance(data, dict) else None
    all_articles = []

    for item in records or []:
        # Filter: only keep English articles (a null language counts as non-English)
        if (item.get('language') or '').lower() != 'english':
            continue

        all_articles.append({
            'seendate': item.get('seendate', ''),
            'url': item.get('url', ''),
            'title': item.get('title', ''),
            'sourcecountry': item.get('sourcecountry', ''),
            'language': item.get('language', ''),
            'domain': item.get('domain', '')
        })

    time.sleep(2)  # pause between successive queries

    return all_articles

# Create a folder to save GDELT artist articles
os.makedirs('gdelt_artist_articles', exist_ok=True)

# Artists whose fetch came back empty (request error or no English matches).
# Tracked so the closing message reports what actually happened instead of
# always claiming success.
gdelt_empty_artists = []

# Loop through all artists and fetch/save articles
for artist in artists:
    print(f"Fetching GDELT articles for {artist}...")
    articles = fetch_gdelt_articles(artist)

    # Save to a JSON file (written even when empty, so every artist has a file)
    safe_artist_name = artist.lower().replace(" ", "_")
    filename = f'gdelt_artist_articles/{safe_artist_name}.json'
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=4)

    if not articles:
        gdelt_empty_artists.append(artist)
        print(f"WARNING: No English articles found for {artist}!\n")
    else:
        print(f"Saved {len(articles)} English articles for {artist}.\n")

if gdelt_empty_artists:
    print(
        f"Finished, but no English articles were collected for "
        f"{len(gdelt_empty_artists)} of {len(artists)} artists: "
        + ", ".join(gdelt_empty_artists)
    )
else:
    print("All English GDELT artist articles collected successfully.")


Fetching and filtering GDELT articles for Kanye West...

Fetching and filtering GDELT articles for Sabrina Carpenter...

Fetching and filtering GDELT articles for Kendrick Lamar...

Fetching and filtering GDELT articles for SZA...


KeyboardInterrupt: 