In [1]:
import requests
import pandas as pd
import os
import json

# API Key and Base URL
API_KEY = "87849c17e14a4d4ba4b577cdbb004b61"
BASE_URL = "https://newsapi.org/v2/top-headlines"

# Categories 
CATEGORIES = ["health", "business", "finance", "politics", "news"]
TOTAL_ARTICLES = 400

# Function to fetch news articles from NewsAPI
def fetch_news(category, page_size):
    params = {
        "category": category,
        "language": "en",
        "pageSize": page_size,
        "apiKey": API_KEY,
    }
    
    response = requests.get(BASE_URL, params=params)
    if response.status_code == 200:
        return response.json().get("articles", [])
    else:
        print(f"Error fetching data for {category}: {response.status_code}")
        return []

# Fetch articles
all_articles = []
per_category_limit = TOTAL_ARTICLES // len(CATEGORIES)  #Divide evenly

for category in CATEGORIES:
    articles = fetch_news(category, per_category_limit)
    for article in articles:
        article["category"] = category  # Add category to each article
    all_articles.extend(articles)

# Convert to DataFrame
df = pd.DataFrame(all_articles)

# Data Cleaning
df = df.drop(columns=["source", "urlToImage", "content"], errors="ignore")  # Remove unwanted columns
df = df.drop_duplicates(subset=["title"])  # Remove duplicate titles
df = df.dropna(subset=["title", "description", "url"])  # Remove rows with missing data

# Limit to 300 articles 
df = df.head(TOTAL_ARTICLES)

# Display the cleaned DataFrame
df.index = range(1, len(df) + 1)  
display(df)

# Create a folder to store JSON files 
FOLDER_NAME = "news_articles"
os.makedirs(FOLDER_NAME, exist_ok=True)

# Save each article as a JSON file
for idx, article in df.iterrows():
    # Convert row to dictionary
    article_dict = article.to_dict()

    # Define JSON file path 
    file_path = os.path.join(FOLDER_NAME, f"article_{idx + 1}.json")

    # Save as JSON
    with open(file_path, "w", encoding="utf-8") as json_file:
        json.dump(article_dict, json_file, indent=4)

    print(f"Saved: {file_path}")

print(f" All {len(df)} articles saved as JSON files in the '{FOLDER_NAME}' folder.")



Unnamed: 0,author,title,description,url,publishedAt,category
1,Rhiannon Ingle,Everything that happens to your body if you st...,There are a number of health benefits to havin...,https://www.tyla.com/life/sex-and-relationship...,2025-03-10T14:52:26Z,health
2,Kamal Nahas,We may finally understand how metformin lowers...,The common diabetes drug metformin works partl...,https://www.livescience.com/health/diabetes/we...,2025-03-10T14:40:00Z,health
3,Carrie Dennett,Are ultraprocessed foods the dietary demons th...,Registered dietitian nutritionist Carrie Denne...,https://www.seattletimes.com/life/wellness/are...,2025-03-10T13:00:00Z,health
4,,Is Fluoride Lowering Children’s IQ? New Resear...,Fluoride is added to drinking water in many co...,https://scitechdaily.com/is-fluoride-lowering-...,2025-03-10T12:31:30Z,health
5,,Sounds Like Science Fiction: Doctors Implant a...,Doctors in Canada have just performed a surger...,https://dailygalaxy.com/2025/03/sounds-like-sc...,2025-03-10T12:20:00Z,health
...,...,...,...,...,...,...
111,Tim Benz,Tim Benz: How I want the Steelers' dominoes to...,I like the Pittsburgh Steelers&rsquo; decision...,https://triblive.com/sports/tim-benz-how-i-wan...,2025-03-10T10:28:00Z,politics
112,Jamie Carter,This month's 'blood moon' eclipse mirrors one ...,"On March 13-14, 2025, skywatchers in the Ameri...",https://www.space.com/march-2025-eclipse-echoe...,2025-03-10T10:00:00Z,politics
113,Neal Augenstein,Family of missing Loudoun Co. college student ...,"The father of 20-year-old Sudiksha Konanki, of...",https://wtop.com/loudoun-county/2025/03/family...,2025-03-10T09:00:56Z,politics
114,Jason Bittel,Maybe megalodon wasn’t so chonky after all - N...,A new study proposes that the massive ancient ...,https://www.nationalgeographic.com/science/art...,2025-03-09T23:18:19Z,politics


Saved: news_articles/article_2.json
Saved: news_articles/article_3.json
Saved: news_articles/article_4.json
Saved: news_articles/article_5.json
Saved: news_articles/article_6.json
Saved: news_articles/article_7.json
Saved: news_articles/article_8.json
Saved: news_articles/article_9.json
Saved: news_articles/article_10.json
Saved: news_articles/article_11.json
Saved: news_articles/article_12.json
Saved: news_articles/article_13.json
Saved: news_articles/article_14.json
Saved: news_articles/article_15.json
Saved: news_articles/article_16.json
Saved: news_articles/article_17.json
Saved: news_articles/article_18.json
Saved: news_articles/article_19.json
Saved: news_articles/article_20.json
Saved: news_articles/article_21.json
Saved: news_articles/article_22.json
Saved: news_articles/article_23.json
Saved: news_articles/article_24.json
Saved: news_articles/article_25.json
Saved: news_articles/article_26.json
Saved: news_articles/article_27.json
Saved: news_articles/article_28.json
Saved: ne