## Step 1: Data Collection

In [None]:
!pip install newsapi-python

Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl.metadata (1.2 kB)
Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: newsapi-python
Successfully installed newsapi-python-0.2.7


In [None]:
import json
import os
from datetime import datetime

import pandas as pd
from newsapi import NewsApiClient

In [None]:
# Search terms to collect; one NewsAPI query is issued per entry
topics = [
    'politics',
    'economy',
    'natural disasters',
    'business',
    'technology',
    'health',
    'sports',
    'education',
]

In [None]:
# Init: build the NewsAPI client from a key supplied through the environment.
# Credentials must not be stored in the notebook itself; any key that was
# ever committed here should be treated as leaked and revoked.
if not os.environ.get('NEWSAPI_KEY'):
    raise RuntimeError(
        "NEWSAPI_KEY is not set; export your NewsAPI key before running this notebook"
    )
newsapi = NewsApiClient(api_key=os.environ['NEWSAPI_KEY'])

In [None]:
# Query NewsAPI once per topic. Every request shares the same filters;
# only the search term changes between iterations.
all_articles = []
shared_query = dict(
    domains='bbc.co.uk,nytimes.com,reuters.com,techcrunch.com,engadget.com',  # comma-separated outlets to search
    from_param='2024-09-12',  # start of the collection window
    to='2024-10-11',  # end of the collection window
    language='en',
    sort_by='relevancy',
    page_size=100,  # articles requested per page
    page=1,  # only the first page is requested for each topic
)
for topic in topics:
    # Each raw response is kept whole, in the same order as `topics`
    articles = newsapi.get_everything(q=topic, **shared_query)
    all_articles.append(articles)

In [None]:
# Show a compact per-topic article count instead of echoing every raw response;
# the full payloads are large and bloat the notebook output.
{topic: len(response.get('articles', [])) for topic, response in zip(topics, all_articles)}

In [None]:
# Persist the raw API responses as pretty-printed JSON
with open('news_articles.json', 'w') as out_file:
    out_file.write(json.dumps(all_articles, indent=4))

print("Articles saved to news_articles.json")

Articles saved to news_articles.json


In [4]:
# Read the saved JSON file back into memory
file_path = './news_articles.json'
with open(file_path, 'r') as source_file:
    data = json.loads(source_file.read())

In [11]:
# One accumulator per output column; each name is bound to its own independent empty list
titles, published_dates, sources, authors, description, urls = ([] for _ in range(6))

In [12]:
# Loop through the articles and extract the required fields.
# Empty the accumulators first so re-running this cell does not append
# a second copy of every article.
for column in (titles, published_dates, sources, authors, description, urls):
    column.clear()

for item in data:
    # A response without an 'articles' key contributes no rows
    for article in item.get('articles', []):
        # 'source' is a nested object; tolerate it being absent or null
        # instead of raising, consistent with the other .get lookups
        source_info = article.get('source') or {}
        titles.append(article.get('title', 'N/A'))
        published_dates.append(article.get('publishedAt', 'N/A'))
        sources.append(source_info.get('name', 'N/A'))
        authors.append(article.get('author', 'N/A'))
        description.append(article.get('description', 'N/A'))
        urls.append(article.get('url', 'N/A'))

In [13]:
# Assemble the table by pairing each column label with its list of values
df = pd.DataFrame(
    dict(
        zip(
            ['Title', 'Published Date', 'Source', 'Author', 'Description', 'URL'],
            [titles, published_dates, sources, authors, description, urls],
        )
    )
)

In [14]:
# Parse 'Published Date' into datetimes; errors='coerce' turns unparseable values into NaT
df['Published Date'] = df['Published Date'].pipe(pd.to_datetime, errors='coerce')

In [15]:
# Write the table as line-delimited JSON: one record per DataFrame row
# NOTE(review): no date_format is passed, so datetime values are written using
# pandas' default encoding — confirm that downstream readers expect it.
output_file_path = './structured_news_articles.json'
df.to_json(path_or_buf=output_file_path, lines=True, orient='records')

# Echo the destination path as the cell result
output_file_path

'./structured_news_articles.json'