#### The Guardian News ####

In [1]:
import os
import time

import pandas as pd
import requests

In [2]:
# The Guardian key is read from the environment so the secret is never stored
# in (or shared along with) the notebook file.
# NOTE(review): a literal key was previously committed on this line; it should
# be treated as leaked and revoked/rotated.
GUARDIAN_API_KEY = os.environ.get("GUARDIAN_API_KEY", "")
if not GUARDIAN_API_KEY:
    print("WARNING: GUARDIAN_API_KEY is not set; Guardian requests will likely fail.")

# Search endpoint queried by get_guardian_articles().
GUARDIAN_API_URL = "https://content.guardianapis.com/search"

In [3]:
def get_guardian_articles(api_key, query, from_date, to_date, page=1, timeout=30):
    """Fetch one page of search results from The Guardian API.

    Args:
        api_key: Key sent as the ``api-key`` query parameter.
        query: Search text sent as ``q``.
        from_date: Start of the date range, formatted ``YYYY-MM-DD``.
        to_date: End of the date range, formatted ``YYYY-MM-DD``.
        page: Page number to request (defaults to 1).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, the server
        answers with an error status, or the body is not valid JSON. The
        error is printed rather than raised.
    """
    params = {
        "q": query,
        "api-key": api_key,
        "from-date": from_date,
        "to-date": to_date,
        "show-fields": "headline,trailText,publication",  # headline + snippet (trailText)
        "page-size": 50,  # Max allowed is 50
        "page": page,
        "order-by": "newest",
    }
    try:
        response = requests.get(GUARDIAN_API_URL, params=params, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx answers into exceptions
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not JSON on requests versions whose
        # JSON decode error is not a RequestException subclass.
        print(f"Error fetching Guardian data: {e}")
        return None




In [4]:
# Search settings for the Guardian crawl (dates are YYYY-MM-DD).
query = "South Korea"
start_date, end_date = "2024-01-01", "2024-12-31"

all_articles = []
# The real page count is unknown until the first response arrives.
current_page, total_pages = 1, 1

print("Fetching from The Guardian...")
while True:
    data = get_guardian_articles(GUARDIAN_API_KEY, query, start_date, end_date, current_page)

    # Guard clause: a failed request or a body without "response" ends the crawl.
    if not (data and data.get("response")):
        print("  No data or error, stopping.")
        break

    response_data = data["response"]
    total_pages = response_data.get("pages", 1)
    results = response_data.get("results", [])

    for item in results:
        fields = item.get("fields", {})
        published_on = item.get("webPublicationDate", "").split("T")[0]  # keep the date part only
        record = {
            "source": "The Guardian",
            "date": published_on,
            "headline": fields.get("headline", ""),
            "snippet": fields.get("trailText", ""),
            "url": item.get("webUrl", ""),
        }
        all_articles.append(record)
    print(f"  Page {current_page}/{total_pages} fetched, {len(results)} articles.")

    # Stop once the last page reported by the API has been consumed.
    if current_page >= total_pages:
        break

    current_page += 1
    time.sleep(1)  # pause between consecutive calls

guardian_df = pd.DataFrame(all_articles)
print(f"Collected {len(guardian_df)} articles from The Guardian.")

Fetching from The Guardian...
  Page 1/65 fetched, 50 articles.
  Page 2/65 fetched, 50 articles.
  Page 3/65 fetched, 50 articles.
  Page 4/65 fetched, 50 articles.
  Page 5/65 fetched, 50 articles.
  Page 6/65 fetched, 50 articles.
  Page 7/65 fetched, 50 articles.
  Page 8/65 fetched, 50 articles.
  Page 9/65 fetched, 50 articles.
  Page 10/65 fetched, 50 articles.
  Page 11/65 fetched, 50 articles.
  Page 12/65 fetched, 50 articles.
  Page 13/65 fetched, 50 articles.
  Page 14/65 fetched, 50 articles.
  Page 15/65 fetched, 50 articles.
  Page 16/65 fetched, 50 articles.
  Page 17/65 fetched, 50 articles.
  Page 18/65 fetched, 50 articles.
  Page 19/65 fetched, 50 articles.
  Page 20/65 fetched, 50 articles.
  Page 21/65 fetched, 50 articles.
  Page 22/65 fetched, 50 articles.
  Page 23/65 fetched, 50 articles.
  Page 24/65 fetched, 50 articles.
  Page 25/65 fetched, 50 articles.
  Page 26/65 fetched, 50 articles.
  Page 27/65 fetched, 50 articles.
  Page 28/65 fetched, 50 articles.

In [7]:
# Preview the first rows of the Guardian results as plain text.
print(guardian_df.head())

         source        date  \
0  The Guardian  2024-12-31   
1  The Guardian  2024-12-31   
2  The Guardian  2024-12-31   
3  The Guardian  2024-12-31   
4  The Guardian  2024-12-31   

                                            headline  \
0  Trump v the world: Inside the 3 January Guardi...   
1  South Korea plane crash investigators turn to ...   
2  South Korea plane crash investigations focus o...   
3  ‘Sex strikes’ aren’t the feminist win they app...   
4  Green light: the boss of GB Railfreight with a...   

                                             snippet  \
0  Global leaders pivot to face Trump 2.0. <stron...   
1  Experts hope flight recorders will provide ans...   
2  Jeju Air flight burst into flames when it hit ...   
3  The problem with the 4B movement is that it pl...   
4  Teenage trainspotting, plus a passion for the ...   

                                                 url  
0  https://www.theguardian.com/news/2024/dec/31/t...  
1  https://www.theguardian.co

#### The New York Times (Using the API)

In [8]:
# The NYT key is read from the environment so the secret is never stored in
# (or shared along with) the notebook file.
# NOTE(review): a literal key was previously committed on this line; it should
# be treated as leaked and revoked/rotated.
NYT_API_KEY = os.environ.get("NYT_API_KEY", "")
if not NYT_API_KEY:
    print("WARNING: NYT_API_KEY is not set; NYT requests will likely fail.")

# Article Search endpoint queried by get_nyt_articles().
NYT_API_URL = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

In [18]:
def get_nyt_articles(api_key, query, begin_date, end_date, page=0, timeout=30):
    """Fetch one page of results from the NYT Article Search API.

    Args:
        api_key: Key sent as the ``api-key`` query parameter.
        query: Search text sent as ``q``.
        begin_date: Start of the date range (callers in this notebook pass
            ``YYYYMMDD`` strings).
        end_date: End of the date range, same format as ``begin_date``.
        page: Page index to request (defaults to 0).
        timeout: Seconds to wait for the server on each attempt, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, the server
        answers with an error status (including a second 429), or the body is
        not valid JSON. The error is printed rather than raised.
    """
    params = {
        "q": query,
        "api-key": api_key,
        "begin_date": begin_date,
        "end_date": end_date,
        "sort": "newest",
        "fl": "headline,abstract,pub_date,web_url",
        "page": page,
    }
    try:
        response = requests.get(NYT_API_URL, params=params, timeout=timeout)
        # Handle rate limiting explicitly before the generic status check.
        if response.status_code == 429:
            print("  Hit Rate Limit (429). Waiting for 60 seconds before retrying...")
            time.sleep(60)
            # Single retry; a second 429 falls through to raise_for_status().
            response = requests.get(NYT_API_URL, params=params, timeout=timeout)

        response.raise_for_status()  # Turn remaining 4xx/5xx answers into exceptions
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not JSON on requests versions whose
        # JSON decode error is not a RequestException subclass.
        print(f"Error fetching NYT data: {e}")
        return None


In [19]:
# Search settings for the NYT crawl. Dates use the YYYYMMDD form.
query = "South Korea"
start_date = "20240501"  # 1 May 2024
end_date = "20250525"    # 25 May 2025
all_articles_nyt = []  # Separate list so the Guardian results are not touched
current_page = 0
max_pages = 100  # Upper bound on the number of pages requested

print("\nFetching from The New York Times...")
while current_page < max_pages:
    data = get_nyt_articles(NYT_API_KEY, query, start_date, end_date, current_page)

    # No usable payload: report whatever error detail the body carries, then stop.
    if not data or not data.get("response"):
        if data and data.get("message"):
            print(f"  API Message: {data['message']}")
        elif data and 'fault' in data:
            print(f"  API Fault: {data['fault'].get('faultstring')}")
        elif data and 'status' in data and data['status'] != 'OK':
            print(f"  API Error: {data}")
        else:
            print("  No data or error, stopping.")
        break

    # A valid response with an empty page means the results are exhausted;
    # this is the normal end of the crawl, not an error.
    docs = data["response"].get("docs") or []
    if not docs:
        print("  No more articles found.")
        break

    for item in docs:
        # `or` fallbacks guard against fields that are present but null.
        headline = item.get("headline") or {}
        all_articles_nyt.append({
            "source": "New York Times",
            "date": (item.get("pub_date") or "").split("T")[0],  # keep the date part only
            "headline": headline.get("main", ""),
            "snippet": item.get("abstract", ""),
            "url": item.get("web_url", "")
        })
    print(f"  Page {current_page} fetched, {len(docs)} articles.")

    current_page += 1
    # 13 s between calls keeps the rate below 5 calls per minute (60 / 13 < 5).
    time.sleep(13)

nyt_df = pd.DataFrame(all_articles_nyt)
print(f"Collected {len(nyt_df)} articles from The New York Times.")


Fetching from The New York Times...
  Page 0 fetched, 10 articles.
  Page 1 fetched, 10 articles.
  Page 2 fetched, 10 articles.
  Page 3 fetched, 10 articles.
  Page 4 fetched, 10 articles.
  Page 5 fetched, 10 articles.
  Page 6 fetched, 10 articles.
  Page 7 fetched, 10 articles.
  Page 8 fetched, 10 articles.
  Page 9 fetched, 10 articles.
  Page 10 fetched, 10 articles.
  Page 11 fetched, 10 articles.
  Page 12 fetched, 10 articles.
  Page 13 fetched, 10 articles.
  Page 14 fetched, 10 articles.
  Page 15 fetched, 10 articles.
  Page 16 fetched, 10 articles.
  Page 17 fetched, 10 articles.
  Page 18 fetched, 10 articles.
  Page 19 fetched, 10 articles.
  Page 20 fetched, 10 articles.
  Page 21 fetched, 10 articles.
  Page 22 fetched, 10 articles.
  Page 23 fetched, 10 articles.
  Page 24 fetched, 10 articles.
  Page 25 fetched, 10 articles.
  Page 26 fetched, 10 articles.
  Page 27 fetched, 10 articles.
  Page 28 fetched, 10 articles.
  Page 29 fetched, 10 articles.
  Page 30 fet

In [21]:
print(nyt_df.head()) # Preview the first rows of the NYT results as plain text

           source        date  \
0  New York Times  2025-05-23   
1  New York Times  2025-05-22   
2  New York Times  2025-05-22   
3  New York Times  2025-05-22   
4  New York Times  2025-05-21   

                                            headline  \
0  Shock at Harvard After Government Says Interna...   
1  Read the Full ‘Make America Healthy Again’ Report   
2  Southwest Airlines to Tighten Restrictions on ...   
3                                     Formally Picks   
4                    Embracing the Soft Power of Art   

                                             snippet  \
0  Fear and confusion mounted quickly on Thursday...   
1  The White House released an expansive report t...   
2  The carrier will require passengers to keep li...   
3       David J. Kahn returns with an epic Thursday.   
4  Across the globe, more and more cities and cou...   

                                                 url  
0  https://www.nytimes.com/2025/05/22/us/harvard-...  
1  https://www.ny

#### Al Jazeera (Using NewsAPI)

In [33]:


# The NewsAPI key is read from the environment so the secret is never stored
# in (or shared along with) the notebook file.
# NOTE(review): a literal key was previously committed on this line; it should
# be treated as leaked and revoked/rotated.
NEWS_API_KEY = os.environ.get("NEWS_API_KEY", "")
if not NEWS_API_KEY:
    print("WARNING: NEWS_API_KEY is not set; NewsAPI requests will likely fail.")

# "Everything" endpoint queried by get_newsapi_articles().
NEWS_API_URL = "https://newsapi.org/v2/everything"


In [34]:

def get_newsapi_articles(api_key, query, from_date, to_date, source_id, timeout=30):
    """Fetch articles from NewsAPI.org for a specific source.

    Args:
        api_key: Key sent as the ``apiKey`` query parameter.
        query: Search text sent as ``q``.
        from_date: Start of the range, e.g. ``2025-04-26T00:00:00``.
        to_date: End of the range, e.g. ``2025-05-26T12:00:00``.
        source_id: NewsAPI source identifier, e.g. ``'al-jazeera-english'``,
            ``'bbc-news'`` or ``'abc-news-au'``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, the server
        answers with an error status, or the body is not valid JSON. The
        error is printed rather than raised.
    """
    params = {
        "q": query,
        "apiKey": api_key,
        "from": from_date,
        "to": to_date,
        "sources": source_id,
        "pageSize": 100,  # Max 100
        "language": "en",
    }
    try:
        response = requests.get(NEWS_API_URL, params=params, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx answers into exceptions
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not JSON on requests versions whose
        # JSON decode error is not a RequestException subclass.
        print(f"Error fetching NewsAPI data: {e}")
        return None

# --- How to use it ---
# You MUST check NewsAPI.org for available source IDs.
# Let's try 'al-jazeera-english'
# REMEMBER: Likely only last 30 days!
query = "South Korea"
# Adjust dates to be within the last 30 days for testing
start_date_newsapi = "2025-04-26T00:00:00"
end_date_newsapi = "2025-05-26T12:00:00"
source = "al-jazeera-english" # Or 'bbc-news', 'abc-news-au', etc.
all_newsapi_articles = []

print(f"\nFetching from {source} via NewsAPI...")
data = get_newsapi_articles(NEWS_API_KEY, query, start_date_newsapi, end_date_newsapi, source)

if data and data.get("status") == "ok":
    articles = data.get("articles", [])
    for item in articles:
        all_newsapi_articles.append({
            # Prefer the display name from the payload; fall back to the source id.
            "source": item.get("source", {}).get("name", source),
            "date": item.get("publishedAt", "").split("T")[0],  # keep the date part only
            "headline": item.get("title", ""),
            "snippet": item.get("description", ""),
            "url": item.get("url", "")
        })
    print(f"  Collected {len(all_newsapi_articles)} articles.")
else:
    # get_newsapi_articles() returns None on any request failure, so `data`
    # must be checked before reading fields from it.
    status = data.get("status") if data else None
    message = data.get("message") if data else None
    print(f"  Could not fetch articles. Status: {status}, Message: {message}")

newsapi_df = pd.DataFrame(all_newsapi_articles)



Fetching from al-jazeera-english via NewsAPI...
  Collected 21 articles.


In [35]:
# Preview the first rows of the NewsAPI results as plain text.
print(newsapi_df.head())

               source        date  \
0  Al Jazeera English  2025-05-08   
1  Al Jazeera English  2025-04-27   
2  Al Jazeera English  2025-05-02   
3  Al Jazeera English  2025-05-18   
4  Al Jazeera English  2025-05-01   

                                            headline  \
0  North Korea fires missiles off east coast, Sou...   
1  S Korea’s main opposition party taps former ch...   
2  South Korea appoints new acting leader as ex-P...   
3  South Korea’s presidential candidates hold fir...   
4  Former South Korean President Yoon indicted fo...   

                                             snippet  \
0  Seoul’s military says launches may have been t...   
1  Democratic Party of Korea names Lee Jae-myung ...   
2  Han Duck-soo declares candidacy in June 3 elec...   
3  Candidates Lee Jae-myung, the frontrunner, and...   
4  Yoon Suk-yeol already faces ongoing trial on i...   

                                                 url  
0  https://www.aljazeera.com/news/2025/5/8/north

#### The Guardian: 3233 articles
#### The New York Times: 1000 articles
#### Al Jazeera (via NewsAPI): 21 articles
#### Total: 4254 articles!

In [36]:
# Gather whichever per-source DataFrames exist in the current session; a
# source whose cell was not run is simply left out.
dfs_to_combine = []
for label, var_name in (
    ("Guardian", "guardian_df"),
    ("NYT", "nyt_df"),
    ("NewsAPI", "newsapi_df"),
):
    if var_name not in locals():
        continue
    frame = locals()[var_name]
    dfs_to_combine.append(frame)
    print(f"{label} DF found: {len(frame)} rows")

if not dfs_to_combine:
    print("Could not find DataFrames to combine. Please check your variables.")
else:
    # Stack all sources into one frame with a fresh 0..n-1 index.
    all_news_df = pd.concat(dfs_to_combine, ignore_index=True)

    print(f"\nTotal articles combined: {len(all_news_df)}")

    # Show a sample plus the column/dtype summary.
    print("\nSample of combined data:")
    print(all_news_df.head())
    print("\nDataFrame Info:")
    all_news_df.info()

    # Persist the combined dataset; a failed write is reported, not raised.
    try:
        all_news_df.to_csv("south_korea_news_all.csv", index=False)
        print("\nSuccessfully saved combined data to 'south_korea_news_all.csv'")
    except Exception as e:
        print(f"\nError saving CSV: {e}")

Guardian DF found: 3233 rows
NYT DF found: 1000 rows
NewsAPI DF found: 21 rows

Total articles combined: 4254

Sample of combined data:
         source        date  \
0  The Guardian  2024-12-31   
1  The Guardian  2024-12-31   
2  The Guardian  2024-12-31   
3  The Guardian  2024-12-31   
4  The Guardian  2024-12-31   

                                            headline  \
0  Trump v the world: Inside the 3 January Guardi...   
1  South Korea plane crash investigators turn to ...   
2  South Korea plane crash investigations focus o...   
3  ‘Sex strikes’ aren’t the feminist win they app...   
4  Green light: the boss of GB Railfreight with a...   

                                             snippet  \
0  Global leaders pivot to face Trump 2.0. <stron...   
1  Experts hope flight recorders will provide ans...   
2  Jeju Air flight burst into flames when it hit ...   
3  The problem with the 4B movement is that it pl...   
4  Teenage trainspotting, plus a passion for the ...   

   