In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
BASE_URL = "https://www.theguardian.com"
url = BASE_URL  

response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(response.content, 'html.parser')

In [5]:
categories = soup.select("ul.dcr-1d6g26q li a")
category_links = [{"name": c.text.strip(), "url": BASE_URL + c["href"] if c["href"].startswith("/") else c["href"]} for c in categories]


In [6]:
subcategory_links = []
for category in category_links:
    subcategories = soup.select(f'ul#{category["name"].lower()}Links li a')
    subcategory_links += [{"name": s.text.strip(), "url": BASE_URL + s["href"] if s["href"].startswith("/") else s["href"], "category": category["name"]} for s in subcategories]

In [7]:
print("Categories:")
for cat in category_links: print(f"{cat['name']} - {cat['url']}")
print("\nSubcategories:")
for subcat in subcategory_links: print(f"{subcat['category']} > {subcat['name']} - {subcat['url']}")

Categories:
News - https://www.theguardian.com/
Opinion - https://www.theguardian.com/commentisfree
Sport - https://www.theguardian.com/sport
Culture - https://www.theguardian.com/culture
Lifestyle - https://www.theguardian.com/lifeandstyle
News - https://www.theguardian.com/
Opinion - https://www.theguardian.com/commentisfree
Sport - https://www.theguardian.com/sport
Culture - https://www.theguardian.com/culture
Lifestyle - https://www.theguardian.com/lifeandstyle

Subcategories:
News > View all News - https://www.theguardian.com/
News > World news - https://www.theguardian.com/world
News > US politics - https://www.theguardian.com/us-news/us-politics
News > UK news - https://www.theguardian.com/uk-news
News > Climate crisis - https://www.theguardian.com/environment/climate-crisis
News > Middle East - https://www.theguardian.com/world/middleeast
News > Ukraine - https://www.theguardian.com/world/ukraine
News > Environment - https://www.theguardian.com/environment
News > Science - http

In [None]:
def scrape_article_details(article_url, category_name):
    response = requests.get(article_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    headline = soup.select_one("h1")
    headline_text = headline.get_text(strip=True) if headline else None
    
    publish_date = soup.select_one("time") or soup.find('span', class_='dcr-u0h1qy')
    publish_date_text = publish_date.get_text(strip=True) if publish_date else None
    
    content_paragraphs = soup.find_all('p', class_='dcr-s3ycb2')
    content_text = '\n'.join([p.get_text(strip=True) for p in content_paragraphs]) if content_paragraphs else None
    
    return {
        "url": article_url,
        "headline": headline_text,
        "publish_date": publish_date_text,
        "article_content": content_text,
        "category": category_name  
    }

article_details = []

article_links = []
for subcategory in subcategory_links:
    subcategory_url = subcategory["url"]
    response = requests.get(subcategory_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    article_a_tags = soup.find_all(attrs={"class": ["dcr-2yd10d", ""], "href": True})
    for a_tag in article_a_tags:
        if a_tag["href"] is not None:
          article_links.append(a_tag["href"])
    
    for j, article in enumerate(article_links):
        # this will get 10 articles for every subcategory
        if j > 10:
            break
        if article.startswith("#"): continue
        article_url = BASE_URL + article if article.startswith("/") else article
        print(article_url)
        if article_url.startswith("https://www.theguardian.com"):
          article_data = scrape_article_details(article_url, subcategory["category"])
          article_details.append(article_data)

df = pd.DataFrame(article_details)
df.to_excel("guardian_articles.xlsx", index=False)

https://www.theguardian.com/world/live/2025/mar/18/israel-gaza-live-blog-updates-air-strikes-strip-netanyahu-hamas
https://www.theguardian.com/world/2025/mar/18/israel-gaza-strikes-deaths-latest-update
https://www.theguardian.com/world/2025/mar/18/un-human-rights-chief-voices-horror-at-israel-new-gaza-strikes
https://www.theguardian.com/world/2025/mar/18/why-has-israel-resumed-large-scale-airstrikes-on-gaza
https://www.theguardian.com/world/2025/mar/18/trump-and-putin-to-hold-high-stakes-call-on-ukraine
https://www.theguardian.com/science/2025/mar/18/nasa-astronauts-iss-spacex-dragon-capsule-return-time-sunita-williams-barry-wilmore
https://www.theguardian.com/world/2025/mar/18/friedrich-merz-bundestag-vote-germany-borrowing-plan
https://www.theguardian.com/us-news/2025/mar/18/canadian-actor-jasmine-mooney-detained-mexico-border
https://www.theguardian.com/sport/2025/mar/17/conor-mcgregor-anti-immigration-rant-white-house-condemned-by-irish-pm
https://www.theguardian.com/us-news/2025/m