In [1]:
!pip install feedparser beautifulsoup4 pandas requests



In [2]:
from datetime import datetime
from urllib.parse import urljoin

import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup


In [3]:
def get_full_description(url):
    """Download an article page and return its paragraph text as one string.

    The page is fetched with a 10 second timeout and parsed with
    BeautifulSoup. Text is taken from every ``<p>`` matched by the selectors
    ``div.rich-text p``, ``.bodyTxt p`` and ``.group p``; paragraphs that are
    empty after stripping are dropped and the rest are joined with spaces.

    Args:
        url: Address of the article page to download.

    Returns:
        The joined paragraph text (an empty string when nothing matched), or
        the message ``"Failed to fetch full text: <error>"`` when any
        exception is raised while fetching or parsing. This function does not
        raise.
    """
    try:
        page = requests.get(url, timeout=10)
        document = BeautifulSoup(page.text, 'html.parser')

        pieces = []
        for node in document.select("div.rich-text p, .bodyTxt p, .group p"):
            text = node.get_text(strip=True)
            if text:  # ignore paragraphs that contain only whitespace
                pieces.append(text)
        return " ".join(pieces)
    except Exception as err:
        # Failures are reported in-band as a string; callers receive text
        # either way.
        return f"Failed to fetch full text: {str(err)}"

In [4]:
def extract_date(entry):
    """Return the date of a feed entry as ``YYYY-MM-DD`` when possible.

    For each of the keys ``published``, ``updated`` and ``dc:date`` (in that
    order) the matching ``<key>_parsed`` time tuple is looked up and, if
    present, formatted. Previously ``published_parsed`` was read regardless of
    which key was found, so entries carrying only ``updated`` were never
    formatted.

    Args:
        entry: A mapping-like feed entry supporting ``.get()`` (for example a
            feedparser entry or a plain dict).

    Returns:
        The formatted date string; otherwise the first non-empty raw date
        string found under one of the keys; otherwise ``"N/A"``.
    """
    date_keys = ('published', 'updated', 'dc:date')

    # Preferred: a parsed time tuple whose first six fields are
    # (year, month, day, hour, minute, second).
    for key in date_keys:
        parsed = entry.get(f"{key}_parsed")
        if not parsed:
            continue
        try:
            return datetime(*parsed[:6]).strftime("%Y-%m-%d")
        except (TypeError, ValueError):
            # Malformed tuple: try the next candidate key instead of giving up.
            continue

    # Fallback: hand back whatever raw date text the entry carries.
    for key in date_keys:
        raw = entry.get(key)
        if raw:
            return raw

    return "N/A"

In [5]:
def scrape_dw_rss(rss_urls):
    """Collect one article record per entry from each RSS feed URL.

    For every entry the title, link, date (via ``extract_date``) and a
    description are gathered. The description is the article's full text from
    ``get_full_description``; the plain-text feed summary is used instead when
    the entry has no link, the page yields no text, or the download failed.

    Args:
        rss_urls: Iterable of feed URLs to read.

    Returns:
        A list of dicts with the keys ``Title``, ``Url``, ``Date`` and
        ``Description``, in feed order.
    """
    # get_full_description() reports errors in-band as a string with this
    # prefix. That string is truthy, so without this check the summary
    # fallback would never be used after a failed download.
    failure_prefix = "Failed to fetch full text:"

    articles = []
    for feed_url in rss_urls:
        print(f"📡 Reading feed: {feed_url}")
        feed = feedparser.parse(feed_url)
        for entry in feed.entries:
            title = entry.get("title", "").replace('\n', ' ').strip()
            url = entry.get("link", "").strip()
            # Feed summaries may contain HTML; keep only the text.
            summary = BeautifulSoup(entry.get("summary", ""), "html.parser").get_text().strip()
            date = extract_date(entry)

            # Without a link there is nothing to download.
            full_text = get_full_description(url) if url else ""
            if not full_text or full_text.startswith(failure_prefix):
                full_text = ""

            articles.append({
                "Title": title,
                "Url": url,
                "Date": date,
                "Description": full_text if full_text else summary
            })
    return articles

In [6]:
def scrape_dw_headlines():
    """Scrape the DW headlines page into a list of article records.

    Every ``a.headline`` anchor on the page becomes one record whose
    description is fetched with ``get_full_description``. Anchors without an
    ``href`` are skipped; previously such an anchor raised inside the loop and
    all remaining headlines were lost. Links are resolved against the site
    root, so relative and protocol-relative hrefs yield valid URLs.

    Returns:
        A list of dicts with the keys ``Title``, ``Url``, ``Date`` (today's
        date, since the page is not read for a publication date) and
        ``Description``. If fetching or parsing fails, a message is printed
        and the records collected so far are returned.
    """
    base_url = "https://www.dw.com"
    url = "https://www.dw.com/en/headlines/headlines-en"
    print(f"🌐 Scraping headlines from: {url}")
    articles = []

    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        today = datetime.today().strftime("%Y-%m-%d")

        for item in soup.select("a.headline"):
            link = item.get("href")
            if not link:
                # No target to follow; skip rather than abort the whole run.
                continue

            # urljoin leaves absolute URLs untouched and resolves the rest
            # against the site root.
            full_url = urljoin(base_url, link)
            title = item.get_text(strip=True)

            full_text = get_full_description(full_url)

            articles.append({
                "Title": title,
                "Url": full_url,
                "Date": today,
                "Description": full_text
            })
    except Exception as e:
        # Best-effort scraper: report the problem and return what we have.
        print(f"❌ Failed to scrape headlines: {e}")

    return articles

In [7]:
# ✅ RSS Feeds
rss_feeds = [
    "https://rss.dw.com/rdf/rss-en-world",
    "https://rss.dw.com/rdf/rss-en-europe",
    "https://rss.dw.com/rdf/rss-en-germany",
    "https://rss.dw.com/rdf/rss-en-business",
    "https://rss.dw.com/rdf/rss-en-politics",
    "https://rss.dw.com/rdf/rss-en-science",
    "https://rss.dw.com/rdf/rss-en-top-stories"
]

# 🚀 Run both scrapers
rss_data = scrape_dw_rss(rss_feeds)
headlines_data = scrape_dw_headlines()

# 🧩 Combine both
combined_data = rss_data + headlines_data
# Naming the columns here fixes their order and keeps the frame well-formed
# when nothing was scraped; selecting them afterwards from an empty,
# column-less frame raised KeyError.
df = pd.DataFrame(combined_data, columns=["Title", "Url", "Date", "Description"])

# 🧼 Clean and structure
# Replace every line break with a space so each description stays on one line.
df["Description"] = df["Description"].str.replace(r"[\r\n]", " ", regex=True).str.strip()

# 💾 Save to CSV
# quoting=1 is csv.QUOTE_ALL: every field is quoted. "utf-8-sig" writes a BOM.
df.to_csv("dw_all_articles_combined.csv", index=False, encoding="utf-8-sig", quoting=1)

print(f"\n✅ Done! {len(df)} articles saved to 'dw_all_articles_combined.csv'")


📡 Reading feed: https://rss.dw.com/rdf/rss-en-world
📡 Reading feed: https://rss.dw.com/rdf/rss-en-europe
📡 Reading feed: https://rss.dw.com/rdf/rss-en-germany
📡 Reading feed: https://rss.dw.com/rdf/rss-en-business
📡 Reading feed: https://rss.dw.com/rdf/rss-en-politics
📡 Reading feed: https://rss.dw.com/rdf/rss-en-science
📡 Reading feed: https://rss.dw.com/rdf/rss-en-top-stories
🌐 Scraping headlines from: https://www.dw.com/en/headlines/headlines-en

✅ Done! 25 articles saved to 'dw_all_articles_combined.csv'
