In [5]:
import feedparser
import pandas as pd
from datetime import datetime
import requests
from bs4 import BeautifulSoup

def get_full_description(url):
    """Download an article page and return the text of its body paragraphs.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The stripped text of every ``<p>`` matched by the body selectors,
        joined with newlines (an empty string when nothing matches). If the
        request fails, the server answers with an HTTP error status, or
        parsing raises, a string starting with
        ``"Failed to fetch full text: "`` is returned instead of raising.
    """
    try:
        response = requests.get(url, timeout=10)
        # Without this check a 4xx/5xx error page would be parsed as if it
        # were the article, silently yielding empty or unrelated text.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.select("div.rich-text p, .bodyTxt p, .group p")
        return "\n".join(p.get_text(strip=True) for p in paragraphs)
    except Exception as e:
        # Best effort: one bad article must not abort the whole scrape, so the
        # failure is reported through the return value.
        return f"Failed to fetch full text: {str(e)}"

def scrape_dw_rss(rss_urls):
    """Collect one record per entry from each of the given RSS feeds.

    Args:
        rss_urls: Iterable of feed URLs handed to ``feedparser.parse``.

    Returns:
        A list of dicts with the keys ``"Title"``, ``"Url"``, ``"Date"`` and
        ``"Description"``. ``"Description"`` holds the full article text when
        it could be fetched, otherwise the plain-text RSS summary.
    """
    # get_full_description() reports failure by returning a string with this
    # prefix; it is truthy, so it must be detected explicitly or the summary
    # fallback below would never trigger.
    failure_prefix = "Failed to fetch full text:"

    articles = []
    for feed_url in rss_urls:
        print(f"🔍 Parsing feed: {feed_url}")
        feed = feedparser.parse(feed_url)
        for entry in feed.entries:
            # .get() keeps a malformed entry (no title/link) from raising.
            title = entry.get("title", "")
            url = entry.get("link", "")
            description = BeautifulSoup(entry.get("summary", ""), "html.parser").get_text()
            date = entry.get("published", "N/A")

            # No link means there is nothing to download.
            full_text = get_full_description(url) if url else ""
            if full_text.startswith(failure_prefix):
                full_text = ""

            articles.append({
                "Title": title,
                "Url": url,
                "Date": date,
                "Description": full_text or description
            })
    return articles

# ✅ List of RSS feeds to scrape (add more if needed)
rss_feeds = [
    "https://rss.dw.com/rdf/rss-en-world",
    "https://rss.dw.com/rdf/rss-en-europe",
    "https://rss.dw.com/rdf/rss-en-germany",
    "https://rss.dw.com/rdf/rss-en-business",
    "https://rss.dw.com/rdf/rss-en-politics",
    "https://rss.dw.com/rdf/rss-en-science",
    "https://rss.dw.com/rdf/rss-en-top-stories",
]

# Parses every feed and downloads each linked article (one request per entry).
data = scrape_dw_rss(rss_feeds)

# Passing `columns=` fixes the column order and keeps the frame valid when
# nothing was scraped; selecting the columns from an empty frame afterwards
# would raise KeyError.
df = pd.DataFrame(data, columns=["Title", "Url", "Date", "Description"])

# utf-8-sig writes a BOM so spreadsheet tools detect the encoding.
df.to_csv("dw_articles_full.csv", index=False, encoding="utf-8-sig")

print(f"\n✅ Done! Saved {len(df)} articles to 'dw_articles_full.csv'")


🔍 Parsing feed: https://rss.dw.com/rdf/rss-en-world
🔍 Parsing feed: https://rss.dw.com/rdf/rss-en-europe
🔍 Parsing feed: https://rss.dw.com/rdf/rss-en-germany
🔍 Parsing feed: https://rss.dw.com/rdf/rss-en-business
🔍 Parsing feed: https://rss.dw.com/rdf/rss-en-politics
🔍 Parsing feed: https://rss.dw.com/rdf/rss-en-science
🔍 Parsing feed: https://rss.dw.com/rdf/rss-en-top-stories

✅ Done! Saved 22 articles to 'dw_articles_full.csv'


In [6]:
import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# 🔍 Get full article description from URL
def get_full_description(url):
    """Download an article page and return its body text on a single line.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The stripped text of every non-empty ``<p>`` matched by the body
        selectors, joined with single spaces (an empty string when nothing
        matches). If the request fails, the server answers with an HTTP error
        status, or parsing raises, a string starting with
        ``"Failed to fetch full text: "`` is returned instead of raising.
    """
    try:
        response = requests.get(url, timeout=10)
        # Without this check a 4xx/5xx error page would be parsed as if it
        # were the article, silently yielding empty or unrelated text.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.select("div.rich-text p, .bodyTxt p, .group p")
        # Extract each paragraph's text once, then drop the empty ones.
        texts = (p.get_text(strip=True) for p in paragraphs)
        return " ".join(text for text in texts if text)
    except Exception as e:
        # Best effort: one bad article must not abort the whole scrape, so the
        # failure is reported through the return value.
        return f"Failed to fetch full text: {str(e)}"

# 📥 Scrape from list of DW RSS feeds
def scrape_dw_rss(rss_urls):
    """Collect one cleaned record per entry from each of the given RSS feeds.

    Args:
        rss_urls: Iterable of feed URLs handed to ``feedparser.parse``.

    Returns:
        A list of dicts with the keys ``"Title"``, ``"Url"``, ``"Date"`` and
        ``"Description"``. ``"Description"`` holds the full article text when
        it could be fetched, otherwise the plain-text RSS summary.
    """
    # get_full_description() reports failure by returning a string with this
    # prefix; it is truthy, so it must be detected explicitly or the summary
    # fallback below would never trigger.
    failure_prefix = "Failed to fetch full text:"

    articles = []
    for feed_url in rss_urls:
        print(f"🔍 Reading feed: {feed_url}")
        feed = feedparser.parse(feed_url)
        for entry in feed.entries:
            title = entry.get("title", "").replace('\n', ' ').strip()
            url = entry.get("link", "").strip()
            summary = BeautifulSoup(entry.get("summary", ""), "html.parser").get_text().strip()
            date = entry.get("published", "N/A")

            # No link means there is nothing to download.
            full_text = get_full_description(url) if url else ""
            if full_text.startswith(failure_prefix):
                full_text = ""

            articles.append({
                "Title": title,
                "Url": url,
                "Date": date,
                "Description": full_text if full_text else summary
            })
    return articles

# ✅ RSS Feeds
rss_feeds = [
    "https://rss.dw.com/rdf/rss-en-world",
    "https://rss.dw.com/rdf/rss-en-europe",
    "https://rss.dw.com/rdf/rss-en-germany",
    "https://rss.dw.com/rdf/rss-en-business",
    "https://rss.dw.com/rdf/rss-en-politics",
    "https://rss.dw.com/rdf/rss-en-science",
    "https://rss.dw.com/rdf/rss-en-top-stories"
]

# 🚀 Run (parses every feed and downloads each linked article)
data = scrape_dw_rss(rss_feeds)

# 🧹 Clean DataFrame
# Passing `columns=` fixes the column order and keeps the frame valid when
# nothing was scraped; selecting the columns from an empty frame afterwards
# would raise KeyError.
df = pd.DataFrame(data, columns=["Title", "Url", "Date", "Description"])
# Flatten line breaks so every record stays on one CSV row; regex=False makes
# the replacement a literal one regardless of the pandas default.
df["Description"] = (
    df["Description"]
    .str.replace('\n', ' ', regex=False)
    .str.replace('\r', ' ', regex=False)
    .str.strip()
)

# 💾 Save CSV (Excel-friendly)
df.to_csv("dw_articles_clean.csv", index=False, encoding="utf-8-sig", quoting=1)  # quoting=1 means QUOTE_ALL

print(f"\n✅ Done! {len(df)} articles saved to 'dw_articles_clean.csv'")


🔍 Reading feed: https://rss.dw.com/rdf/rss-en-world
🔍 Reading feed: https://rss.dw.com/rdf/rss-en-europe
🔍 Reading feed: https://rss.dw.com/rdf/rss-en-germany
🔍 Reading feed: https://rss.dw.com/rdf/rss-en-business
🔍 Reading feed: https://rss.dw.com/rdf/rss-en-politics
🔍 Reading feed: https://rss.dw.com/rdf/rss-en-science
🔍 Reading feed: https://rss.dw.com/rdf/rss-en-top-stories

✅ Done! 22 articles saved to 'dw_articles_clean.csv'


In [7]:
import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# 🔍 Get full article description from URL
def get_full_description(url):
    """Fetch an article page and return its paragraph text as one line.

    Args:
        url: Address of the article page to fetch.

    Returns:
        Space-joined, stripped text of the non-empty ``<p>`` elements matched
        by the body selectors; ``""`` when none match. Any exception raised
        while requesting or parsing (including an HTTP error status) is
        converted into a string beginning with
        ``"Failed to fetch full text: "``.
    """
    try:
        response = requests.get(url, timeout=10)
        # Turn 4xx/5xx answers into an exception; otherwise the error page's
        # markup would be scraped as though it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.select("div.rich-text p, .bodyTxt p, .group p")
        # Extract each paragraph's text once, then drop the empty ones.
        texts = (p.get_text(strip=True) for p in paragraphs)
        return " ".join(text for text in texts if text)
    except Exception as e:
        # Best effort: a single failing article is reported via the return
        # value so the surrounding scrape can continue.
        return f"Failed to fetch full text: {str(e)}"

# 🗓️ Extract date with fallback
def extract_date(entry):
    """Return the entry's date as ``YYYY-MM-DD``, trying several fields.

    The keys ``published``, ``updated`` and ``dc:date`` are checked in that
    order and the first one present in ``entry`` decides the result.

    Args:
        entry: Mapping-like feed entry supporting ``in`` and ``.get()``.

    Returns:
        The date formatted as ``"%Y-%m-%d"`` when the matching
        ``<key>_parsed`` time tuple is available and valid; otherwise the raw
        value stored under the key; ``"N/A"`` when none of the keys exists.
    """
    # NOTE(review): 'dc:date' is kept from the original key list; confirm the
    # feed parser ever exposes an entry key with that exact name.
    for key in ['published', 'updated', 'dc:date']:
        if key in entry:
            # Use the parsed tuple that belongs to the key that matched, not
            # always `published_parsed`, so entries carrying only `updated`
            # are formatted too.
            parsed = entry.get(f"{key}_parsed")
            if parsed is not None:
                try:
                    return datetime(*parsed[:6]).strftime("%Y-%m-%d")
                except (TypeError, ValueError, OverflowError):
                    # Unusable time tuple: fall through to the raw value.
                    pass
            return entry.get(key, "N/A")
    return "N/A"

# 📥 Scrape from DW RSS feeds
def scrape_dw_rss(rss_urls):
    """Build one record per RSS entry, including a normalized date.

    Args:
        rss_urls: Iterable of feed URLs handed to ``feedparser.parse``.

    Returns:
        A list of dicts with the keys ``"Title"``, ``"Url"``, ``"Date"`` and
        ``"Description"``. ``"Date"`` comes from ``extract_date``;
        ``"Description"`` holds the full article text when it could be
        fetched, otherwise the plain-text RSS summary.
    """
    # get_full_description() signals failure with a string starting with this
    # prefix. Being truthy, it has to be recognized explicitly, or the error
    # message would be stored instead of the summary.
    failure_prefix = "Failed to fetch full text:"

    articles = []
    for feed_url in rss_urls:
        print(f"🔍 Reading feed: {feed_url}")
        feed = feedparser.parse(feed_url)
        for entry in feed.entries:
            title = entry.get("title", "").replace('\n', ' ').strip()
            url = entry.get("link", "").strip()
            summary = BeautifulSoup(entry.get("summary", ""), "html.parser").get_text().strip()
            date = extract_date(entry)

            # No link means there is nothing to download.
            full_text = get_full_description(url) if url else ""
            if full_text.startswith(failure_prefix):
                full_text = ""

            articles.append({
                "Title": title,
                "Url": url,
                "Date": date,
                "Description": full_text if full_text else summary
            })
    return articles

# ✅ RSS Feeds
rss_feeds = [
    "https://rss.dw.com/rdf/rss-en-world",
    "https://rss.dw.com/rdf/rss-en-europe",
    "https://rss.dw.com/rdf/rss-en-germany",
    "https://rss.dw.com/rdf/rss-en-business",
    "https://rss.dw.com/rdf/rss-en-politics",
    "https://rss.dw.com/rdf/rss-en-science",
    "https://rss.dw.com/rdf/rss-en-top-stories"
]

# 🚀 Run (parses every feed and downloads each linked article)
data = scrape_dw_rss(rss_feeds)

# 🧹 Clean and structure
# Passing `columns=` fixes the column order and keeps the frame valid when
# nothing was scraped; selecting the columns from an empty frame afterwards
# would raise KeyError.
df = pd.DataFrame(data, columns=["Title", "Url", "Date", "Description"])
# Flatten line breaks so every record stays on one CSV row; regex=False makes
# the replacement a literal one regardless of the pandas default.
df["Description"] = (
    df["Description"]
    .str.replace('\n', ' ', regex=False)
    .str.replace('\r', ' ', regex=False)
    .str.strip()
)

# 💾 Save to CSV
# quoting=1 is csv.QUOTE_ALL: every field is quoted, so commas and quotes
# inside article text cannot break the column layout.
df.to_csv("dw_articles_with_dates.csv", index=False, encoding="utf-8-sig", quoting=1)

print(f"\n✅ Done! {len(df)} articles saved with dates to 'dw_articles_with_dates.csv'")


🔍 Reading feed: https://rss.dw.com/rdf/rss-en-world
🔍 Reading feed: https://rss.dw.com/rdf/rss-en-europe
🔍 Reading feed: https://rss.dw.com/rdf/rss-en-germany
🔍 Reading feed: https://rss.dw.com/rdf/rss-en-business
🔍 Reading feed: https://rss.dw.com/rdf/rss-en-politics
🔍 Reading feed: https://rss.dw.com/rdf/rss-en-science
🔍 Reading feed: https://rss.dw.com/rdf/rss-en-top-stories

✅ Done! 22 articles saved with dates to 'dw_articles_with_dates.csv'
