In [1]:
# Code for scraping WELT

In [None]:
import requests
import time
import json
import re
import pandas as pd
from bs4 import BeautifulSoup
import spacy
nlp = spacy.load("de_core_news_md")
# Search terms; the scraping loop issues one WELT search per entry.
CLIMATE_KEYWORDS = [
    "klima", "klimakrise", "klimawandel", "erderwärmung", "globale erwärmung",
    "treibhauseffekt", "treibhausgas", "co2", "kohlendioxid", "emission",
    "emissionen", "energiewende", "erneuerbare energien", "klimaschutz", "klimapolitik",
    "hitzewelle", "dürre", "hochwasser", "wasserknappheit", "starkregen",
    "waldbrand", "gletscherschmelze", "artensterben", "klimaneutral", "emissionshandel",
]

# Terms used by strict_relevance() to decide whether an article is kept.
STRICT_KEYWORDS = [
    "klima", "klimakrise", "klimawandel", "erderwärmung", "co2",
    "kohlendioxid", "emission", "emissionen", "energiewende", "klimaschutz",
    "klimaneutral", "treibhauseffekt", "treibhausgas", "hitzewelle", "dürre",
    "hochwasser", "wasserknappheit", "starkregen", "waldbrand",
]

# Category label -> keywords counted by classify_article().
CLASSIFICATION_KEYWORDS = {
    "Climate_Policy": [
        "gesetz", "politik", "regierung", "beschluss", "verordnung", "ziel",
        "klimaziel", "bundestag", "eu", "parlament", "ministerium",
    ],
    "Climate_Science": [
        "studie", "forschung", "wissenschaft", "ipcc", "daten",
        "analyse", "bericht", "modell", "forscher",
    ],
    "Energy_Transition": [
        "energiewende", "erneuerbar", "solar", "windkraft",
        "kohlekraft", "atomkraft", "wasserstoff", "stromnetz",
    ],
    "Climate_Economy": [
        "kosten", "industrie", "wirtschaft", "markt",
        "unternehmen", "investition", "preis", "arbeitsplätze",
    ],
    "Climate_Activism": [
        "protest", "demonstration", "aktivisten", "fridays for future", "ngo", "bewegung",
    ],
    "Climate_Impact": [
        "hitzewelle", "dürre", "hochwasser", "überschwemmung",
        "starkregen", "waldbrand", "extremwetter",
    ],
    "Climate_Geopolitics": [
        "china", "usa", "eu", "russland", "international",
        "global", "weltweit", "g7", "g20",
    ],
    "Climate_Opinion": [
        "meinung", "kommentar", "kolumne", "leitartikel", "debatte", "gastbeitrag",
    ],
}

TIME_WINDOW = "y4"  # sent as the search API's "restrictBy" parameter
OFFSET_STEP = 10  # amount the search offset advances per request
OUTPUT_CSV = "scrapped_welt.csv"

# HTTP headers sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept": "application/json",
    "Referer": "https://www.welt.de/suche",
}

# Constant column values written to every output row.
SOURCE = "WELT"
LANGUAGE = "de"
def clean_text(text):
    """Collapse every whitespace run in ``text`` to one space and trim the ends.

    Returns ``None`` for empty input and for non-string values. Fields read
    from JSON-LD or the search API are not guaranteed to be strings, and
    ``re.sub`` would raise ``TypeError`` on them.
    """
    if not text or not isinstance(text, str):
        return None
    return re.sub(r"\s+", " ", text).strip()
def strict_relevance(title, content):
    """Decide whether an article is climate-related enough to keep.

    An article passes when its title contains any STRICT_KEYWORDS entry, or
    when at least three distinct entries occur in the first 300
    whitespace-separated words of the content. Matching is case-insensitive
    substring matching.
    """
    lowered_title = title.lower()
    lead_words = content.lower().split()[:300]
    lead_text = " ".join(lead_words)

    if any(term in lowered_title for term in STRICT_KEYWORDS):
        return True

    matched_terms = [term for term in STRICT_KEYWORDS if term in lead_text]
    return len(matched_terms) >= 3
def extract_jsonld(soup):
    """Read article body, headline and first author from a page's JSON-LD.

    Scans every ``<script type="application/ld+json">`` tag and returns
    ``(content, headline, author)`` for the first node typed ``NewsArticle``.
    A single top-level object, a top-level list of nodes and an ``@graph``
    container are all searched, and ``@type`` may be a string or a list.
    Authors may be objects with a ``name`` or plain strings.

    Returns ``(None, None, None)`` when no ``NewsArticle`` node is found.
    """
    def as_text(value):
        # clean_text expects a string; treat any other type as missing.
        return clean_text(value) if isinstance(value, str) else None

    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string)
        except (TypeError, ValueError):
            # TypeError: script.string is None; ValueError: malformed JSON.
            continue

        if isinstance(data, list):
            candidates = data
        elif isinstance(data, dict):
            graph = data.get("@graph")
            candidates = [data] + (graph if isinstance(graph, list) else [])
        else:
            continue

        for node in candidates:
            if not isinstance(node, dict):
                continue
            node_type = node.get("@type")
            node_types = node_type if isinstance(node_type, list) else [node_type]
            if "NewsArticle" not in node_types:
                continue

            author = node.get("author")
            if isinstance(author, list):
                author = author[0] if author else None
            if isinstance(author, dict):
                author = author.get("name")

            return (
                as_text(node.get("articleBody")),
                as_text(node.get("headline")),
                as_text(author),
            )
    return None, None, None
def extract_fallback_content(soup):
    """Build article text from ``<p>`` tags when JSON-LD gave no body.

    Paragraphs shorter than 80 characters, or containing one of the noise
    markers below, are dropped. The remaining paragraphs are joined and
    cleaned; the result is returned only if it is longer than 800 characters,
    otherwise ``None``.
    """
    noise_markers = ("anzeige", "newsletter", "datenschutz", "jetzt abonnieren", "quelle:", "bild:")

    def is_usable(paragraph_text):
        if len(paragraph_text) < 80:
            return False
        lowered = paragraph_text.lower()
        return not any(marker in lowered for marker in noise_markers)

    stripped = (tag.get_text().strip() for tag in soup.find_all("p"))
    kept = [paragraph for paragraph in stripped if is_usable(paragraph)]

    content = clean_text(" ".join(kept))
    if content and len(content) > 800:
        return content
    return None
def analyze_text(text):
    """Run the spaCy pipeline over ``text`` and summarise it.

    Returns a 4-tuple: the sorted unique texts of entities labelled ``PER``
    or ``ORG``, how many there are, the number of sentences, and the
    character length of ``text``.
    """
    parsed = nlp(text)
    unique_names = {entity.text for entity in parsed.ents if entity.label_ in ("PER", "ORG")}
    actors = sorted(unique_names)
    n_sentences = sum(1 for _ in parsed.sents)
    return actors, len(actors), n_sentences, len(text)
def classify_article(title, intro, content):
    """Assign the CLASSIFICATION_KEYWORDS category with the most keyword hits.

    The text scored is the title, the intro (if any) and the first 300 words
    of the content, lower-cased. Each keyword counts at most once per
    category. Returns ``"Other_Climate"`` when no keyword matches.

    Keywords of three characters or fewer ("eu", "usa", "ngo", "g7", "g20")
    must appear as whole words. As plain substrings they match ordinary
    German words such as "Deutschland", "neue" or "zusammen" and inflate the
    policy and geopolitics scores of almost every article.
    """
    lead = " ".join(content.split()[:300])
    # "intro or ''" keeps a missing intro from being scored as the text "None".
    text = f"{title} {intro or ''} {lead}".lower()

    def matches(keyword):
        if len(keyword) <= 3:
            return re.search(rf"(?<!\w){re.escape(keyword)}(?!\w)", text) is not None
        return keyword in text

    scores = {
        category: sum(1 for keyword in keywords if matches(keyword))
        for category, keywords in CLASSIFICATION_KEYWORDS.items()
    }
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else "Other_Climate"
# Main WELT scraping loop: for every search keyword, page through the search
# API, download each new article, filter for relevance and collect one row
# per kept article. Network failures skip the affected page or article
# instead of aborting the run and losing the rows collected so far.
rows = []
seen_urls = set()
for keyword in CLIMATE_KEYWORDS:
    print(f"\nKeyword: {keyword}")
    offset = 0
    while True:
        api_url = f"https://www.welt.de/api/search/{keyword}"
        params = {"offset": offset, "restrictBy": TIME_WINDOW}
        try:
            r = requests.get(api_url, params=params, headers=HEADERS, timeout=15)
            r.raise_for_status()
            data = r.json()
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers a non-JSON response body.
            print(f"Search failed for '{keyword}' at offset {offset}: {exc}")
            break
        items = data.get("items", []) if isinstance(data, dict) else []
        if not items:
            break
        for item in items:
            url = item.get("url")
            if not url or url in seen_urls:
                continue
            seen_urls.add(url)
            try:
                page = requests.get(url, headers=HEADERS, timeout=15)
                page.raise_for_status()
            except requests.RequestException as exc:
                print(f"Skipping {url}: {exc}")
                continue
            soup = BeautifulSoup(page.text, "html.parser")
            content, headline, author = extract_jsonld(soup)
            if not content:
                content = extract_fallback_content(soup)
            if not headline:
                h1 = soup.find("h1")
                headline = clean_text(h1.get_text()) if h1 else None
            if not content or not headline:
                continue
            if not strict_relevance(headline, content):
                continue
            actors, actor_count, sent_count, length = analyze_text(content)
            article_class = classify_article(headline, item.get("intro"), content)
            rows.append({
                "URL": url,
                "Source": SOURCE,
                "Language": LANGUAGE,
                "Published_Date": item.get("publicationDate"),
                "Keyword_Matched": keyword,
                "Article_Classification": article_class,
                "Headline": headline,
                "Intro": clean_text(item.get("intro")),
                "Content": content,
                "Content_Length": length,
                "Sentence_Count": sent_count,
                "Actors": ", ".join(actors),
                "Actor_Count": actor_count,
                "Author": author
            })
            print("Saved")
            time.sleep(1)
        offset += OFFSET_STEP
        time.sleep(0.5)
df = pd.DataFrame(rows)
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
print(f"File saved: {OUTPUT_CSV}")
print(f"Total number of articles scrapped: {len(df)}")

In [2]:
# Code for scraping FAZ

In [None]:
#importing libraries
import time
import random
import re
import requests
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import spacy
nlp = spacy.load("de_core_news_md")
# Search results whose URL contains any of these fragments are not scraped.
INVALID_URL_PATTERNS = [
    "/video/", "/podcast/", "/audio/", "/bilder/", "/einspruch/",
    "/interaktiv/", "/explainer/", "fazarchiv.faz.net", "/payment/", "/abo/",
]

MAX_RETRIES_PER_URL = 2

# Terms counted by is_climate_article().
CLIMATE_KEYWORDS_DE = [
    "klima", "klimawandel", "klimakrise", "erderwärmung", "co2", "emission",
    "energiewende", "hochwasser", "dürre", "hitzewelle", "klimaschutz",
]

# Category label -> keywords counted by classify_article().
CLASSIFICATION_KEYWORDS = {
    "Climate_Policy": [
        "gesetz", "politik", "regierung", "beschluss", "verordnung",
        "klimaziel", "bundestag", "parlament", "ministerium", "regulierung",
    ],
    "Climate_Science": [
        "studie", "forschung", "wissenschaft", "ipcc", "daten",
        "analyse", "bericht", "modell", "forscher",
    ],
    "Energy_Transition": [
        "energiewende", "erneuerbar", "solar", "windkraft",
        "kohlekraft", "atomkraft", "wasserstoff", "stromnetz",
    ],
    "Climate_Economy": [
        "kosten", "industrie", "wirtschaft", "markt",
        "unternehmen", "investition", "preis", "arbeitsplätze",
    ],
    "Climate_Activism": [
        "protest", "demonstration", "aktivisten", "fridays for future", "ngo", "bewegung",
    ],
    "Climate_Impact": [
        "hitzewelle", "dürre", "hochwasser", "überschwemmung", "starkregen", "waldbrand",
    ],
    "Climate_Geopolitics": [
        "china", "usa", "russland", "international", "global",
        "weltweit", "g7", "g20", "eu",
    ],
    "Climate_Opinion": [
        "meinung", "kommentar", "kolumne", "leitartikel", "debatte", "gastbeitrag",
    ],
}

FAZ_API = "https://www.faz.net/api/faz-content-search"
BASE_KEYWORDS = ["klima", "klimawandel", "klimakrise", "co2"]  # search queries
MAX_PAGES = 5  # result pages requested per query
MIN_YEAR = 2022  # results dated before this year are skipped
OUTPUT_FILE = "scrapped_faz.csv"

# Headers for the search API requests.
API_HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "application/json",
    "Referer": "https://www.faz.net/suche/",
}
def extract_actors(text):
    """Return the sorted unique entity texts labelled ``PER`` or ``ORG``.

    Only the first 4000 characters of ``text`` are passed to the spaCy
    pipeline.
    """
    wanted_labels = {"PER", "ORG"}
    parsed = nlp(text[:4000])
    names = {entity.text for entity in parsed.ents if entity.label_ in wanted_labels}
    return sorted(names)
def sentence_count(text):
    """Return the number of sentences spaCy finds in ``text``."""
    parsed = nlp(text)
    return len(list(parsed.sents))
# Chrome launch options shared by every driver start_driver() creates.
# The arguments are added in the listed order.
chrome_options = Options()
for _chrome_arg in (
    "--disable-blink-features=AutomationControlled",
    "--disable-extensions",
    "--disable-notifications",
    "--start-maximized",
    "user-agent=Mozilla/5.0",
    r"--user-data-dir=C:/selenium_chrome_profile",
):
    chrome_options.add_argument(_chrome_arg)

# Module-level browser handles; start_driver() assigns them.
driver = None
wait = None
def start_driver():
    """Create a Chrome WebDriver and store it in the module globals.

    Sets ``driver`` to a new ``webdriver.Chrome`` built with
    ``chrome_options`` and ``wait`` to a ``WebDriverWait`` on that driver
    with a 25-second timeout.
    """
    global driver, wait
    chromedriver = ChromeDriverManager().install()
    service = Service(chromedriver)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    wait = WebDriverWait(driver, 25)
def restart_driver():
    """Quit the current browser, ignoring errors, then start a new one.

    Quitting is best-effort: the driver may be dead or never started. Only
    ``Exception`` is swallowed, so Ctrl-C still interrupts the run.
    """
    global driver
    try:
        driver.quit()
    except Exception:
        pass
    start_driver()
# Create the initial browser session; this sets the global driver and wait.
start_driver()
def human_sleep(a=2, b=4):
    """Pause for a random duration drawn uniformly between ``a`` and ``b`` seconds."""
    delay = random.uniform(a, b)
    time.sleep(delay)
def safe_get(url, timeout=20):
    """Load ``url`` in the global driver; return ``True`` on success.

    ``timeout`` is applied as the page-load timeout in seconds. Any failure,
    including ``TimeoutException``, is reported and turned into ``False`` so
    the caller can skip the URL.
    """
    try:
        driver.set_page_load_timeout(timeout)
        driver.get(url)
    except Exception as exc:
        print(f"Could not load {url}: {type(exc).__name__}")
        return False
    return True
def is_valid_year(api_doc):
    """Return ``True`` when the document's ``date`` starts with a year >= MIN_YEAR.

    The first four characters of ``api_doc["date"]`` are parsed as the year.
    A missing, non-string or unparsable date yields ``False``.
    """
    try:
        year = int(api_doc["date"][:4])
    except (KeyError, TypeError, ValueError):
        return False
    return year >= MIN_YEAR
def is_climate_article(title, content):
    """Return ``True`` when the article looks climate-related.

    An article passes when the CLIMATE_KEYWORDS_DE entries occur at least
    three times in total across title and content (overlapping terms each
    count), or when any entry occurs in the title. Matching is
    case-insensitive.
    """
    combined = f"{title} {content}".lower()
    total_occurrences = 0
    for term in CLIMATE_KEYWORDS_DE:
        total_occurrences += combined.count(term)
    if total_occurrences >= 3:
        return True
    lowered_title = title.lower()
    return any(term in lowered_title for term in CLIMATE_KEYWORDS_DE)
def classify_article(title, content):
    """Assign the CLASSIFICATION_KEYWORDS category with the most occurrences.

    Title and content are lower-cased and every occurrence of every keyword
    is counted per category. Returns ``"Other_Climate"`` when nothing matches.

    Keywords of three characters or fewer ("eu", "usa", "ngo", "g7", "g20")
    are counted only as whole words. As substrings they occur inside
    ordinary German words such as "Deutschland", "neue" or "zusammen" and
    would dominate the geopolitics score.
    """
    text = f"{title} {content}".lower()

    def occurrences(term):
        if len(term) <= 3:
            return len(re.findall(rf"(?<!\w){re.escape(term)}(?!\w)", text))
        return text.count(term)

    scores = {
        category: sum(occurrences(term) for term in terms)
        for category, terms in CLASSIFICATION_KEYWORDS.items()
    }
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else "Other_Climate"
def extract_article():
    """Collect the article paragraphs from the page currently loaded.

    Paragraphs shorter than 60 characters or containing promotional text are
    skipped. Returns ``(intro, content)``, where ``intro`` is the first kept
    paragraph and ``content`` all kept paragraphs joined by spaces, or
    ``(None, None)`` when fewer than five paragraphs remain.

    Subscription text is detected on words ending in "abo"/"abos" or
    starting with "abonn". A bare "abo" substring test would also drop
    paragraphs containing "Labor", "Sabotage" or "Kollaboration".
    """
    promo_pattern = re.compile(r"anzeige|newsletter|\w*abos?\b|\babonn\w*")
    clean = []
    for p in driver.find_elements(By.TAG_NAME, "p"):
        t = p.text.strip()
        if len(t) < 60:
            continue
        if promo_pattern.search(t.lower()):
            continue
        clean.append(t)
    if len(clean) < 5:
        return None, None
    return clean[0], " ".join(clean)
def extract_author(api_doc):
    """Return the author string from the ``metis_authors_full_names`` field.

    A list is joined with ``", "``, a string is returned unchanged, and any
    other value (including a missing field) gives ``None``.
    """
    names = api_doc.get("metis_authors_full_names")
    if isinstance(names, str):
        return names
    return ", ".join(names) if isinstance(names, list) else None
def extract_date():
    """Return the stripped text of the first ``<time>`` element, or ``None``.

    A lookup failure (no such element, stale page, dead driver) yields
    ``None``. Only ``Exception`` is caught, so Ctrl-C still stops the run.
    """
    try:
        return driver.find_element(By.TAG_NAME, "time").text.strip()
    except Exception:
        return None
# Query the FAZ search API for every base keyword and result page, and keep
# the first metadata seen for each acceptable article URL. A failed or
# non-JSON response skips that page instead of aborting the collection.
url_to_meta = {}
for kw in BASE_KEYWORDS:
    for page in range(1, MAX_PAGES + 1):
        params = {
            "q": kw,
            "page": page,
            "rows": 20,
            "paid_content": "include",
            "sort_by": "date",
            "sort_order": "desc"}
        try:
            r = requests.get(FAZ_API, headers=API_HEADERS, params=params, timeout=20)
            r.raise_for_status()
            payload = r.json()
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers a response body that is not valid JSON.
            print(f"Search failed for '{kw}' page {page}: {exc}")
            human_sleep(1, 2)
            continue
        docs = (payload.get("docs") or []) if isinstance(payload, dict) else []
        for d in docs:
            if not is_valid_year(d):
                continue
            url = d.get("url")
            if not url or any(bad in url for bad in INVALID_URL_PATTERNS):
                continue
            # setdefault keeps the keyword that found the URL first.
            url_to_meta.setdefault(url, {"keyword": kw, "api_doc": d})
        human_sleep(1, 2)
urls = list(url_to_meta.keys())
print(f"URLs collected: {len(urls)}")
# Visit every collected URL in the browser and build one row per climate
# article. A URL whose processing raises is retried after a driver restart
# until it has failed MAX_RETRIES_PER_URL times; previously each URL was
# visited once, so the retry limit could never take effect.
rows = []
failed_urls = {}
print("Scraping FAZ articles!")
for idx, url in enumerate(urls, 1):
    meta = url_to_meta[url]
    while True:
        try:
            if not safe_get(url):
                break
            if "fazarchiv.faz.net" in driver.current_url:
                print("Archive redirect detected, skipping:", url)
                break
            human_sleep(2, 4)
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "main")))
            headline = driver.find_element(By.TAG_NAME, "h1").text.strip()
            intro, content = extract_article()
            if not content or not is_climate_article(headline, content):
                break
            # Run entity extraction once and reuse it for both columns.
            actors = extract_actors(content)
            rows.append({
                "URL": url,
                "Source": "FAZ",
                "Language": "de",
                "Published_Date": extract_date(),
                "Keyword_Matched": meta["keyword"],
                "Article_Classification": classify_article(headline, content),
                "Headline": headline,
                "Intro": intro,
                "Content": content,
                "Content_Length": len(content),
                "Sentence_Count": sentence_count(content),
                "Actors": ", ".join(actors),
                "Actor_Count": len(actors),
                "Author": extract_author(meta["api_doc"])
            })
            print(f"[{idx}/{len(urls)}] Saved:", headline[:60])
            if idx % 15 == 0:
                print("Normal cool down to avoid bot detection")
                human_sleep(10, 15)
            break
        except Exception:
            failed_urls[url] = failed_urls.get(url, 0) + 1
            if failed_urls[url] >= MAX_RETRIES_PER_URL:
                print("Permanently skipping:", url)
                break
            print("page was not able to load. Restarting the chromium driver.")
            restart_driver()
            human_sleep(5, 8)
# Write the collected rows to CSV in a fixed column order.
columns = ["URL", "Source", "Language", "Published_Date", "Is_Premium",
    "Keyword_Matched", "Article_Classification", "Headline", "Intro", "Content", "Content_Length", "Sentence_Count", "Actors", "Actor_Count", "Author"]
# reindex() creates listed columns that no row provides (currently
# "Is_Premium") as empty columns and also works when `rows` is empty.
# Plain [columns] selection raises KeyError in both cases.
df = pd.DataFrame(rows).reindex(columns=columns)
df.drop_duplicates(subset=["URL"], inplace=True)
df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
driver.quit()
print(f"File saved: {OUTPUT_FILE}")
print(f"Total number of Articles collected: {len(df)}")

In [None]:
# Code for scraping SPIEGEL

In [None]:
import requests
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
import spacy
from datetime import datetime
nlp = spacy.load("de_core_news_md")
# Search terms; also the term list keyword_hits() counts in article text.
CLIMATE_KEYWORDS = [
    "klima", "klimakrise", "klimawandel", "erderwärmung", "globale erwärmung",
    "treibhauseffekt", "treibhausgas", "co2", "kohlendioxid", "emission",
    "emissionen", "energiewende", "erneuerbare energien", "klimaschutz", "klimapolitik",
    "hitzewelle", "dürre", "hochwasser", "wasserknappheit", "starkregen",
    "waldbrand", "gletscherschmelze", "artensterben", "klimaneutral", "emissionshandel",
]

SEARCH_API = "https://www.spiegel.de/services/sitesearch/search"

# Comma-separated value sent as the search API's "segments" parameter.
SEGMENTS = ",".join((
    "spon", "spon_paid", "spon_international",
    "mmo", "mmo_paid",
    "hbm", "hbm_paid",
    "elf", "elf_paid",
    "effi", "effi_paid",
))

MAX_PAGES = 10  # result pages requested per keyword
PAGE_SIZE = 20  # sent as "page_size"
OUTPUT_CSV = "scrapped_spiegel.csv"

HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "application/json",
}

# Constant column values written to every output row.
SOURCE = "SPIEGEL"
LANGUAGE = "de"
def clean_text(text):
    """Collapse whitespace runs to single spaces and trim; ``None`` if unusable.

    Empty strings and non-string values give ``None``.
    """
    if isinstance(text, str) and text:
        collapsed = re.sub(r"\s+", " ", text)
        return collapsed.strip()
    return None
def keyword_hits(text):
    """Count how many distinct CLIMATE_KEYWORDS entries occur in ``text``.

    ``text`` is converted with ``str()`` and lower-cased first; each entry
    counts at most once.
    """
    haystack = str(text).lower()
    found = [term for term in CLIMATE_KEYWORDS if term in haystack]
    return len(found)
def extract_author(soup):
    """Return the text of the first link whose href contains ``/impressum/autor``.

    Returns ``None`` when the page has no such link.
    """
    author_link = soup.select_one('a[href*="/impressum/autor"]')
    if not author_link:
        return None
    return author_link.get_text(strip=True)
def extract_published_date(soup):
    """Return the ``datetime`` attribute of the first ``time.timeformat`` element.

    Returns ``None`` when the element is missing.
    """
    time_tag = soup.select_one("time.timeformat")
    if not time_tag:
        return None
    return time_tag.get("datetime")
def is_boilerplate(text):
    """Return ``True`` for empty text or text containing a subscription marker.

    The markers are matched case-insensitively as substrings.
    """
    if not text:
        return True
    lowered = text.lower()
    for marker in ("print-abo", "spiegel+", "digital-zugang", "jetzt weiterlesen", "abo abschließen"):
        if marker in lowered:
            return True
    return False
def analyze_text(text):
    """Run the spaCy pipeline over ``text`` and summarise it.

    Returns ``(actors, actor_count, sentence_count, length)``: the sorted
    unique texts of entities labelled ``PER`` or ``ORG``, their number, the
    number of sentences, and ``len(text)``.
    """
    parsed = nlp(text)
    actor_names = set()
    for entity in parsed.ents:
        if entity.label_ in ("PER", "ORG"):
            actor_names.add(entity.text)
    actors = sorted(actor_names)
    sentence_total = sum(1 for _ in parsed.sents)
    return actors, len(actors), sentence_total, len(text)
def classify_article(text):
    """Return the first category whose keywords occur in ``text``.

    Categories are tested in the order listed below, and the first one with
    a matching keyword wins; ``"Other_Climate"`` is returned when none
    matches. Matching is case-insensitive.

    Keywords of three characters or fewer (here "eu") must appear as whole
    words. As a substring "eu" occurs in "Deutschland", "neue" and "heute",
    which classified nearly every article as Climate_Policy.
    """
    t = text.lower()

    def contains(keyword):
        if len(keyword) <= 3:
            return re.search(rf"(?<!\w){re.escape(keyword)}(?!\w)", t) is not None
        return keyword in t

    rules = (
        ("Climate_Policy", ("gesetz", "regierung", "bundestag", "politik", "eu")),
        ("Climate_Science", ("studie", "forschung", "ipcc")),
        ("Energy_Transition", ("energie", "wind", "solar", "strom")),
        ("Climate_Economy", ("wirtschaft", "industrie", "markt")),
        ("Climate_Activism", ("protest", "aktivisten", "demonstration")),
        ("Climate_Impact", ("hitzewelle", "dürre", "hochwasser", "waldbrand")),
    )
    for category, keywords in rules:
        if any(contains(keyword) for keyword in keywords):
            return category
    return "Other_Climate"
# Main SPIEGEL scraping loop: query the site search for every keyword and
# page, download each new article, filter it and collect one row per kept
# article. Failed requests skip the affected page or article instead of
# crashing the run and losing the rows collected so far.
rows = []
seen_urls = set()
for keyword in CLIMATE_KEYWORDS:
    print(f"\nkeyword: {keyword}")
    for page in range(1, MAX_PAGES + 1):
        print(f"Page {page}")
        params = {
            "q": keyword,
            "page": page,
            "page_size": PAGE_SIZE,
            "segments": SEGMENTS,
            "after": -2208988800,
            "before": int(datetime.now().timestamp())}
        try:
            r = requests.get(SEARCH_API, params=params, headers=HEADERS, timeout=15)
            r.raise_for_status()
            data = r.json()
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers a response body that is not valid JSON.
            print(f"Search failed for '{keyword}' page {page}: {exc}")
            break
        items = (data.get("results") or []) if isinstance(data, dict) else []
        if not items:
            break
        for item in items:
            url = item.get("url")
            if not url or url in seen_urls:
                continue
            seen_urls.add(url)
            title = clean_text(item.get("title"))
            intro = clean_text(item.get("teaser") or item.get("intro"))
            try:
                article = requests.get(url, headers=HEADERS, timeout=15)
                article.raise_for_status()
            except requests.RequestException as exc:
                print(f"Skipping {url}: {exc}")
                continue
            soup = BeautifulSoup(article.text, "html.parser")
            body = soup.find("div", attrs={"data-area": "body"})
            if not body:
                continue
            paragraphs = [
                p.get_text().strip()
                for p in body.find_all("p")
                if len(p.get_text().strip()) > 60]
            content = clean_text(" ".join(paragraphs))
            if not content or len(content) < 1000:
                continue
            if is_boilerplate(content):
                continue
            relevance = keyword_hits(f"{title} {intro} {content}")
            if relevance < 2:
                continue
            author = extract_author(soup)
            pub_date = extract_published_date(soup)
            is_premium = bool(soup.select_one("[data-area='Paywall']"))
            actors, actor_count, sent_count, length = analyze_text(content)
            article_class = classify_article(content)
            rows.append({
                "URL": url,
                "Source": SOURCE,
                "Language": LANGUAGE,
                "Published_Date": pub_date,
                # The paywall flag was computed but never written out.
                "Is_Premium": is_premium,
                "Keyword_Matched": keyword,
                "Article_Classification": article_class,
                "Headline": title,
                "Intro": intro,
                "Content": content,
                "Content_Length": length,
                "Sentence_Count": sent_count,
                "Actors": ", ".join(actors),
                "Actor_Count": actor_count,
                "Author": author
            })
            time.sleep(1)
df = pd.DataFrame(rows)
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
print(f"File saved: {OUTPUT_CSV}")
print(f"Total number of climate articles saved: {len(df)}")

In [None]:
# Code for scraping Süddeutsche Zeitung

In [None]:
import requests
import time
import json
import re
import pandas as pd
import spacy
from bs4 import BeautifulSoup
nlp = spacy.load("de_core_news_md")
# Search terms; the scraping loop queries the site search once per entry and page.
CLIMATE_KEYWORDS = [
    "klima", "klimakrise", "klimawandel", "erderwärmung", "globale erwärmung",
    "treibhauseffekt", "treibhausgas", "co2", "kohlendioxid", "emission",
    "emissionen", "energiewende", "erneuerbare energien", "klimaschutz", "klimapolitik",
    "hitzewelle", "dürre", "hochwasser", "wasserknappheit", "starkregen",
    "waldbrand", "gletscherschmelze", "artensterben", "klimaneutral", "emissionshandel",
]

# Terms used by strict_relevance() to decide whether an article is kept.
STRICT_KEYWORDS = [
    "klima", "klimakrise", "klimawandel", "erderwärmung", "co2",
    "kohlendioxid", "emission", "emissionen", "energiewende", "klimaschutz",
    "klimaneutral", "treibhauseffekt", "treibhausgas", "hitzewelle", "dürre",
    "hochwasser", "wasserknappheit", "starkregen", "waldbrand",
]

# Category label -> keywords counted by classify_article().
CLASSIFICATION_KEYWORDS = {
    "Climate_Policy": ["politik", "regierung", "gesetz", "bundestag", "eu"],
    "Climate_Science": ["studie", "forschung", "wissenschaft", "ipcc"],
    "Energy_Transition": ["energiewende", "solar", "windkraft", "wasserstoff"],
    "Climate_Economy": ["wirtschaft", "industrie", "kosten", "preis"],
    "Climate_Impact": ["hitzewelle", "dürre", "hochwasser", "waldbrand"],
    "Climate_Opinion": ["meinung", "kommentar", "kolumne"],
}

# Constant column values written to every output row.
SOURCE = "SUEDDEUTSCHE_ZEITUNG"
LANGUAGE = "de"

OUTPUT_CSV = "scrapped_sd.csv"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
}
PAGES = range(1, 11)  # search result pages 1 through 10
def clean_text(text):
    """Collapse every whitespace run in ``text`` to one space and trim the ends.

    Returns ``None`` for empty input and for non-string values. JSON-LD
    fields such as ``headline`` or ``articleBody`` are not guaranteed to be
    strings, and ``re.sub`` would raise ``TypeError`` on them.
    """
    if not text or not isinstance(text, str):
        return None
    return re.sub(r"\s+", " ", text).strip()
def strict_relevance(title, content):
    """Decide whether an article is climate-related enough to keep.

    Passes when the title contains any STRICT_KEYWORDS entry, or when at
    least three distinct entries occur in the first 300 whitespace-separated
    words of the content. Matching is case-insensitive substring matching.
    """
    lowered_title = title.lower()
    lead_text = " ".join(content.lower().split()[:300])

    for term in STRICT_KEYWORDS:
        if term in lowered_title:
            return True

    lead_matches = sum(1 for term in STRICT_KEYWORDS if term in lead_text)
    return lead_matches >= 3
def analyze_text(text):
    """Run the spaCy pipeline over ``text`` and summarise it.

    Returns ``(actors, actor_count, sentence_count, length)``, where
    ``actors`` is the sorted list of unique entity texts labelled ``PER`` or
    ``ORG``.
    """
    parsed = nlp(text)
    actors = sorted({entity.text for entity in parsed.ents if entity.label_ in ("PER", "ORG")})
    sentence_total = sum(1 for _ in parsed.sents)
    return actors, len(actors), sentence_total, len(text)
def classify_article(title, intro, content):
    """Assign the CLASSIFICATION_KEYWORDS category with the most keyword hits.

    The text scored is the title, the intro (if any) and the first 300 words
    of the content, lower-cased. Each keyword counts at most once per
    category. Returns ``"Other_Climate"`` when no keyword matches.

    Keywords of three characters or fewer (here "eu") must appear as whole
    words. As a substring "eu" occurs in "Deutschland", "neue" and "heute",
    which gave Climate_Policy a hit on almost every article.
    """
    text = f"{title} {intro or ''} {' '.join(content.split()[:300])}".lower()

    def matches(keyword):
        if len(keyword) <= 3:
            return re.search(rf"(?<!\w){re.escape(keyword)}(?!\w)", text) is not None
        return keyword in text

    scores = {
        category: sum(1 for keyword in keywords if matches(keyword))
        for category, keywords in CLASSIFICATION_KEYWORDS.items()
    }
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else "Other_Climate"
def extract_article(url):
    """Download ``url`` and return ``(headline, content, author, pub_date)``.

    Metadata comes from the first JSON-LD node typed ``NewsArticle``. A
    single object, a top-level list and an ``@graph`` container are all
    searched, ``@type`` may be a string or a list, and authors may be
    objects with a ``name`` or plain strings. When JSON-LD gives no body,
    the content falls back to all ``<p>`` texts longer than 80 characters;
    a missing headline falls back to the first ``<h1>``.

    Raises ``requests.RequestException`` on network failure or a non-2xx
    response, so error pages are not parsed as articles.
    """
    def as_text(value):
        # clean_text expects a string; treat any other type as missing.
        return clean_text(value) if isinstance(value, str) else None

    def news_article_node(data):
        """Return the first NewsArticle dict inside ``data``, else None."""
        if isinstance(data, list):
            candidates = data
        elif isinstance(data, dict):
            graph = data.get("@graph")
            candidates = [data] + (graph if isinstance(graph, list) else [])
        else:
            return None
        for node in candidates:
            if not isinstance(node, dict):
                continue
            node_type = node.get("@type")
            node_types = node_type if isinstance(node_type, list) else [node_type]
            if "NewsArticle" in node_types:
                return node
        return None

    response = requests.get(url, headers=HEADERS, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    headline = None
    content = None
    author = None
    pub_date = None
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string)
        except (TypeError, ValueError):
            # TypeError: script.string is None; ValueError: malformed JSON.
            continue
        node = news_article_node(data)
        if node is None:
            continue
        headline = as_text(node.get("headline"))
        content = as_text(node.get("articleBody"))
        pub_date = node.get("datePublished")
        auth = node.get("author")
        if isinstance(auth, list):
            auth = auth[0] if auth else None
        if isinstance(auth, dict):
            auth = auth.get("name")
        author = auth if isinstance(auth, str) else None
        break
    if not content:
        texts = [p.get_text(strip=True) for p in soup.find_all("p") if len(p.get_text(strip=True)) > 80]
        content = clean_text(" ".join(texts))
    if not headline:
        h1 = soup.find("h1")
        headline = clean_text(h1.get_text()) if h1 else None
    return headline, content, author, pub_date
# Main Süddeutsche scraping loop: for every keyword and result page, collect
# the links on the search page, download each one and keep relevant articles.
# The link selector matches every sueddeutsche.de link on the page, so the
# same URLs recur for every keyword and page. `seen_urls` makes sure each URL
# is fetched and saved only once.
rows = []
seen_urls = set()
for keyword in CLIMATE_KEYWORDS:
    print(f"\nKeyword: {keyword}")
    for page in PAGES:
        try:
            r = requests.get("https://www.sueddeutsche.de/suche", params={"q": keyword, "page": page}, headers=HEADERS, timeout=15)
            r.raise_for_status()
        except requests.RequestException as exc:
            print(f"Search failed for '{keyword}' page {page}: {exc}")
            break
        soup = BeautifulSoup(r.text, "html.parser")
        links = soup.select("a[href^='https://www.sueddeutsche.de']")
        if not links:
            break
        for a in links:
            url = a.get("href")
            if not url or url in seen_urls:
                continue
            seen_urls.add(url)
            try:
                headline, content, author, pub_date = extract_article(url)
            except Exception as exc:
                # Best effort: one broken page must not stop the run.
                print(f"Skipping {url}: {type(exc).__name__}")
                continue
            if not headline or not content:
                continue
            if not strict_relevance(headline, content):
                continue
            actors, actor_count, sent_count, length = analyze_text(content)
            article_class = classify_article(headline, None, content)
            rows.append({
                "URL": url,
                "Source": SOURCE,
                "Language": LANGUAGE,
                "Published_Date": pub_date,
                "Keyword_Matched": keyword,
                "Article_Classification": article_class,
                "Headline": headline,
                "Intro": None,
                "Content": content,
                "Content_Length": length,
                "Sentence_Count": sent_count,
                "Actors": ", ".join(actors),
                "Actor_Count": actor_count,
                "Author": author
            })
            print("Saved")
            time.sleep(0.6)
df = pd.DataFrame(rows)
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
print(f"File saved: {OUTPUT_CSV}")
print(f"Total number of climate articles saved: {len(df)}")

In [None]:
# Code for scraping B.Z.

In [None]:
import time
import random
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import spacy

nlp = spacy.load("de_core_news_md")
BASE_SEARCH = "https://www.bz-berlin.de/suche"

# Each keyword is combined with every year in YEARS to form a search query.
BASE_KEYWORDS = [
    "Klima", "Klimawandel", "Klimakrise", "CO2", "Emissionen", "Energiewende",
    "Hochwasser", "Dürre", "Hitzewelle", "Klimaprotest", "Klimapolitik",
]

# Links whose URL contains any of these fragments are not visited.
INVALID_URL_PATTERNS = [
    "/video/",
    "/bilder/",
    "/spiele/",
    "/angebote/",
    "/shopping/",
]

# Terms counted by is_climate_article().
CLIMATE_KEYWORDS_DE = [
    "klima", "klimawandel", "klimakrise", "erderwärmung",
    "co2", "emission", "energiewende", "hochwasser",
    "dürre", "hitzewelle", "treibhaus", "klimapolitik",
    "klimaschutz", "klimaaktiv", "klimaprotest", "letzte generation",
]

YEARS = range(2022, 2027)  # 2022 through 2026
OUTPUT_FILE = "scrapped_bz.csv"
def extract_actors(text):
    """Return the sorted unique entity texts labelled ``PER`` or ``ORG``.

    Only the first 4000 characters of ``text`` are analysed.
    """
    parsed = nlp(text[:4000])
    names = set()
    for entity in parsed.ents:
        if entity.label_ in ("PER", "ORG"):
            names.add(entity.text)
    return sorted(names)
def sentence_count(text):
    """Return the number of sentences spaCy finds in ``text``."""
    sentences = list(nlp(text).sents)
    return len(sentences)
# Build the Chrome options, then create the shared driver and a 20-second
# explicit wait bound to it. The arguments are added in the listed order.
chrome_options = Options()
for _chrome_arg in (
    "--disable-blink-features=AutomationControlled",
    "--start-maximized",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    r"--user-data-dir=C:/selenium_chrome_profile",
):
    chrome_options.add_argument(_chrome_arg)

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=chrome_options,
)
wait = WebDriverWait(driver, 20)
def human_sleep(a=3, b=6):
    """Pause for a random duration drawn uniformly between ``a`` and ``b`` seconds."""
    pause_seconds = random.uniform(a, b)
    time.sleep(pause_seconds)
def safe_get(url):
    """Load ``url`` in the shared driver, pause briefly, and report success.

    Returns ``True`` after the page load and a 2-4 second pause, ``False``
    when loading raises. Only ``Exception`` is caught, so Ctrl-C still stops
    the run.
    """
    try:
        driver.get(url)
        human_sleep(2, 4)
        return True
    except Exception as exc:
        print(f"Could not load {url}: {type(exc).__name__}")
        return False
def extract_date():
    """Return the stripped text of ``div.entry-meta small``, or ``None``.

    A lookup failure yields ``None``. Only ``Exception`` is caught, so
    Ctrl-C still stops the run.
    """
    try:
        return driver.find_element(By.CSS_SELECTOR, "div.entry-meta small").text.strip()
    except Exception:
        return None
def extract_author():
    """Return the author name from the current page, or ``None``.

    Two page layouts are tried in order: an ``a[rel='author']`` link, then
    the first ``<span>`` inside ``div[data-type="author"]``. A failed lookup
    falls through to the next layout. Only ``Exception`` is caught, so
    Ctrl-C still stops the run.
    """
    try:
        author = driver.find_element(By.CSS_SELECTOR, "a[rel='author']")
        return author.text.strip()
    except Exception:
        pass  # layout not present; try the alternative markup
    try:
        container = driver.find_element(By.CSS_SELECTOR, 'div[data-type="author"]')
        span = container.find_element(By.TAG_NAME, "span")
        return span.text.strip()
    except Exception:
        pass  # neither layout matched
    return None
def extract_article():
    """Collect the article paragraphs from the page currently loaded.

    Paragraphs shorter than 60 characters or containing one of the noise
    markers are skipped. Returns ``(intro, content)``: the first kept
    paragraph and all kept paragraphs joined with whitespace collapsed.
    Returns ``(None, None)`` when fewer than five paragraphs remain.
    """
    noise_markers = ("anzeige", "newsletter", "abonnieren", "quelle:", "bild:")

    def is_usable(paragraph_text):
        if len(paragraph_text) < 60:
            return False
        lowered = paragraph_text.lower()
        return not any(marker in lowered for marker in noise_markers)

    stripped = [element.text.strip() for element in driver.find_elements(By.TAG_NAME, "p")]
    kept = [paragraph for paragraph in stripped if is_usable(paragraph)]

    if len(kept) < 5:
        return None, None
    joined = " ".join(kept)
    return kept[0], re.sub(r"\s+", " ", joined)
def is_climate_article(title, content):
    """Return ``True`` when the article looks climate-related.

    Titles containing "horoskop", "quiz" or "promi" are rejected outright.
    Otherwise the article passes when the CLIMATE_KEYWORDS_DE entries occur
    at least three times in total in the content, when the content contains
    one of the strong signal phrases, or when the title contains any
    CLIMATE_KEYWORDS_DE entry. Matching is case-insensitive.
    """
    lowered_title = title.lower()
    lowered_content = content.lower()

    for blocked in ("horoskop", "quiz", "promi"):
        if blocked in lowered_title:
            return False

    total_occurrences = sum(lowered_content.count(term) for term in CLIMATE_KEYWORDS_DE)
    if total_occurrences >= 3:
        return True

    strong_signals = ("klimapolitik", "klimaschutz", "klimaaktiv", "klimaprotest", "energiewende", "co2-preis", "letzte generation", "klima-demo")
    if any(signal in lowered_content for signal in strong_signals):
        return True

    return any(term in lowered_title for term in CLIMATE_KEYWORDS_DE)
def derive_climate_section(title, content):
    """Return the first category whose keywords occur in title or content.

    Categories are tested in the order listed below, and the first one with
    a matching keyword wins; ``"Other_Climate"`` is returned when none
    matches. Matching is case-insensitive.

    The acronyms "eu", "usa" and "ngo" must appear as whole words. As
    substrings they occur in ordinary German words ("neue", "heute",
    "Deutschland", "zusammen"), which pushed almost every otherwise
    unmatched article into Climate_Geopolitics. All other keywords keep
    substring matching so compounds such as "Erdgas" or "Klimaziel" still
    match.
    """
    text = f"{title} {content}".lower()
    whole_word_only = {"eu", "usa", "ngo"}

    def contains(keyword):
        if keyword in whole_word_only:
            return re.search(rf"(?<!\w){re.escape(keyword)}(?!\w)", text) is not None
        return keyword in text

    rules = (
        ("Climate_Policy", ("gesetz", "regierung", "bundestag", "verordnung", "klimapolitik", "ziel")),
        ("Climate_Science", ("studie", "forschung", "wissenschaftler", "ipcc", "daten", "bericht")),
        ("Energy_Transition", ("energiewende", "erneuerbar", "solar", "wind", "kohlekraft", "gas", "strom")),
        ("Climate_Economy", ("kosten", "preise", "wirtschaft", "industrie", "markt", "subvention")),
        ("Climate_Activism", ("klimaprotest", "demonstration", "aktivisten", "letzte generation", "blockade", "ngo")),
        ("Climate_Impact", ("hochwasser", "überschwemmung", "dürre", "hitzewelle", "katastrophe")),
        ("Climate_Geopolitics", ("eu", "china", "usa", "international", "global")),
        ("Climate_Opinion", ("meinung", "kommentar", "kolumne", "leitartikel")),
    )
    for category, keywords in rules:
        if any(contains(keyword) for keyword in keywords):
            return category
    return "Other_Climate"
# Main B.Z. scraping loop: search for every "<keyword> <year>" combination,
# collect the new article links on the result page, then visit each link and
# keep the climate-related articles. Finally write all rows to CSV.
rows = []
visited = set()
for base_keyword in BASE_KEYWORDS:
    for year in YEARS:
        search_term = f"{base_keyword} {year}"
        search_url = f"{BASE_SEARCH}?q={search_term}"
        if not safe_get(search_url):
            continue
        articles = driver.find_elements(By.CSS_SELECTOR, "article a[href]")
        urls = []
        for a in articles:
            url = a.get_attribute("href")
            if not url:
                continue
            if not url.startswith("https://www.bz-berlin.de"):
                continue
            if any(bad in url for bad in INVALID_URL_PATTERNS):
                continue
            if url in visited:
                continue
            visited.add(url)
            urls.append(url)
        for url in urls:
            if not safe_get(url):
                continue
            try:
                wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1")))
                headline = driver.find_element(By.TAG_NAME, "h1").text.strip()
                intro, content = extract_article()
                if not content:
                    continue
                if not is_climate_article(headline, content):
                    continue
                actors = extract_actors(content)
                derived_section = derive_climate_section(headline, content)
                # Look the author up once; reused for the row and the log line.
                author = extract_author()
                rows.append({
                    "URL": url,
                    "Source": "B.Z.",
                    "Language": "de",
                    "Published_Date": extract_date(),
                    "Article_Classification": derived_section,
                    "Keyword_Matched": search_term,
                    "Headline": headline,
                    "Intro": intro,
                    "Content": content,
                    "Content_Length": len(content),
                    "Sentence_Count": sentence_count(content),
                    "Actors": ", ".join(actors),
                    "Actor_Count": len(actors),
                    "Author": author
                })
                print("Saved:", headline[:60], "| Author:", author)
                human_sleep()
            except Exception as exc:
                # Best effort per article. Only Exception is caught, so
                # Ctrl-C still stops the run.
                print(f"Skipping {url}: {type(exc).__name__}")
                continue
df = pd.DataFrame(rows)
df.drop_duplicates(subset=["URL"], inplace=True)
df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
driver.quit()
print(f"File: {OUTPUT_FILE}")
print(f"Total number of Articles scrapped: {len(df)}")

In [None]:
#code for scraping Die Zeit

In [None]:
import time
import random
import re
import hashlib
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import spacy
nlp = spacy.load("de_core_news_md")
# Search terms; the crawl loop combines each one with every year in YEARS.
BASE_KEYWORDS = [
    "Klima",
    "Klimawandel",
    "Klimakrise",
    "CO2",
    "Emissionen",
    "Energiewende",
    "Hochwasser",
    "Dürre",
    "Hitzewelle",
    "Klimaprotest",
    "Klimapolitik",
]

# Result links whose URL contains any of these fragments are skipped.
INVALID_URL_PATTERNS = [
    "/video/",
    "/angebote/",
    "/spiele/",
    "/campus/",
    "/index",
]

# Lower-case terms whose occurrences in the article body are counted.
CLIMATE_KEYWORDS_DE = [
    "klima",
    "klimawandel",
    "klimakrise",
    "erderwärmung",
    "co2",
    "emission",
    "energiewende",
    "hochwasser",
    "dürre",
    "hitzewelle",
    "treibhaus",
    "klimapolitik",
    "klimaschutz",
    "klimaaktiv",
    "klimaprotest",
    "letzte generation",
]

# Lower-case terms checked for presence in either the title or the body.
STRONG_CLIMATE_TERMS = [
    "klimawandel",
    "klimakrise",
    "erderwärmung",
    "klimaschutz",
    "klimapolitik",
    "klimaprotest",
    "klimaaktivisten",
    "letzte generation",
    "co2-preis",
    "emissionshandel",
    "klimaziele",
    "pariser abkommen",
    "ipcc",
]

# Search endpoint, the years appended to each search term, the result pages
# requested per query (1 through 98), and the CSV the results are written to.
BASE_SEARCH = "https://www.zeit.de/suche/index"
YEARS = range(2022, 2026)
PAGES = range(1, 99)
OUTPUT_FILE = "scrapped_zeit.csv"
def extract_actors(text):
    """Return the sorted, de-duplicated person and organisation names in *text*.

    Only the first 4000 characters are run through the module-level spaCy
    pipeline ``nlp``; entities labelled ``PER`` or ``ORG`` are kept.
    """
    wanted_labels = ("PER", "ORG")
    names = set()
    for entity in nlp(text[:4000]).ents:
        if entity.label_ in wanted_labels:
            names.add(entity.text)
    return sorted(names)
def sentence_count(text):
    """Return how many sentences the spaCy pipeline ``nlp`` finds in *text*."""
    total = 0
    for _sentence in nlp(text).sents:
        total += 1
    return total
# Build the Chrome session shared by all helper functions in this cell.
# The command-line switches are added in the same order as before.
chrome_options = Options()
for chrome_flag in (
    "--disable-blink-features=AutomationControlled",
    "--start-maximized",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    r"--user-data-dir=C:/selenium_chrome_profile",
):
    chrome_options.add_argument(chrome_flag)
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
# Explicit waits issued through ``wait`` give up after 20 seconds.
wait = WebDriverWait(driver, 20)
def human_sleep(a=2, b=4):
    """Pause for a random duration drawn uniformly between *a* and *b* seconds."""
    pause_seconds = random.uniform(a, b)
    time.sleep(pause_seconds)
def safe_get(url):
    """Load *url* in the shared ``driver`` and pause afterwards.

    Args:
        url: Address to navigate to.

    Returns:
        True if the navigation call completed, False if it raised.
    """
    try:
        driver.get(url)
    except Exception as exc:
        # Best-effort navigation: a failed load is reported and signalled to
        # the caller. ``Exception`` (not a bare ``except``) is caught so that
        # Ctrl+C and interpreter shutdown still stop the crawl.
        print(f"Failed to load {url}: {type(exc).__name__}")
        return False
    # The pause sits outside the ``try`` so an interrupt during the sleep is
    # never mistaken for a failed page load.
    human_sleep()
    return True
def extract_date():
    """Return the ``datetime`` attribute of the first ``<time>`` element on the current page.

    Returns:
        The attribute value reported by Selenium, or None if the lookup
        raised (for example because the page has no ``<time>`` element).
    """
    try:
        return driver.find_element(By.TAG_NAME, "time").get_attribute("datetime")
    except Exception:
        # A missing date is tolerated. ``Exception`` (not a bare ``except``)
        # is caught so that Ctrl+C and interpreter shutdown still propagate.
        return None
def extract_author():
    """Return the author names shown on the current page as one comma-separated string.

    Names are read from ``span[itemprop="name"]`` elements nested inside
    ``a[rel="author"]`` links.

    Returns:
        The names joined with ", " (an empty string when none are found), or
        None if reading the elements raised.
    """
    try:
        authors = driver.find_elements(
            By.CSS_SELECTOR,
            'a[rel="author"] span[itemprop="name"]'
        )
        names = [a.text.strip() for a in authors]
    except Exception:
        # Author is optional metadata. ``Exception`` (not a bare ``except``)
        # is caught so that Ctrl+C and interpreter shutdown still propagate.
        return None
    # Drop elements whose text is blank so the result never contains stray
    # separators such as "Name, , Other".
    return ", ".join(name for name in names if name)
def extract_article():
    """Collect the body text of the article currently loaded in ``driver``.

    Reads every ``article p`` element and keeps paragraphs that are at least
    70 characters long and do not contain "anzeige", "newsletter" or
    "abonnieren" (case-insensitive).

    Returns:
        ``(intro, content)`` where *intro* is the first kept paragraph and
        *content* is all kept paragraphs joined by spaces with whitespace
        runs collapsed; ``(None, None)`` when fewer than six paragraphs
        are kept.
    """
    boilerplate_markers = ["anzeige", "newsletter", "abonnieren"]
    kept = []
    for element in driver.find_elements(By.CSS_SELECTOR, "article p"):
        paragraph = element.text.strip()
        lowered = paragraph.lower()
        is_long_enough = len(paragraph) >= 70
        if is_long_enough and not any(marker in lowered for marker in boilerplate_markers):
            kept.append(paragraph)
    if len(kept) < 6:
        return None, None
    content = re.sub(r"\s+", " ", " ".join(kept))
    return kept[0], content
# Title words that mark off-topic sections. The pattern is anchored at the
# start of a word so it still catches "Reisen" or "Sportler" but no longer
# fires inside unrelated words such as "Strompreise" ("reise") or
# "Transport" ("sport").
_OFF_TOPIC_TITLE_RE = re.compile(r"\b(?:newsletter|horoskop|quiz|sport|kultur|reise)")


def is_climate_article(title, content):
    """Decide whether an article is about climate, based on title and body.

    Args:
        title: Article headline.
        content: Article body text.

    Returns:
        False when the title contains a word starting with one of the
        off-topic markers. Otherwise True when any of these holds:
        at least two STRONG_CLIMATE_TERMS occur in title or body; at least
        one strong term occurs and CLIMATE_KEYWORDS_DE occur ten or more
        times in the body; or the title contains "klima", the body has eight
        or more keyword occurrences, and the occurrences per body character
        are at least 0.001.
    """
    title_l = title.lower()
    content_l = content.lower()
    if _OFF_TOPIC_TITLE_RE.search(title_l):
        return False
    # Total occurrences of every keyword in the body (overlapping keywords
    # such as "klima" / "klimawandel" are each counted).
    keyword_hits = sum(content_l.count(k) for k in CLIMATE_KEYWORDS_DE)
    # Number of distinct strong terms present in either title or body.
    strong_hits = sum(1 for k in STRONG_CLIMATE_TERMS if k in content_l or k in title_l)
    density = keyword_hits / max(len(content_l), 1)
    return (strong_hits >= 2 or (strong_hits >= 1 and keyword_hits >= 10) or ("klima" in title_l and keyword_hits >= 8 and density >= 0.001))
# Ordered (label, pattern) rules; the first pattern found in the text wins.
# Most cues are matched as plain substrings. The short tokens are anchored:
# "eu" and "usa" must be whole words and "wind" must start a word, because as
# bare substrings they match everyday words ("neu", "heute", "deutschland",
# "zusammen", "geschwindigkeit") and would label nearly every German text.
_SECTION_RULES = (
    ("Climate_Policy", re.compile(r"gesetz|regierung|bundestag")),
    ("Climate_Science", re.compile(r"studie|ipcc|forschung")),
    ("Energy_Transition", re.compile(r"energie|\bwind|solar|strom")),
    ("Climate_Economy", re.compile(r"wirtschaft|preise|markt")),
    ("Climate_Activism", re.compile(r"protest|aktivisten")),
    ("Climate_Impact", re.compile(r"hochwasser|dürre|hitzewelle")),
    ("Climate_Geopolitics", re.compile(r"\b(?:eu|usa)\b|china")),
    ("Climate_Opinion", re.compile(r"meinung|kommentar")),
)


def derive_climate_section(title, content):
    """Assign a coarse topic label to an article from keyword cues.

    Title and body are concatenated and lower-cased, then tested against the
    rules in order: policy, science, energy, economy, activism, impact,
    geopolitics, opinion.

    Args:
        title: Article headline.
        content: Article body text.

    Returns:
        The label of the first matching rule, or "Other_Climate" when no
        rule matches.
    """
    text = f"{title} {content}".lower()
    for label, pattern in _SECTION_RULES:
        if pattern.search(text):
            return label
    return "Other_Climate"
# State shared by the whole crawl: the collected article rows, every article
# URL already queued, and hashes of article openings already stored.
rows = []
visited_urls = set()
visited_hashes = set()
try:
    for keyword in BASE_KEYWORDS:
        for year in YEARS:
            # The year is appended to the search term, e.g. "Klima 2022".
            query = f"{keyword} {year}"
            for page in PAGES:
                search_url = f"{BASE_SEARCH}?p={page}&q={query}"
                print(f"{query} | Page {page}")
                if not safe_get(search_url):
                    continue
                links = driver.find_elements(By.CSS_SELECTOR, "article a[href]")
                # Collect the not-yet-seen article URLs on this result page
                # before navigating away from it.
                urls = []
                for a in links:
                    try:
                        href = a.get_attribute("href")
                    except Exception:
                        # Reading this link element failed; skip just this one.
                        continue
                    if not href:
                        continue
                    if not href.startswith("https://www.zeit.de/"):
                        continue
                    if any(bad in href for bad in INVALID_URL_PATTERNS):
                        continue
                    if href in visited_urls:
                        continue
                    visited_urls.add(href)
                    urls.append(href)
                for url in urls:
                    if not safe_get(url):
                        continue
                    try:
                        wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1")))
                        headline = driver.find_element(By.TAG_NAME, "h1").text.strip()
                        intro, content = extract_article()
                        if not content:
                            continue
                        if not is_climate_article(headline, content):
                            continue
                        # Fingerprint the first 1000 characters so the same
                        # text reached through a different URL is stored once.
                        content_hash = hashlib.md5(
                            content[:1000].encode("utf-8")
                        ).hexdigest()
                        if content_hash in visited_hashes:
                            continue
                        visited_hashes.add(content_hash)
                        actors = extract_actors(content)
                        rows.append({
                            "URL": url,
                            "Source": "DIE ZEIT",
                            "Language": "de",
                            "Published_Date": extract_date(),
                            "Derived_Section": derive_climate_section(headline, content),
                            "Keyword_Matched": query,
                            "Headline": headline,
                            "Intro": intro,
                            "Content": content,
                            "Content_Length": len(content),
                            "Sentence_Count": sentence_count(content),
                            "Actors": ", ".join(actors),
                            "Actor_Count": len(actors),
                            "Author": extract_author()
                        })
                        print("Saved:", headline[:70])
                        human_sleep()
                    except Exception as exc:
                        # One broken article must not end the crawl, but the
                        # failure is reported. ``Exception`` (not a bare
                        # ``except``) lets Ctrl+C stop the run.
                        print(f"Skipped {url}: {type(exc).__name__}")
                        continue
finally:
    # Runs on normal completion and also when the crawl is interrupted or
    # crashes, so the articles collected so far are written to disk and the
    # browser is always shut down.
    try:
        df = pd.DataFrame(rows)
        df.drop_duplicates(subset=["URL"], inplace=True)
        df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
    finally:
        driver.quit()
    print(f"Saved File: {OUTPUT_FILE}")
    print(f"Total number of articles scrapped: {len(df)}")