In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.robotparser
import csv
import time
from requests.models import PreparedRequest

def _parse_listing_date(li):
    """Return the token that follows "Artikel vom" in a result item, or ""."""
    p = li.find("p")
    span = p.find("span") if p else None
    if span is None:
        return ""
    span_text = span.get_text(separator=" ", strip=True)
    if "Artikel vom" not in span_text:
        return ""
    # Guard against a span that ends right after "Artikel vom": indexing the
    # empty token list would raise IndexError.
    tokens = span_text.split("Artikel vom")[-1].split()
    return tokens[0] if tokens else ""


def _fetch_article(session, article_url, timeout):
    """Download one article page and return ``(text, location)``.

    Both values are empty strings when the request fails, the response status
    is not 200, or the expected elements are missing from the page.
    """
    try:
        art_response = session.get(article_url, timeout=timeout)
    except requests.RequestException as exc:
        # Best effort: keep the listing row even if the article is unreachable.
        print(f"Failed to fetch article {article_url}: {exc}")
        return "", ""
    if art_response.status_code != 200:
        return "", ""

    art_soup = BeautifulSoup(art_response.text, 'html.parser')
    content = art_soup.find("div", class_="pbb-article-text")
    text = content.get_text(separator="\n", strip=True) if content else ""

    ort_tag = art_soup.find("p", class_="pbb-ort")
    landkreis_tag = art_soup.find("p", class_="pbb-landkreis")
    location = ""
    if ort_tag:
        location = ort_tag.get_text(strip=True)
    if landkreis_tag:
        landkreis = landkreis_tag.get_text(strip=True)
        location = f"{location}, {landkreis}" if location else landkreis
    return text, location


def _parse_result_item(session, base_url, li, timeout):
    """Turn one ``<li>`` of the result list into a result dict."""
    title = text = location = ""
    h4 = li.find("h4")
    a = h4.find("a") if h4 else None
    if a is not None:
        strong = a.find("strong")
        title = strong.get_text(strip=True) if strong else a.get_text(strip=True)
        href = a.get("href")
        # Without an href, urljoin() would hand back base_url and the home
        # page would be scraped as if it were the article.
        if href:
            text, location = _fetch_article(session, urljoin(base_url, href), timeout)
    return {
        "title": title,
        "date": _parse_listing_date(li),
        "location": location,
        "text": text,
    }


def scrape_brandenburg_police(params, timeout=30):
    """Scrape search results from polizei.brandenburg.de into a CSV file.

    Prints the first search URL, checks it against the site's robots.txt and
    then walks the paginated result list, following every result link to
    collect the article text and location. Paging stops at the first page
    that cannot be fetched, does not answer with status 200, or contains no
    result list / no list items. Whatever was collected up to that point is
    written to ``brandenburg_police_results.csv`` (relative to the current
    working directory) with the columns Title, Date, Location, Text.

    Args:
        params: Query parameters sent with every search-page request.
        timeout: Seconds to wait for each HTTP response. requests applies no
            timeout by default, so without it a stalled connection would
            block the scrape forever.

    Returns:
        None. If robots.txt disallows the search URL, a message is printed
        and no file is written.
    """
    base_url = "https://polizei.brandenburg.de"
    url_pattern = f"{base_url}/suche/typ/null/{{page}}/1"

    # Build the fully encoded first-page URL without sending a request.
    req = PreparedRequest()
    req.prepare_url(url_pattern.format(page=1), params)
    test_url = req.url
    print("Test URL:", test_url)

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urljoin(base_url, "robots.txt"))
    rp.read()
    if not rp.can_fetch("*", test_url):
        print(f"Scraping disallowed for URL: {test_url}")
        return

    results = []
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        session.headers.update({"User-Agent": "Mozilla/5.0"})
        page = 1
        while True:
            url = url_pattern.format(page=page)
            try:
                response = session.get(url, params=params, timeout=timeout)
            except requests.RequestException as exc:
                # Stop paging but still write what has been collected so far.
                print(f"Stopping at page {page}: {exc}")
                break
            if response.status_code != 200:
                break

            soup = BeautifulSoup(response.text, 'html.parser')
            ul = soup.find("ul", class_=lambda x: x and "pbb-searchlist" in x)
            if not ul:
                break
            items = ul.find_all("li")
            if not items:
                break

            for li in items:
                results.append(_parse_result_item(session, base_url, li, timeout))

            page += 1
            time.sleep(3)  # pause between result pages

    with open('brandenburg_police_results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Date", "Location", "Text"])
        writer.writerows(
            [r["title"], r["date"], r["location"], r["text"]] for r in results
        )

Test URL: https://polizei.brandenburg.de/suche/typ/null/1/1?search%5Bquery%5D=&search%5Bregion%5D=&search%5Bdienststelle%5D=&search%5Bkategorien%5D=Kriminalit%C3%A4t&search%5BonlineDateFrom%5D=01.02.2025&search%5BdocType%5D=&search%5Btags%5D=&search%5Bzeitraum%5D=&search%5BonlineDateTo%5D=&search%5BsearchButton2%5D=Suchen+%C2%BB


In [None]:
# Search form fields by bare name; each one is submitted as "search[<name>]".
# Insertion order is kept so the resulting query string is unchanged.
_search_form_fields = {
    "query": "",
    "region": "",
    "dienststelle": "",
    "kategorien": "Kriminalität",
    "onlineDateFrom": "01.02.2025",  # presumably DD.MM.YYYY — confirm against the site
    "docType": "",
    "tags": "",
    "zeitraum": "",
    "onlineDateTo": "",
    "searchButton2": "Suchen »",
}
query_params = {
    f"search[{field_name}]": field_value
    for field_name, field_value in _search_form_fields.items()
}

# Run the scraper with the filters above.
scrape_brandenburg_police(query_params)