## Polizei Scraper

In [92]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.robotparser
import csv
import time
from requests.models import PreparedRequest

def _extract_article_date(li):
    """Return the token that follows "Artikel vom" in a result item, or "".

    Looks at the first <span> inside the first <p> of ``li``. An empty string
    is returned when the paragraph, the span or the marker is missing, or when
    no text follows the marker.
    """
    p = li.find("p")
    span = p.find("span") if p is not None else None
    if span is None:
        return ""
    span_text = span.get_text(separator=" ", strip=True)
    # rpartition keeps the text after the LAST marker, like split(...)[-1].
    _, marker, tail = span_text.rpartition("Artikel vom")
    tokens = tail.split()
    # Guard against a marker with nothing after it (would be an IndexError).
    return tokens[0] if marker and tokens else ""


def _fetch_article_details(session, article_url, timeout):
    """Download one article page and return ``(text, location)``.

    Both values are empty strings when the request fails or the response
    status is not 200. ``location`` joins the non-empty texts of the
    ``p.pbb-ort`` and ``p.pbb-landkreis`` elements with ", ".
    """
    try:
        art_response = session.get(article_url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable article should not abort the whole crawl.
        print(f"Request failed for {article_url}: {exc}")
        return "", ""
    if art_response.status_code != 200:
        return "", ""

    art_soup = BeautifulSoup(art_response.text, 'html.parser')
    content = art_soup.find("div", class_="pbb-article-text")
    text = content.get_text(separator="\n", strip=True) if content is not None else ""

    location_tags = (
        art_soup.find("p", class_="pbb-ort"),
        art_soup.find("p", class_="pbb-landkreis"),
    )
    location_parts = [tag.get_text(strip=True) for tag in location_tags if tag is not None]
    # Empty parts are dropped so the result never starts or ends with ", ".
    location = ", ".join(part for part in location_parts if part)
    return text, location


def scrape_brandenburg_police(params, output_path='brandenburg_police_results.csv',
                              page_delay=3, timeout=30):
    """Scrape the search results of polizei.brandenburg.de into a CSV file.

    The paginated search listing is walked page by page (starting at 1) until
    a page returns a non-200 status, contains no ``ul`` whose class includes
    "pbb-searchlist", or that list has no ``li`` items. For every item the
    linked article page is downloaded to collect its text and location.

    Args:
        params: Mapping of query-string parameters sent with every listing
            request.
        output_path: Destination CSV file (overwritten). Columns are
            Title, Date, Location, Text, URL.
        page_delay: Seconds to sleep after each listing page.
        timeout: Per-request timeout in seconds passed to ``requests``.

    Returns:
        None. If robots.txt disallows the first listing URL, a message is
        printed and no file is written. Otherwise the CSV is always written,
        even when it only contains the header row.
    """
    base_url = "https://polizei.brandenburg.de"
    url_pattern = f"{base_url}/suche/typ/null/{{page}}/1"

    # Build the fully encoded first-page URL so it can be checked against
    # robots.txt and shown to the user.
    req = PreparedRequest()
    req.prepare_url(url_pattern.format(page=1), params)
    test_url = req.url
    print("Test URL:", test_url)

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urljoin(base_url, "robots.txt"))
    rp.read()
    if not rp.can_fetch("*", test_url):
        print(f"Scraping disallowed for URL: {test_url}")
        return

    results = []
    # The context manager releases pooled connections when the crawl ends.
    with requests.Session() as session:
        session.headers.update({"User-Agent": "Mozilla/5.0"})
        page = 1
        while True:
            url = url_pattern.format(page=page)
            try:
                response = session.get(url, params=params, timeout=timeout)
            except requests.RequestException as exc:
                # Stop paging but still write what has been collected so far.
                print(f"Request failed for {url}: {exc}")
                break
            if response.status_code != 200:
                break

            soup = BeautifulSoup(response.text, 'html.parser')
            ul = soup.find("ul", class_=lambda x: x and "pbb-searchlist" in x)
            if ul is None:
                break
            items = ul.find_all("li")
            if not items:
                break

            for li in items:
                title = text = location = article_url = ""
                h4 = li.find("h4")
                a = h4.find("a") if h4 is not None else None
                if a is not None:
                    strong = a.find("strong")
                    title = (strong if strong is not None else a).get_text(strip=True)
                    href = a.get("href")
                    # Without an href, urljoin would hand back the site root
                    # and the homepage would be stored as the article.
                    if href:
                        article_url = urljoin(base_url, href)
                        text, location = _fetch_article_details(session, article_url, timeout)
                results.append({
                    "title": title,
                    "date": _extract_article_date(li),
                    "location": location,
                    "text": text,
                    "url": article_url
                })

            page += 1
            time.sleep(page_delay)

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Date", "Location", "Text", "URL"])
        writer.writerows(
            [r["title"], r["date"], r["location"], r["text"], r["url"]]
            for r in results
        )

In [93]:
# Search-form fields handed to the scraper, which sends them as URL query
# parameters. Pair order is kept because it determines the order of the
# parameters in the generated URL.
query_params = dict([
    ("search[query]", ""),
    ("search[region]", ""),
    ("search[dienststelle]", ""),
    ("search[kategorien]", "Kriminalität"),
    ("search[onlineDateFrom]", "01.02.2025"),
    ("search[docType]", ""),
    ("search[tags]", ""),
    ("search[zeitraum]", ""),
    ("search[onlineDateTo]", ""),
    ("search[searchButton2]", "Suchen »"),
])

scrape_brandenburg_police(query_params)

Test URL: https://polizei.brandenburg.de/suche/typ/null/1/1?search%5Bquery%5D=&search%5Bregion%5D=&search%5Bdienststelle%5D=&search%5Bkategorien%5D=Kriminalit%C3%A4t&search%5BonlineDateFrom%5D=01.02.2025&search%5BdocType%5D=&search%5Btags%5D=&search%5Bzeitraum%5D=&search%5BonlineDateTo%5D=&search%5BsearchButton2%5D=Suchen+%C2%BB


## Topics

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch

# Rank the scraped articles by semantic similarity to a topic query and save
# the best matches, together with their scores, to a new CSV file.

# Upper bound on the number of matches reported and exported.
TOP_K = 100

df = pd.read_csv("brandenburg_police_results.csv")
if df.empty:
    # Fail early with a clear message instead of a shape error further down.
    raise ValueError("brandenburg_police_results.csv contains no articles to rank.")

# Title and body are embedded together; missing values become empty strings.
combined_text = df["Title"].fillna("") + " " + df["Text"].fillna("")

model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
document_embeddings = model.encode(combined_text.tolist(), convert_to_tensor=True)


query = "Rassismus, Hass, fremdenfeindlich, Hetze, Hassverbrechen"
# Alternative, more specific keyword query:
#query = "Volksverhetzung,volksverhetzend,Hitlergruß,Hakenkreuz,fremdenfeindlich,Fremdenfeindlichkeit,rechtsextremistisch,rechte Gesinnung,rassistisch,Rassismus,Nazi,Naziparole,verfassungswidrig,verfassungsfeindlich"

query_embedding = model.encode(query, convert_to_tensor=True)

cosine_scores = util.cos_sim(query_embedding, document_embeddings)
# torch.topk raises if k exceeds the number of documents, so clamp it.
top_k = min(TOP_K, len(df))
top_results = torch.topk(cosine_scores, k=top_k)
# cosine_scores has a single row (one query); take that row's values/indices.
scores_list = top_results[0][0].tolist()
indices_list = top_results[1][0].tolist()

print("Top similar documents:")
for score, idx in zip(scores_list, indices_list):
    row = df.iloc[int(idx)]
    print(f"Score: {score:.4f}, Title: {row['Title']}, Date: {row['Date']}, URL: {row.get('URL', '')}")

matched = [{"index": idx, "score": score} for idx, score in zip(indices_list, scores_list)]
scores_df = pd.DataFrame(matched, columns=["index", "score"])

# Rows are selected by position, in descending score order.
filtered_df = df.iloc[scores_df["index"]].copy()
filtered_df["score"] = scores_df["score"].values
filtered_df.to_csv("filtered_similar_topics.csv", index=False)


Top similar documents:
Score: 0.3262, Title: Mutmaßlich Tatverdächtige ertappt, Date: 03.02.2025, URL: https://polizei.brandenburg.de/pressemeldung/mutmasslich-tatverdaechtige-ertappt/5410824
Score: 0.3034, Title: Verfassungswidrige Parolen geäußert, Date: 07.03.2025, URL: https://polizei.brandenburg.de/pressemeldung/verfassungswidrige-parolen-geaeussert/5480296
Score: 0.2870, Title: Autos beschmiert und beschädigt, Date: 03.03.2025, URL: https://polizei.brandenburg.de/pressemeldung/autos-beschmiert-und-beschaedigt/5470438
Score: 0.2780, Title: Hakenkreuz-Schmierereien an Schulgebäude, Date: 24.02.2025, URL: https://polizei.brandenburg.de/pressemeldung/hakenkreuz-schmierereien-an-schulgebaeud/5454844
Score: 0.2766, Title: Mehrere Fahrzeuge beschmiert, Date: 25.02.2025, URL: https://polizei.brandenburg.de/pressemeldung/mehrere-fahrzeuge-beschmiert/5457626
Score: 0.2765, Title: Mehrere Graffitis gesprüht, Date: 17.02.2025, URL: https://polizei.brandenburg.de/pressemeldung/mehrere-graffit