### Polizei Scraper

In [7]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.robotparser
import csv
import time
from requests.models import PreparedRequest
import os
import datetime

def _parse_german_date(value):
    """Parse a ``DD.MM.YYYY`` string into a ``datetime.date``; return ``None`` if it does not parse."""
    try:
        return datetime.datetime.strptime(value, '%d.%m.%Y').date()
    except (TypeError, ValueError):
        return None


def _load_existing_records(file_path):
    """Load previously saved rows from ``file_path``.

    Returns ``(records, last_date)`` where ``records`` is a list of dicts with
    the lowercase keys used internally (``title``, ``date``, ``location``,
    ``text``, ``url``) and ``last_date`` is the newest parseable date among
    them, or ``None`` when the file is missing or holds no parseable date.
    """
    records = []
    last_date = None
    if not os.path.exists(file_path):
        return records, last_date
    with open(file_path, 'r', newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            # Map the CSV header names ("Title", ...) to the lowercase keys;
            # `or ""` also covers short rows, for which DictReader yields None.
            record = {
                "title": row.get("Title") or "",
                "date": row.get("Date") or "",
                "location": row.get("Location") or "",
                "text": row.get("Text") or "",
                "url": row.get("URL") or ""
            }
            records.append(record)
            parsed = _parse_german_date(record["date"])
            if parsed and (last_date is None or parsed > last_date):
                last_date = parsed
    return records, last_date


def _extract_list_date(li):
    """Return the token following "Artikel vom" in the item's first ``<p><span>``, or ``""``."""
    p = li.find("p")
    span = p.find("span") if p else None
    if not span:
        return ""
    span_text = span.get_text(separator=" ", strip=True)
    if "Artikel vom" not in span_text:
        return ""
    tokens = span_text.split("Artikel vom")[-1].split()
    # Guard against a span that ends right after "Artikel vom".
    return tokens[0] if tokens else ""


def _fetch_article(session, article_url, timeout):
    """Download one article page and return ``(text, location)``.

    Both values are empty strings when the request fails or does not return
    HTTP 200.
    """
    try:
        art_response = session.get(article_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Could not fetch article {article_url}: {exc}")
        return "", ""
    if art_response.status_code != 200:
        return "", ""
    art_soup = BeautifulSoup(art_response.text, 'html.parser')
    content = art_soup.find("div", class_="pbb-article-text")
    text = content.get_text(separator="\n", strip=True) if content else ""
    ort_tag = art_soup.find("p", class_="pbb-ort")
    landkreis_tag = art_soup.find("p", class_="pbb-landkreis")
    location = ""
    if ort_tag:
        location = ort_tag.get_text(strip=True)
    if landkreis_tag:
        landkreis = landkreis_tag.get_text(strip=True)
        location = f"{location}, {landkreis}" if location else landkreis
    return text, location


def scrape_brandenburg_police(params, file_path='brandenburg_police_results.csv',
                              timeout=30, delay=3):
    """Scrape search results from polizei.brandenburg.de into a CSV file.

    Rows already present in ``file_path`` are kept. When that file contains at
    least one parseable date, the search start date
    (``search[onlineDateFrom]``) is moved to the day after the newest saved
    date, and list items dated on or before it are skipped.

    Args:
        params: Query parameters for the search URL. The dict is copied; the
            caller's object is never modified.
        file_path: CSV file to read existing rows from and to write the
            combined rows to.
        timeout: Timeout in seconds passed to every HTTP request.
        delay: Seconds to sleep after each results page.

    Returns:
        None. The combined rows are written to ``file_path``. Nothing is
        written when robots.txt disallows the search URL.
    """
    base_url = "https://polizei.brandenburg.de"
    url_pattern = f"{base_url}/suche/typ/null/{{page}}/1"

    # Work on a private copy so updating the start date below does not leak
    # into the dict the caller passed in.
    params = dict(params or {})

    req = PreparedRequest()
    req.prepare_url(url_pattern.format(page=1), params)
    test_url = req.url
    print("Test URL:", test_url)

    existing, last_date = _load_existing_records(file_path)
    if last_date:
        new_start = (last_date + datetime.timedelta(days=1)).strftime('%d.%m.%Y')
        params["search[onlineDateFrom]"] = new_start
        print("Updated start date to:", new_start)

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urljoin(base_url, "robots.txt"))
    rp.read()
    if not rp.can_fetch("*", test_url):
        print(f"Scraping disallowed for URL: {test_url}")
        return

    results = []
    # The context manager closes the session's connections when paging ends.
    with requests.Session() as session:
        session.headers.update({"User-Agent": "Mozilla/5.0"})
        page = 1
        while True:
            url = url_pattern.format(page=page)
            try:
                response = session.get(url, params=params, timeout=timeout)
            except requests.RequestException as exc:
                # Stop paging but still save what has been collected so far.
                print(f"Request failed for results page {page}: {exc}")
                break
            # A non-200 status, a missing result list or an empty list all
            # end the paging loop.
            if response.status_code != 200:
                break
            soup = BeautifulSoup(response.text, 'html.parser')
            ul = soup.find("ul", class_=lambda x: x and "pbb-searchlist" in x)
            if not ul:
                break
            items = ul.find_all("li")
            if not items:
                break
            for li in items:
                date = _extract_list_date(li)

                # Decide whether the item is already covered by the CSV
                # before downloading its article page. Unparseable dates are
                # kept, as before.
                if last_date and date:
                    record_date = _parse_german_date(date)
                    if record_date and record_date <= last_date:
                        continue

                article_url = ""
                title = ""
                text = ""
                location = ""
                h4 = li.find("h4")
                a = h4.find("a") if h4 else None
                if a:
                    strong = a.find("strong")
                    title = strong.get_text(strip=True) if strong else a.get_text(strip=True)
                    article_url = urljoin(base_url, a.get("href"))
                    text, location = _fetch_article(session, article_url, timeout)

                results.append({
                    "title": title,
                    "date": date,
                    "location": location,
                    "text": text,
                    "url": article_url
                })
            page += 1
            time.sleep(delay)

    combined_results = existing + results

    with open(file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Date", "Location", "Text", "URL"])
        for r in combined_results:
            writer.writerow([r["title"], r["date"], r["location"], r["text"], r["url"]])

In [9]:
# Query parameters for the police search page. Key order is kept as-is because
# the dict is passed to the request in this order (see the printed test URL).
query_params = {
    "search[query]": "",
    "search[region]": "",
    "search[dienststelle]": "",
    "search[kategorien]": "Kriminalität",
    # Start date in DD.MM.YYYY. If the results CSV already exists, the scraper
    # searches from the day after the newest saved date instead.
    "search[onlineDateFrom]": "01.02.2025", 
    "search[docType]": "",
    "search[tags]": "",
    "search[zeitraum]": "",
    "search[onlineDateTo]": "",
    "search[searchButton2]": "Suchen »"
}

# Runs the scrape and (re)writes brandenburg_police_results.csv.
scrape_brandenburg_police(query_params)

Test URL: https://polizei.brandenburg.de/suche/typ/null/1/1?search%5Bquery%5D=&search%5Bregion%5D=&search%5Bdienststelle%5D=&search%5Bkategorien%5D=Kriminalit%C3%A4t&search%5BonlineDateFrom%5D=01.02.2025&search%5BdocType%5D=&search%5Btags%5D=&search%5Bzeitraum%5D=&search%5BonlineDateTo%5D=&search%5BsearchButton2%5D=Suchen+%C2%BB
Updated start date to: 11.03.2025


## Topics

In [None]:
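 # Prerequisite: the gemma2:2b model must be available to Ollama before the tagging cell runs.
 # Start it from a terminal rather than from this notebook (the captured output below ends in an I/O error):
 # ollama run gemma2:2b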

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠼ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠴ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠦ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠧ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠇ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠏ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling 7462734796d6...   0% ▕                ▏    0 B/1.6 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 7462734796d6...   0% ▕                ▏    0 B/1.6 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpull

OSError: [Errno 5] Input/output error

In [11]:
import pandas as pd
import ollama
from IPython.display import Markdown, display

def get_response_from_model(prompt, model):
    """Send ``prompt`` to the Ollama model ``model`` and return its reply.

    The reply is taken from the ``'response'`` entry of the result of
    ``ollama.generate`` and returned with surrounding whitespace removed.
    """
    generation = ollama.generate(prompt=prompt, model=model)
    reply_text = generation['response']
    return reply_text.strip()

# Terms the model is asked to look for in each press release.
keywords = "Volksverhetzung, Hitlergruß, Hakenkreuz, Rassismus, Nazi, Rechtsextremistisch"

model = "gemma2:2b"

df = pd.read_csv("brandenburg_police_results.csv")

# One model reply per row, in row order; becomes the "Tag" column below.
tags = []
for i, row in df.iterrows():
    # The scraper writes empty Title/Text cells when a list item has no link
    # or the article could not be fetched. pandas reads those cells back as
    # NaN, which would otherwise put the literal text "nan" into the prompt.
    title = "" if pd.isna(row['Title']) else row['Title']
    text = "" if pd.isna(row['Text']) else row['Text']
    prompt = (
        f"Below is a document:\n\n"
        f"Title: {title}\n\n"
        f"Text: {text}\n\n"
        "Question: Does this document explicitly mention any of the following terms: "
        f"{keywords}? If yes, reply with only the exact term that is mentioned; if not, reply with 'None'. "
        #"If the document explicitly mentions (verbatim) another term verbatim that clearly indicates a right-wing or hate-related topic not in the list, reply with that term instead. "
        "Do not include any additional text or explanation."
    )
    tag = get_response_from_model(prompt, model)
    print(f"Document {i} tag: {tag} url: {row['URL']}")
    tags.append(tag)

df["Tag"] = tags
df.to_csv("tagged_documents.csv", index=False)

Document 0 tag: None url: https://polizei.brandenburg.de/pressemeldung/keller-aufgebrochen/5486361
Document 1 tag: None url: https://polizei.brandenburg.de/pressemeldung/jugendlicher-fiel-unangenehm-auf/5486356
Document 2 tag: None url: https://polizei.brandenburg.de/pressemeldung/ermittlungen-laufen/5486345
Document 3 tag: None url: https://polizei.brandenburg.de/pressemeldung/einbrecher-im-haus/5486343
Document 4 tag: Volksverhetzung url: https://polizei.brandenburg.de/pressemeldung/ein-fall-fuer-den-polizeilichen-staatssc/5486325
Document 5 tag: Hakenkreuz url: https://polizei.brandenburg.de/pressemeldung/der-staatsschutz-ermittelt/5486316
Document 6 tag: None url: https://polizei.brandenburg.de/pressemeldung/raeuber-schlugen-auch-noch-zu/5486294
Document 7 tag: None url: https://polizei.brandenburg.de/pressemeldung/als-taeter-identifiziert/5486227
Document 8 tag: Hakenkreuz url: https://polizei.brandenburg.de/pressemeldung/hakenkreuz-geschmiert/5486210
Document 9 tag: None url: htt

In [12]:
# Keep only the rows whose tag is something other than "none" (compared
# case-insensitively) and save them to their own CSV file.
df_filtered = df.loc[df["Tag"].str.lower().ne("none")]
df_filtered.to_csv("tagged_documents_filtered.csv", index=False)