### Polizei Scraper

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.robotparser
import csv
import time
import os
import datetime

def scrape_brandenburg_police():
    """Scrape crime press releases from polizei.brandenburg.de into a CSV file.

    Walks the paginated "Kriminalität" search listing, fetches every listed
    article whose URL is not yet in ``data/brandenburg_police_results.csv``
    and rewrites that file with the previously saved rows followed by the
    newly scraped ones (columns: Title, Date, Location, Text, URL).

    Scraping stops when:
      * a listed article is dated on or before ``max_date`` (2025-02-01),
      * a listing page has no result container or no ``<li>`` items,
      * a listing request fails or returns a status other than 200/500,
      * too many consecutive listing pages return HTTP 500.

    The CSV is written in a ``finally`` clause, so rows collected before an
    interruption (e.g. ``KeyboardInterrupt``) or an unexpected error are kept;
    the next run skips them via the URL check.

    Returns:
        None. Progress is printed and results are written to the CSV file.
    """
    base_url = "https://polizei.brandenburg.de"
    url_template = f"{base_url}/suche/typ/null/kategorie/Kriminalit%C3%A4t/{{page}}/1?reset=1"
    file_path = 'data/brandenburg_police_results.csv'
    max_date = datetime.date(2025, 2, 1)
    fieldnames = ["Title", "Date", "Location", "Text", "URL"]
    # Upper bound on back-to-back 500 responses, so a persistently failing
    # server cannot keep the page counter running forever.
    max_consecutive_500 = 10

    existing_urls = set()
    existing_data = []

    if os.path.exists(file_path):
        with open(file_path, 'r', newline='', encoding='utf-8') as f:
            for row in csv.DictReader(f):
                # Keep the CSV's own column names as keys; new rows use the
                # same keys so both can be written by one DictWriter.
                record = {name: row.get(name) or "" for name in fieldnames}
                existing_data.append(record)
                if record["URL"]:
                    existing_urls.add(record["URL"])

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urljoin(base_url, "robots.txt"))
    rp.read()
    if not rp.can_fetch("*", url_template.format(page=1)):
        print("⛔ Scraping disallowed by robots.txt")
        return

    session = requests.Session()
    session.headers.update({"User-Agent": "Mozilla/5.0"})

    results = []
    page = 1
    consecutive_500 = 0
    stop_scraping = False

    try:
        while not stop_scraping:
            page_url = url_template.format(page=page)
            print("🔎 Fetching:", page_url)
            try:
                response = session.get(page_url, timeout=30)
            except requests.exceptions.RequestException as e:
                print(f"⚠️ Request error: {e}")
                break

            if response.status_code == 500:
                consecutive_500 += 1
                if consecutive_500 >= max_consecutive_500:
                    print(f"⛔ {consecutive_500} consecutive pages returned 500. Stopping.")
                    break
                print(f"⚠️ Page {page} returned 500. Skipping.")
                page += 1
                time.sleep(5)
                continue
            consecutive_500 = 0
            if response.status_code != 200:
                print(f"⛔ Unexpected status: {response.status_code}")
                break

            soup = BeautifulSoup(response.text, 'html.parser')
            ul = soup.find("ul", class_=lambda x: x and "pbb-searchlist" in x)
            if not ul:
                print("⛔ No search results container found.")
                break

            items = ul.find_all("li")
            if not items:
                print("⛔ No results found on page.")
                break

            for li in items:
                h4 = li.find("h4")
                a = h4.find("a") if h4 else None
                if not a:
                    continue

                href = a.get("href")
                if not href:
                    # urljoin() would return the bare base URL for a missing
                    # href, which is not an article page.
                    continue

                strong = a.find("strong")
                title = strong.get_text(strip=True) if strong else a.get_text(strip=True)
                article_url = urljoin(base_url, href)

                if article_url in existing_urls:
                    print(f"⏭️ Already saved: {article_url}")
                    continue

                # The listing entry carries the date as "... Artikel vom DD.MM.YYYY ...".
                p = li.find("p")
                date_str = ""
                if p:
                    span = p.find("span")
                    if span:
                        span_text = span.get_text(separator=" ", strip=True)
                        if "Artikel vom" in span_text:
                            date_str = span_text.split("Artikel vom")[-1].strip().split()[0]

                article_date = None
                if date_str:
                    try:
                        article_date = datetime.datetime.strptime(date_str, '%d.%m.%Y').date()
                    except ValueError:
                        # Unparseable date: keep the raw string, never use it as a stop signal.
                        pass

                if article_date and article_date <= max_date:
                    print(f"🛑 Reached max date ({article_date.strftime('%d.%m.%Y')}). Stopping.")
                    stop_scraping = True
                    break

                try:
                    art_response = session.get(article_url, timeout=30)
                except requests.exceptions.RequestException as e:
                    print(f"⚠️ Article fetch failed: {e}")
                    continue

                text = ""
                location = ""
                if art_response.status_code == 200:
                    art_soup = BeautifulSoup(art_response.text, 'html.parser')
                    content = art_soup.find("div", class_="pbb-article-text")
                    text = content.get_text(separator="\n", strip=True) if content else ""

                    ort_tag = art_soup.find("p", class_="pbb-ort")
                    landkreis_tag = art_soup.find("p", class_="pbb-landkreis")
                    if ort_tag:
                        location = ort_tag.get_text(strip=True)
                    if landkreis_tag:
                        landkreis = landkreis_tag.get_text(strip=True)
                        location = f"{location}, {landkreis}" if location else landkreis

                results.append({
                    "Title": title,
                    "Date": date_str,
                    "Location": location,
                    "Text": text,
                    "URL": article_url
                })
                # Guard against the same article showing up on a later page.
                existing_urls.add(article_url)

            page += 1
            time.sleep(3)
    finally:
        out_dir = os.path.dirname(file_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(file_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(existing_data + results)

    print(f"\n✅ Scraping complete. {len(results)} new articles saved.")

In [2]:
# Run the scraper; it prints progress and writes its results to the CSV path set inside the function.
scrape_brandenburg_police()

Fetching page: https://polizei.brandenburg.de/suche/typ/null/kategorie/Kriminalit%C3%A4t/1/1?reset=1


KeyboardInterrupt: 

## Topics

In [None]:
 # ollama run gemma2:2b

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠼ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠴ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠦ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠧ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠇ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠏ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling 7462734796d6...   0% ▕                ▏    0 B/1.6 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 7462734796d6...   0% ▕                ▏    0 B/1.6 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpull

OSError: [Errno 5] Input/output error

In [None]:
import pandas as pd
import json
import re
import ollama
from IPython.display import Markdown, display

def get_response_from_model(prompt, model):
    """Send ``prompt`` to the Ollama model named ``model`` and return its reply.

    The text is taken from the ``'response'`` entry of the result of
    ``ollama.generate`` and returned with surrounding whitespace removed.
    """
    reply = ollama.generate(prompt=prompt, model=model)
    reply_text = reply['response']
    return reply_text.strip()

# --- Topic tagging -----------------------------------------------------------
# For every scraped report: (1) ask the model whether one of `keywords` is
# mentioned, (2) if so, ask it for offence / action / motive tags as JSON,
# (3) store the flags and tags as new DataFrame columns and save to CSV.

model = "gemma2:2b"

keywords = ["Volksverhetzung", "Hitlergruß", "Hakenkreuz", "Rassismus", "Nazi", "Rechtsextremistisch"]

# Closed vocabularies offered to the model for each tag field.
offence = "Vandalism, Property damage, Physical assault, Threat/Intimidation, Hate speech/Incitement, Symbol use/Propaganda, Arson, Robbery, Harassment, Other"
action = "Swastika, Hitlergruß, Graffiti, Slurs, Online posts, Other"
motive = "Racism, Antisemitism, Xenophobia/Anti-immigrant, Homophobia/Transphobia, Anti-Muslim, Anti-left, Glorification of Nazism/Right-wing extremist, Other, Undetermined"

offence_prompt = (
    "You are analyzing a police report that contains clear signs of hate or extremist content.\n"
    "Return a valid JSON object with the following fields:\n"
    f"- OffenceType: a list of terms from this set: {offence}\n"
    f"- Action: a list of terms from this set: {action}\n"
    f"- PossibleMotive: a list of terms from this set: {motive}\n\n"
    "Return empty lists if no valid information is found. Respond with JSON only. No explanation or extra text."
)

df = pd.read_csv("brandenburg_police_results.csv")

offence_types, actions, motives, right_wing_flags, matched_keywords = [], [], [], [], []


def _parse_tagging_response(raw):
    """Return the JSON object found in the model reply ``raw``.

    Tries the whole reply first, then the outermost ``{...}`` span (models
    often wrap JSON in prose or code fences). Anything that does not decode
    to a dict - including valid JSON such as a bare list - yields a dict of
    empty lists, so callers can always use ``.get``.
    """
    candidates = [raw]
    match = re.search(r'\{.*\}', raw, re.DOTALL)
    if match:
        candidates.append(match.group(0))
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return {"OffenceType": [], "Action": [], "PossibleMotive": []}


for i, row in df.iterrows():
    # Empty CSV cells are read back as NaN; send an empty string to the model
    # instead of the literal text "nan".
    title = row['Title'] if pd.notna(row['Title']) else ""
    text = row['Text'] if pd.notna(row['Text']) else ""

    keyword_prompt = (
        f"Below is a document:\n\n"
        f"Title: {title}\n\n"
        f"Text: {text}\n\n"
        "Question: Does this document explicitly mention any of the following terms: "
        f"{', '.join(keywords)}?\n"
        "If yes, reply with only the exact term that is mentioned; if not, reply with 'None'. "
        "Do not include any additional text or explanation."
    )

    keyword_hit = get_response_from_model(keyword_prompt, model).strip()
    # Only an exact reply of one listed keyword counts as a hit.
    is_related = keyword_hit in keywords

    if is_related:
        print(f"✅ Right-wing keyword found: {keyword_hit} in doc {i} | {row['URL']}")
        tagging_prompt = (
            f"Below is a document:\n\n"
            f"Title: {title}\n\n"
            f"Text: {text}\n\n"
            f"{offence_prompt}"
        )
        tagging_response = get_response_from_model(tagging_prompt, model)
        result = _parse_tagging_response(tagging_response)
    else:
        print(f"❌ Not right-wing: doc {i}")
        result = {"OffenceType": [], "Action": [], "PossibleMotive": []}

    matched_keywords.append(keyword_hit if is_related else None)
    right_wing_flags.append(is_related)
    offence_types.append(result.get("OffenceType", []))
    actions.append(result.get("Action", []))
    motives.append(result.get("PossibleMotive", []))

df["RightWingRelated"] = right_wing_flags
df["KeywordMatch"] = matched_keywords
df["OffenceType"] = offence_types
df["Action"] = actions
df["PossibleMotive"] = motives

df.to_csv("data/brandenburg_tagged_documents.csv", index=False)


KeyboardInterrupt: 

## Locations

In [None]:
import re
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# --- Geocoding ---------------------------------------------------------------
# For every report flagged RightWingRelated: ask the model for a geocodable
# location string, geocode it with Nominatim (falling back to the scraped
# Location field), and store the coordinates both in `df` and in a separate
# results table.

geolocator = Nominatim(user_agent="precise-street-geo")
# Rate-limited wrapper: at least one second between geocoding calls.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

prompt_template = """
Extract the most specific geocodable location from the following police report.

Return a single string like:
"Street name, Town, District"
Do not include quotes, explanations, or extra punctuation.
Only return a single clean, geocodable string.
---

Text:
{incident}

Location:
{location}

Locations:
"""

right_wing_df = df[df["RightWingRelated"] == True].copy()
results = []

df["Latitude"] = None
df["Longitude"] = None

for i, row in right_wing_df.iterrows():
    # Missing cells are NaN, which is truthy, so `value or ""` would not
    # catch them; test with pd.notna instead.
    incident_text = row["Text"] if pd.notna(row["Text"]) else ""
    reported_location = row["Location"] if pd.notna(row["Location"]) else ""

    prompt = prompt_template.format(
        incident=incident_text,
        location=reported_location
    )

    # Strip code fences / a leading "python" tag the model may wrap its answer in.
    response = get_response_from_model(prompt, model).strip()
    response = re.sub(r"^```(?:json|python)?\s*", "", response, flags=re.IGNORECASE)
    response = re.sub(r"\s*```$", "", response, flags=re.IGNORECASE)
    response = re.sub(r"^python\s*", "", response, flags=re.IGNORECASE)

    # Candidate queries from most to least specific. An empty string means
    # "no such candidate": an empty model reply must not be sent as
    # ", Brandenburg, Germany", which could match the state as a whole.
    query1 = f"{response}, Brandenburg, Germany" if response else ""
    query2 = f"{reported_location}, Brandenburg, Germany" if reported_location else ""
    query3 = ""
    if reported_location and "," in reported_location:
        # Last comma-separated part of the Location field only.
        query3 = reported_location.split(",")[-1].strip() + ", Brandenburg, Germany"

    location = None
    for label, query in (("primary", query1), ("fallback", query2), ("last fallback", query3)):
        if not query:
            continue
        try:
            location = geocode(query)
        except Exception as e:
            # Best effort: report the failure and try the next candidate.
            print(f"⚠️ Error geocoding {label} for row {i}: {e}")
        if location:
            break

    if location:
        print(f"📍 Row {i}: {location.address} → {location.latitude}, {location.longitude}")
        results.append({
            "Query": response,
            "Latitude": location.latitude,
            "Longitude": location.longitude,
            "TextExcerpt": row["Text"],
            "URL": row["URL"]
        })
        df.at[i, "Latitude"] = location.latitude
        df.at[i, "Longitude"] = location.longitude
    else:
        print(f"⚠️ Geocoding failed for all fallbacks on row {i}: {query1} / {query2} / {query3}")

geo_df = pd.DataFrame(results)
geo_df.to_csv("data/brandenburg_geocoded.csv", index=False)

df.to_csv("data/brandenburg_geocoded_1.csv", index=False)

geo_df.head()


📍 Row 4: Bohmstraße, Alt-Vogelsdorf, Vogelsdorf, Fredersdorf-Vogelsdorf, Märkisch-Oderland, Brandenburg, 15370, Deutschland → 52.505046578980036, 13.745577166933131
📍 Row 5: Lindenallee, Birkenstein, Dahlwitz-Hoppegarten, Hoppegarten, Fredersdorf-Vogelsdorf, Märkisch-Oderland, Brandenburg, 15366, Deutschland → 52.511415728465565, 13.66131801929325
📍 Row 8: Hellersdorfer Weg, Eiche Süd, Eiche, Ahrensfelde, Barnim, Brandenburg, 16356, Deutschland → 52.54879071003449, 13.593324820753207
📍 Row 50: Bootsweg, Zentrum, Schwedt/Oder, Uckermark, Brandenburg, 16303, Deutschland → 53.05156288339012, 14.280726098629222
📍 Row 52: Potsdamer Straße, Ludwigsfelde, Teltow-Fläming, Brandenburg, 14974, Deutschland → 52.299988150000004, 13.2616479
📍 Row 135: Bernau (bei Berlin), Ladestraße, Stadtkern, Bernau, Barnim, Brandenburg, 16321, Deutschland → 52.6753854, 13.5919415
📍 Row 137: Bernau (bei Berlin), Ladestraße, Stadtkern, Bernau, Barnim, Brandenburg, 16321, Deutschland → 52.6753854, 13.5919415
📍 Row 

Unnamed: 0,Query,Latitude,Longitude,TextExcerpt,URL
0,"Bohmstraße, Vogelsdorf, Märkisch-Oderland",52.505047,13.745577,Am frühen Morgen des 09.03.2025 wandte sich ei...,https://polizei.brandenburg.de/pressemeldung/e...
1,"Lindenallee, Dahlwitz-Hoppegarten, Märkisch-Od...",52.511416,13.661318,Am Abend des 08.03.2025 wurde im Bereich der B...,https://polizei.brandenburg.de/pressemeldung/d...
2,"Hellersdorfer Weg, Ahrensfelde, Barnim",52.548791,13.593325,"Wie der Polizei am 08.03.2025 angezeigt wurde,...",https://polizei.brandenburg.de/pressemeldung/h...
3,"Bootsweg, Schwedt/Oder, Uckermark",53.051563,14.280726,Am 06.03.2025 wurden im Bereich des Bootsweges...,https://polizei.brandenburg.de/pressemeldung/h...
4,"Potsdamer Straße, Ludwigsfelde, Teltow-Fläming",52.299988,13.261648,Zeugen haben die Polizei am Donnerstagabend da...,https://polizei.brandenburg.de/pressemeldung/v...


In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.robotparser
import csv
import time
import os
import datetime

def scrape_brandenburg_police():
    """Incrementally scrape crime press releases from polizei.brandenburg.de.

    Loads ``brandenburg_police_results.csv`` if it exists, remembers the
    saved URLs and the most recent saved date, then walks the paginated
    "Kriminalität" search listing and fetches articles that are not saved
    yet. The file is rewritten with the old rows followed by the new ones
    (columns: Title, Date, Location, Text, URL).

    Scraping stops when:
      * a listed article is dated strictly before the most recent saved date
        (NOTE(review): this relies on the listing being ordered newest
        first - confirm against the site),
      * a listing page has no result container or no ``<li>`` items,
      * a listing request fails or returns a status other than 200/500,
      * too many consecutive listing pages return HTTP 500.

    Articles dated exactly on the most recent saved date are still examined
    and skipped by URL only if already saved, so reports published later on
    that day are not lost.

    Returns:
        None. Progress is printed and results are written to the CSV file.
    """
    base_url = "https://polizei.brandenburg.de"

    url_template = f"{base_url}/suche/typ/null/kategorie/Kriminalit%C3%A4t/{{page}}/1?reset=1"

    file_path = 'brandenburg_police_results.csv'
    # Upper bound on back-to-back 500 responses, so a persistently failing
    # server cannot keep the page counter running forever.
    max_consecutive_500 = 10
    existing = []
    existing_urls = set()
    last_date = None

    if os.path.exists(file_path):
        with open(file_path, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                # `or ""` also covers short rows, where DictReader fills
                # missing columns with None.
                record = {
                    "title": row.get("Title") or "",
                    "date": row.get("Date") or "",
                    "location": row.get("Location") or "",
                    "text": row.get("Text") or "",
                    "url": row.get("URL") or ""
                }
                existing.append(record)
                if record["url"]:
                    existing_urls.add(record["url"])
                try:
                    d = datetime.datetime.strptime(record["date"], '%d.%m.%Y').date()
                except ValueError:
                    # Rows without a parseable date do not influence last_date.
                    continue
                if last_date is None or d > last_date:
                    last_date = d
        if last_date:
            print("Last date in CSV:", last_date.strftime('%d.%m.%Y'))

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urljoin(base_url, "robots.txt"))
    rp.read()
    test_url = url_template.format(page=1)
    if not rp.can_fetch("*", test_url):
        print(f"Scraping disallowed for URL: {test_url}")
        return

    session = requests.Session()
    session.headers.update({"User-Agent": "Mozilla/5.0"})

    results = []
    page = 1
    consecutive_500 = 0
    stop_scraping = False

    while not stop_scraping:
        page_url = url_template.format(page=page)
        print("Fetching page:", page_url)
        try:
            response = session.get(page_url, timeout=30)
        except requests.exceptions.RequestException as e:
            print(f"Connection error on page {page}: {e}")
            break

        if response.status_code == 500:
            consecutive_500 += 1
            if consecutive_500 >= max_consecutive_500:
                print(f"{consecutive_500} consecutive pages returned 500 error, stopping.")
                break
            print(f"Page {page} returned 500 error, skipping this page.")
            page += 1
            time.sleep(5)
            continue
        consecutive_500 = 0
        if response.status_code != 200:
            print("No more pages or unexpected status:", response.status_code)
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        ul = soup.find("ul", class_=lambda x: x and "pbb-searchlist" in x)
        if not ul:
            print("No search results container found on page", page)
            break

        items = ul.find_all("li")
        if not items:
            print("No <li> items found on page", page)
            break

        for li in items:
            h4 = li.find("h4")
            a = h4.find("a") if h4 else None
            if not a:
                continue

            href = a.get("href")
            if not href:
                # urljoin() would return the bare base URL for a missing
                # href, which is not an article page.
                continue

            strong = a.find("strong")
            title = strong.get_text(strip=True) if strong else a.get_text(strip=True)
            article_url = urljoin(base_url, href)

            # The listing entry carries the date as "... Artikel vom DD.MM.YYYY ...".
            p = li.find("p")
            date_str = ""
            if p:
                span = p.find("span")
                if span:
                    span_text = span.get_text(separator=" ", strip=True)
                    if "Artikel vom" in span_text:
                        date_str = span_text.split("Artikel vom")[-1].strip().split()[0]

            article_date = None
            if date_str:
                try:
                    article_date = datetime.datetime.strptime(date_str, '%d.%m.%Y').date()
                except ValueError:
                    # Unparseable date: keep the raw string, never use it as a stop signal.
                    pass

            # Stop only on strictly older articles; same-day articles may be
            # new since the previous run and are de-duplicated by URL below.
            if last_date and article_date and article_date < last_date:
                print(
                    f"Reached older article dated {article_date.strftime('%d.%m.%Y')} "
                    f"(last saved: {last_date.strftime('%d.%m.%Y')}). Stopping."
                )
                stop_scraping = True
                break

            if article_url in existing_urls:
                continue

            try:
                art_response = session.get(article_url, timeout=30)
            except requests.exceptions.RequestException as e:
                print(f"Connection error retrieving article page: {e}")
                continue

            text = ""
            location = ""
            if art_response.status_code == 200:
                art_soup = BeautifulSoup(art_response.text, 'html.parser')
                content = art_soup.find("div", class_="pbb-article-text")
                text = content.get_text(separator="\n", strip=True) if content else ""

                ort_tag = art_soup.find("p", class_="pbb-ort")
                landkreis_tag = art_soup.find("p", class_="pbb-landkreis")
                if ort_tag:
                    location = ort_tag.get_text(strip=True)
                if landkreis_tag:
                    landkreis = landkreis_tag.get_text(strip=True)
                    location = f"{location}, {landkreis}" if location else landkreis

            results.append({
                "title": title,
                "date": date_str,
                "location": location,
                "text": text,
                "url": article_url
            })
            # Guard against the same article showing up on a later page.
            existing_urls.add(article_url)

        page += 1
        time.sleep(3)

    combined_results = existing + results
    with open(file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Date", "Location", "Text", "URL"])
        for r in combined_results:
            writer.writerow([r["title"], r["date"], r["location"], r["text"], r["url"]])

    print(f"\nScraping complete. Found {len(results)} new articles.")