### Polizei Scraper

In [19]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.robotparser
import csv
import time
import os
import datetime

def scrape_berlin_police():
    """Scrape the 2025 Berlin police press-release archive into a CSV file.

    Walks the paginated archive listing, fetches the full text of every
    article that is not yet stored, and rewrites
    ``data/berlin_police_results.csv`` with the previously saved rows
    followed by the newly scraped ones.

    The listing is assumed to be ordered newest-first: scraping stops at the
    first article whose URL is already in the CSV or whose date is before
    1 February 2025. It also stops when a listing page cannot be fetched or
    contains no list items. Nothing is fetched if robots.txt disallows the
    archive URL.

    Returns:
        None. Progress is printed and the results are written to disk.
    """
    base_url = "https://www.berlin.de"
    year_url = "https://www.berlin.de/polizei/polizeimeldungen/archiv/2025/"
    file_path = 'data/berlin_police_results.csv'
    max_date = datetime.date(2025, 2, 1)

    # Load what a previous run already saved so it is neither re-fetched
    # nor lost when the file is rewritten at the end.
    existing_urls = set()
    existing_rows = []
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8', newline='') as f:
            reader = csv.DictReader(f)
            for row in reader:
                existing_urls.add(row["URL"])
                existing_rows.append(row)

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urljoin(base_url, "robots.txt"))
    rp.read()
    if not rp.can_fetch("*", year_url):
        print("⛔ Scraping not allowed by robots.txt.")
        return

    session = requests.Session()
    session.headers.update({"User-Agent": "Mozilla/5.0"})

    new_rows = []
    page = 1
    stop_scraping = False

    while not stop_scraping:
        page_url = year_url if page == 1 else f"{year_url}?page_at_1_0={page}#headline_1_0"
        print(f"🔎 Fetching: {page_url}")
        try:
            res = session.get(page_url, timeout=30)
            if res.status_code != 200:
                print("⛔ Stopped. No more pages.")
                break
        except Exception as e:
            print(f"⚠️ Error fetching page {page}: {e}")
            break

        soup = BeautifulSoup(res.text, "html.parser")
        items = soup.select("ul.list--tablelist > li")
        if not items:
            print("📭 No more list items found.")
            break

        for item in items:
            date_div = item.find("div", class_="date")
            text_div = item.find("div", class_="text")
            if not date_div or not text_div:
                continue

            # An entry without a usable link cannot be scraped. Skip it
            # instead of raising, which would abort the run before the
            # rows collected so far are written.
            link = text_div.find("a")
            if link is None or not link.get("href"):
                continue

            article_url = urljoin(base_url, link["href"])
            if article_url in existing_urls:
                stop_scraping = True
                print("🛑 Reached already-saved article, stopping.")
                break

            title = link.get_text(strip=True)
            # Only the first whitespace-separated token is parsed as the date.
            date_str = date_div.get_text(strip=True).split(" ")[0]
            try:
                article_date = datetime.datetime.strptime(date_str, "%d.%m.%Y").date()
                if article_date < max_date:
                    stop_scraping = True
                    print(f"🛑 Reached article older than cutoff: {article_date}")
                    break
            except ValueError:
                # Entries with an unparseable date are skipped.
                continue

            location = ""
            loc_span = text_div.find("span", class_="category")
            if loc_span and "Ereignisort:" in loc_span.text:
                location = loc_span.text.replace("Ereignisort:", "").strip()

            # Best effort: a single failing article is reported and skipped
            # rather than aborting the whole run.
            try:
                art_res = session.get(article_url, timeout=30)
                if art_res.status_code != 200:
                    raise Exception("Bad status")
                art_soup = BeautifulSoup(art_res.text, "html.parser")
                content = art_soup.find("div", class_="textile")
                text = content.get_text(separator="\n", strip=True) if content else ""
            except Exception as e:
                print(f"⚠️ Failed to fetch article: {article_url} – {e}")
                continue

            print(f"📍 {location} – {date_str} – {title}")
            new_rows.append({
                "Title": title,
                "Date": date_str,
                "Location": location,
                "Text": text,
                "URL": article_url
            })

        page += 1
        time.sleep(2)

    # Create the output directory first so that a missing "data/" folder
    # does not discard everything that was just scraped.
    out_dir = os.path.dirname(file_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    all_rows = existing_rows + new_rows
    with open(file_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=["Title", "Date", "Location", "Text", "URL"])
        writer.writeheader()
        writer.writerows(all_rows)

    print(f"\n✅ Done. {len(new_rows)} new articles saved to {file_path}.")

In [21]:
scrape_berlin_police()

🔎 Fetching: https://www.berlin.de/polizei/polizeimeldungen/archiv/2025/
📍 Mitte – 16.04.2025 – Versuchter Raub – Personen mit Messer und Pfefferspray verletzt
📍 Steglitz-Zehlendorf – 16.04.2025 – Mann in hilflosem Zustand - Polizei bittet um Mithilfe
📍 Marzahn-Hellersdorf – 16.04.2025 – Gefährliche Körperverletzung in der Öffentlichkeit
📍 Marzahn-Hellersdorf – 16.04.2025 – Nationalsozialistische Parolen gerufen
📍 Treptow-Köpenick – 16.04.2025 – Verkehrsunfall – Betonmischer-Trommel beim Abbiegen auf die Fahrbahn gefallen
📍 Friedrichshain-Kreuzberg – 15.04.2025 – Nach Erpressungsversuch in die Flucht geschlagen
📍 Friedrichshain-Kreuzberg – 15.04.2025 – Motorradfahrer fährt Mann an
📍 bezirksübergreifend – 15.04.2025 – Durchsuchungen und Festnahmen wegen des Verdachts der Schutzgelderpressung
📍 Mitte – 15.04.2025 – Festnahme nach mutmaßlichem Drogenhandel
📍 Friedrichshain-Kreuzberg – 15.04.2025 – Autofahrer fährt Fußgänger an
📍 Charlottenburg-Wilmersdorf – 15.04.2025 – Untersuchungshaft w

## Topics

In [None]:
# Shell command (run outside the notebook) for the model used below: ollama run gemma2:2b

In [None]:
import pandas as pd
import json
import re
import ollama
from IPython.display import Markdown, display

def get_response_from_model(prompt, model):
    """Generate a completion for ``prompt`` with the given Ollama ``model``.

    Returns the ``'response'`` entry of the result of ``ollama.generate``
    with leading and trailing whitespace removed.
    """
    reply = ollama.generate(prompt=prompt, model=model)
    generated_text = reply['response']
    return generated_text.strip()

# Name of the Ollama model used for every prompt in this notebook.
model = "gemma2:2b"

# Terms whose explicit mention marks a report as right-wing related.
keywords = ["Staatsschutz", "Volksverhetzung", "Hitlergruß", "Hakenkreuz", "Rassismus", "Nazi", "Rechtsextremistisch"]

# Closed vocabularies the model is asked to choose from when tagging.
offence = "Vandalism, Property damage, Physical assault, Threat/Intimidation, Hate speech/Incitement, Symbol use/Propaganda, Arson, Robbery, Harassment, Other"
action = "Swastika, Hitlergruß, Graffiti, Slurs, Online posts, Other"
motive = "Racism, Antisemitism, Xenophobia/Anti-immigrant, Homophobia/Transphobia, Anti-Muslim, Anti-left, Glorification of Nazism/Right-wing extremist, Other, Undetermined"

offence_prompt = (
    "You are analyzing a police report that contains clear signs of hate or extremist content.\n"
    "Return a valid JSON object with the following fields:\n"
    f"- OffenceType: a list of terms from this set: {offence}\n"
    f"- Action: a list of terms from this set: {action}\n"
    f"- PossibleMotive: a list of terms from this set: {motive}\n\n"
    "Return empty lists if no valid information is found. Respond with JSON only. No explanation or extra text."
)

# Case-folded keyword -> canonical spelling, so a reply such as
# "volksverhetzung" or '"Nazi".' is still recognised as a hit.
keyword_lookup = {kw.casefold(): kw for kw in keywords}


def _parse_tagging_response(raw):
    """Return the JSON object contained in ``raw``, or empty tag lists.

    Tries the whole reply first, then the outermost ``{...}`` span. Anything
    that does not decode to a dict (invalid JSON, a bare list, a string) is
    rejected so that callers can always use ``.get`` on the result.
    """
    candidates = [raw]
    embedded = re.search(r'\{.*\}', raw, re.DOTALL)
    if embedded:
        candidates.append(embedded.group(0))
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return {"OffenceType": [], "Action": [], "PossibleMotive": []}


df = pd.read_csv("data/berlin_police_results.csv")

offence_types, actions, motives, right_wing_flags, matched_keywords = [], [], [], [], []

for i, row in df.iterrows():
    # Empty CSV cells are read back by pandas as NaN; send an empty string
    # to the model instead of the literal text "nan".
    title = row['Title'] if pd.notna(row['Title']) else ""
    text = row['Text'] if pd.notna(row['Text']) else ""

    keyword_prompt = (
        f"Below is a document:\n\n"
        f"Title: {title}\n\n"
        f"Text: {text}\n\n"
        "Question: Does this document explicitly mention any of the following terms: "
        f"{', '.join(keywords)}?\n"
        "If yes, reply with only the exact term that is mentioned; if not, reply with 'None'. "
        "Do not include any additional text or explanation."
    )

    raw_hit = get_response_from_model(keyword_prompt, model)
    # Strip quoting/markdown/punctuation the model may wrap around the term,
    # then map it back to the canonical keyword (None when it is not one).
    keyword_hit = keyword_lookup.get(raw_hit.strip(" \t\r\n\"'`*.").casefold())
    is_related = keyword_hit is not None

    if is_related:
        print(f"✅ Right-wing keyword found: {keyword_hit} in doc {i} | {row['URL']}")
        tagging_prompt = (
            f"Below is a document:\n\n"
            f"Title: {title}\n\n"
            f"Text: {text}\n\n"
            f"{offence_prompt}"
        )
        tagging_response = get_response_from_model(tagging_prompt, model)
        result = _parse_tagging_response(tagging_response)
    else:
        print(f"❌ Not right-wing: doc {i}")
        result = {"OffenceType": [], "Action": [], "PossibleMotive": []}

    matched_keywords.append(keyword_hit)
    right_wing_flags.append(is_related)
    offence_types.append(result.get("OffenceType", []))
    actions.append(result.get("Action", []))
    motives.append(result.get("PossibleMotive", []))

df["RightWingRelated"] = right_wing_flags
df["KeywordMatch"] = matched_keywords
df["OffenceType"] = offence_types
df["Action"] = actions
df["PossibleMotive"] = motives

df.to_csv("berlin_tagged_documents.csv", index=False)


❌ Not right-wing: doc 0
❌ Not right-wing: doc 1
❌ Not right-wing: doc 2
❌ Not right-wing: doc 3
❌ Not right-wing: doc 4
❌ Not right-wing: doc 5
❌ Not right-wing: doc 6
❌ Not right-wing: doc 7
❌ Not right-wing: doc 8
❌ Not right-wing: doc 9
❌ Not right-wing: doc 10
❌ Not right-wing: doc 11
❌ Not right-wing: doc 12
❌ Not right-wing: doc 13
❌ Not right-wing: doc 14
❌ Not right-wing: doc 15
❌ Not right-wing: doc 16
❌ Not right-wing: doc 17
❌ Not right-wing: doc 18
❌ Not right-wing: doc 19
❌ Not right-wing: doc 20
❌ Not right-wing: doc 21
❌ Not right-wing: doc 22
✅ Right-wing keyword found: Volksverhetzung in doc 23 | https://www.berlin.de/polizei/polizeimeldungen/2025/pressemitteilung.1551334.php
❌ Not right-wing: doc 24
❌ Not right-wing: doc 25
✅ Right-wing keyword found: Rechtsextremistisch in doc 26 | https://www.berlin.de/polizei/polizeimeldungen/2025/pressemitteilung.1551330.php
❌ Not right-wing: doc 27
❌ Not right-wing: doc 28
❌ Not right-wing: doc 29
❌ Not right-wing: doc 30
❌ Not r

## Locations

In [None]:
import re
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

geolocator = Nominatim(user_agent="precise-street-geo")
# Wrap the geocoder so consecutive calls are at least one second apart.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

prompt_template = """
Extract the most specific geocodable location from the following police report.

Return a single string like:
"Street name, Town, District"
Do not include quotes, explanations, or extra punctuation.
Only return a single clean, geocodable string.
---

Text:
{incident}

Location:
{location}

Locations:
"""

# Only reports flagged as right-wing related are geocoded.
right_wing_df = df[df["RightWingRelated"] == True].copy()
results = []

df["Latitude"] = None
df["Longitude"] = None

for i, row in right_wing_df.iterrows():
    # NaN is truthy, so `value or ""` would pass "nan" into the prompt;
    # test for missing values explicitly instead.
    incident_text = row['Text'] if pd.notna(row['Text']) else ""
    location_hint = row['Location'] if pd.notna(row['Location']) else ""

    prompt = prompt_template.format(
        incident=incident_text,
        location=location_hint
    )

    # Remove code fences or a leading "python" the model may add to its reply.
    response = get_response_from_model(prompt, model).strip()
    response = re.sub(r"^```(?:json|python)?\s*", "", response, flags=re.IGNORECASE)
    response = re.sub(r"\s*```$", "", response, flags=re.IGNORECASE)
    response = re.sub(r"^python\s*", "", response, flags=re.IGNORECASE)

    # An empty reply would turn into ", Berlin, Germany", which geocodes to
    # the city as a whole and hides the better fallbacks, so skip it.
    query1 = f"{response}, Berlin, Germany" if response else ""
    query2 = f"{location_hint}, Berlin, Germany" if location_hint else ""
    query3 = ""

    # Last resort: only the part after the final comma of the scraped location.
    if location_hint and "," in location_hint:
        query3 = location_hint.split(",")[-1].strip() + ", Berlin, Germany"

    location = None

    # Try the queries from most to least specific; stop at the first hit.
    for label, query in (
        ("primary", query1),
        ("fallback", query2),
        ("last fallback", query3),
    ):
        if not query:
            continue
        try:
            location = geocode(query)
        except Exception as e:
            # Best effort: report the failure and move on to the next query.
            print(f"⚠️ Error geocoding {label} for row {i}: {e}")
        if location:
            break

    if location:
        print(f"📍 Row {i}: {location.address} → {location.latitude}, {location.longitude}")
        results.append({
            "Query": response,
            "Latitude": location.latitude,
            "Longitude": location.longitude,
            "TextExcerpt": row["Text"],
            "URL": row["URL"]
        })
        # right_wing_df keeps df's index labels, so `i` addresses the same row in df.
        df.at[i, "Latitude"] = location.latitude
        df.at[i, "Longitude"] = location.longitude
    else:
        print(f"⚠️ Geocoding failed for all fallbacks on row {i}: {query1} / {query2} / {query3}")

# geo_df = pd.DataFrame(results)
# geo_df.to_csv("data/berlin_geocoded.csv", index=False)

df.to_csv("data/berlin_geocoded.csv", index=False)

df.head()


📍 Row 23: Elise-und-Otto-Hampel-Weg, Brüsseler Kiez, Wedding, Mitte, Berlin, 13353, Deutschland → 52.54729095238678, 13.356507892676314
📍 Row 26: Müllerstraße, Englisches Viertel, Wedding, Mitte, Berlin, 13349, Deutschland → 52.55980013378506, 13.33538174091742
📍 Row 47: Arnswalder Platz, Bötzowkiez, Prenzlauer Berg, Pankow, Berlin, 10407, Deutschland → 52.5330863, 13.437229919696318
📍 Row 53: Köpenicker Straße, Luisenstadt, Mitte, Berlin, 10179, Deutschland → 52.51120356141707, 13.415609900981751
📍 Row 58: Tempelhof-Schöneberg, Berlin, Teltow, Deutschland → 52.4487714, 13.3893312
📍 Row 59: Marzahn-Hellersdorf, Berlin, Deutschland → 52.5225225, 13.5876634
📍 Row 95: Hansaviertel, Mitte, Berlin, 10557, Deutschland → 52.5191234, 13.3418725
📍 Row 105: Alexanderplatz, Mitte, Berlin, 10178, Deutschland → 52.5219814, 13.413635762871088
📍 Row 111: Mitte, Berlin, Deutschland → 52.517012, 13.3888222
📍 Row 116: Gartenfelder Straße, Haselhorst, Spandau, Berlin, 13599, Deutschland → 52.544192133244

Unnamed: 0,Query,Latitude,Longitude,TextExcerpt,URL
0,"Elise-und-Otto-Hampel-Weg, Wedding, Mitte",52.547291,13.356508,Nr. 0932\nGestern Mittag leiteten Polizeikräft...,https://www.berlin.de/polizei/polizeimeldungen...
1,"Müllerstraße, Wedding, Mitte",52.5598,13.335382,Nr. 0929\nMit rund 200 Einsatzkräften schützte...,https://www.berlin.de/polizei/polizeimeldungen...
2,"Arnswalder Platz, Prenzlauer Berg, Pankow",52.533086,13.43723,Nr. 0907\nEine Frau wurde gestern Nachmittag i...,https://www.berlin.de/polizei/polizeimeldungen...
3,"Köpenicker Straße, Mitte, Berlin",52.511204,13.41561,Nr. 0900\nGestern Mittag wurde ein Mann in Mit...,https://www.berlin.de/polizei/polizeimeldungen...
4,"Waldsasser Straße, Marienfelde, Tempelhof-Schö...",52.448771,13.389331,Nr. 0895\nHeute Morgen soll in Marienfelde ein...,https://www.berlin.de/polizei/polizeimeldungen...


In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.robotparser
import csv
import time
import os
import datetime

def scrape_brandenburg_police():
    """Scrape Brandenburg police crime reports into a CSV file.

    Pages through the "Kriminalität" search listing, fetches each article
    that is not yet stored, and rewrites ``brandenburg_police_results.csv``
    with the previously saved rows followed by the newly scraped ones.

    The listing is assumed to be ordered newest-first: scraping stops at the
    first article dated before the newest date already in the CSV. Articles
    whose URL is already saved are skipped without being fetched, so reports
    published later on the last saved day are still picked up. Scraping also
    stops on a connection error, a non-200/500 status, a page without
    results, or too many consecutive HTTP 500 pages. Nothing is fetched if
    robots.txt disallows the search URL.

    Returns:
        None. Progress is printed and the results are written to disk.
    """
    base_url = "https://polizei.brandenburg.de"

    url_template = f"{base_url}/suche/typ/null/kategorie/Kriminalit%C3%A4t/{{page}}/1?reset=1"

    file_path = 'brandenburg_police_results.csv'
    # Upper bound on back-to-back HTTP 500 pages, so a permanently failing
    # server cannot keep the loop running forever.
    max_consecutive_500 = 5
    existing = []
    seen_urls = set()
    last_date = None

    # Load previously saved rows and remember the newest date among them.
    if os.path.exists(file_path):
        with open(file_path, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                record = {
                    "title": row.get("Title", ""),
                    "date": row.get("Date", ""),
                    "location": row.get("Location", ""),
                    "text": row.get("Text", ""),
                    "url": row.get("URL", "")
                }
                existing.append(record)
                if record["url"]:
                    seen_urls.add(record["url"])
                try:
                    d = datetime.datetime.strptime(record["date"], '%d.%m.%Y').date()
                    if last_date is None or d > last_date:
                        last_date = d
                except ValueError:
                    # Rows without a parseable date do not affect last_date.
                    pass
        if last_date:
            print("Last date in CSV:", last_date.strftime('%d.%m.%Y'))

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urljoin(base_url, "robots.txt"))
    rp.read()
    test_url = url_template.format(page=1)
    if not rp.can_fetch("*", test_url):
        print(f"Scraping disallowed for URL: {test_url}")
        return

    session = requests.Session()
    session.headers.update({"User-Agent": "Mozilla/5.0"})

    results = []
    page = 1
    stop_scraping = False
    consecutive_500 = 0

    while not stop_scraping:
        page_url = url_template.format(page=page)
        print("Fetching page:", page_url)
        try:
            response = session.get(page_url, timeout=30)
        except requests.exceptions.RequestException as e:
            print(f"Connection error on page {page}: {e}")
            break

        if response.status_code == 500:
            consecutive_500 += 1
            if consecutive_500 >= max_consecutive_500:
                print(
                    f"Page {page} returned 500 error; giving up after "
                    f"{consecutive_500} consecutive failures."
                )
                break
            print(f"Page {page} returned 500 error, skipping this page.")
            page += 1
            time.sleep(5)
            continue
        elif response.status_code != 200:
            print("No more pages or unexpected status:", response.status_code)
            break
        consecutive_500 = 0

        soup = BeautifulSoup(response.text, 'html.parser')
        ul = soup.find("ul", class_=lambda x: x and "pbb-searchlist" in x)
        if not ul:
            print("No search results container found on page", page)
            break

        items = ul.find_all("li")
        if not items:
            print("No <li> items found on page", page)
            break

        for li in items:
            h4 = li.find("h4")
            a = h4.find("a") if h4 else None
            if not a:
                continue

            href = a.get("href")
            if not href:
                # Without a target, urljoin would silently yield the base URL.
                continue

            strong = a.find("strong")
            title = strong.get_text(strip=True) if strong else a.get_text(strip=True)
            article_url = urljoin(base_url, href)

            # The date is the first token after "Artikel vom" in the
            # item's first <p><span>.
            p = li.find("p")
            date_str = ""
            if p:
                span = p.find("span")
                if span:
                    span_text = span.get_text(separator=" ", strip=True)
                    if "Artikel vom" in span_text:
                        date_str = span_text.split("Artikel vom")[-1].strip().split()[0]

            article_date = None
            if date_str:
                try:
                    article_date = datetime.datetime.strptime(date_str, '%d.%m.%Y').date()
                except ValueError:
                    pass

            # Stop only on strictly older articles: reports from the last
            # saved day may have been published after the previous run.
            if last_date and article_date and article_date < last_date:
                print(
                    f"Reached older article dated {article_date.strftime('%d.%m.%Y')} "
                    f"(last saved: {last_date.strftime('%d.%m.%Y')}). Stopping."
                )
                stop_scraping = True
                break

            # Already stored (or already seen in this run): do not re-fetch.
            if article_url in seen_urls:
                continue

            try:
                art_response = session.get(article_url, timeout=30)
            except requests.exceptions.RequestException as e:
                print(f"Connection error retrieving article page: {e}")
                continue

            if art_response.status_code != 200:
                # Keep the listing entry even though the body is unavailable.
                text = ""
                location = ""
            else:
                art_soup = BeautifulSoup(art_response.text, 'html.parser')
                content = art_soup.find("div", class_="pbb-article-text")
                text = content.get_text(separator="\n", strip=True) if content else ""

                # Location is "<Ort>, <Landkreis>", using whichever parts exist.
                ort_tag = art_soup.find("p", class_="pbb-ort")
                landkreis_tag = art_soup.find("p", class_="pbb-landkreis")
                location_parts = []
                if ort_tag and ort_tag.get_text(strip=True):
                    location_parts.append(ort_tag.get_text(strip=True))
                if landkreis_tag:
                    location_parts.append(landkreis_tag.get_text(strip=True))
                location = ", ".join(location_parts)

            results.append({
                "title": title,
                "date": date_str,
                "location": location,
                "text": text,
                "url": article_url
            })
            seen_urls.add(article_url)

        page += 1
        time.sleep(3)

    combined_results = existing + results
    with open(file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Date", "Location", "Text", "URL"])
        for r in combined_results:
            writer.writerow([r["title"], r["date"], r["location"], r["text"], r["url"]])

    print(f"\nScraping complete. Found {len(results)} new articles.")
