In [1]:
import time
import csv
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

def get_driver():
    """Build a headless Chrome WebDriver with a 1920x1080 window.

    The chromedriver binary path comes from ``ChromeDriverManager().install()``.

    Returns:
        A new ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    chrome_options = Options()
    for flag in ("--headless=new", "--window-size=1920,1080"):
        chrome_options.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)


In [3]:
from rapidfuzz import fuzz

def normalize_name(name):
    """Lowercase ``name`` and collapse every run of whitespace to one space.

    Leading and trailing whitespace is removed as a side effect of splitting.
    """
    tokens = name.lower().split()
    return " ".join(tokens)

def is_fuzzy_match(name_from_site, target_name, threshold=92):
    """Score two names with several rapidfuzz scorers and compare to a threshold.

    Both names are passed through ``normalize_name`` before scoring, and the
    highest of the three scores decides the outcome.

    Args:
        name_from_site: Name as scraped from the site.
        target_name: Name being searched for.
        threshold: Minimum best score that counts as a match (default 92).

    Returns:
        Tuple ``(matched, best_score)`` where ``matched`` is True when
        ``best_score >= threshold``.
    """
    site_name = normalize_name(name_from_site)
    wanted_name = normalize_name(target_name)

    # NOTE(review): because the maximum is taken, a single lenient scorer is
    # enough to pass the threshold on its own — confirm that is intended.
    scorers = (
        fuzz.token_sort_ratio,  # Handles middle names well
        fuzz.token_set_ratio,   # Handles extra/missing tokens
        fuzz.partial_ratio,     # Handles substring cases
    )
    best_score = max(scorer(site_name, wanted_name) for scorer in scorers)

    return best_score >= threshold, best_score


In [4]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def click_next_page(driver, wait):
    """
    Attempts to click the 'Next' button in the Angular Material paginator.

    Args:
        driver: Selenium WebDriver currently showing a paginated listing.
        wait: WebDriverWait bound to ``driver``.

    Returns:
        True if the button was clicked and ``mat-card`` elements are present
        afterwards; False if there is no usable 'Next' button (missing or
        disabled) or if any error occurred along the way.
    """
    try:
        # Find the 'Next' button
        buttons = driver.find_elements(By.CSS_SELECTOR, "button.mat-paginator-navigation-next")

        if not buttons:
            return False  # No next button found

        next_button = buttons[0]

        # Check if disabled (last page). get_attribute() returns None when the
        # attribute is absent, so fall back to an empty string before the
        # substring test. is_enabled() additionally covers a button that is
        # disabled without carrying the legacy CSS class; a JS click on such a
        # button does nothing, and reporting success would make the caller
        # rescan the same page indefinitely.
        css_classes = next_button.get_attribute("class") or ""
        if "mat-button-disabled" in css_classes or not next_button.is_enabled():
            return False

        # Scroll into view and click (via JS, as elsewhere in this notebook)
        driver.execute_script("arguments[0].scrollIntoView();", next_button)
        driver.execute_script("arguments[0].click();", next_button)

        # Wait for cards to be present, then give the page a fixed pause to
        # finish rendering the new results.
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "mat-card")))
        time.sleep(2)
        return True

    except Exception as e:
        # Best-effort pagination: any failure is reported and treated as
        # "no more pages" so the caller can finish with what it has.
        print("Pagination error:", e)
        return False


In [5]:
# Site root; collect_officer_urls_selenium builds the browse URL from it.
BASE = "https://www.odmp.org"
# State path segment that run() passes to collect_officer_urls_selenium.
STATE = "california"

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def collect_officer_urls_selenium(state):
    """Collect officer profile URLs by clicking through the browse listing.

    Opens ``{BASE}/search/browse/{state}`` in a fresh driver from
    ``get_driver()``, clicks every ``mat-card`` on each listing page, records
    the URL the browser lands on, and follows the paginator through
    ``click_next_page`` until it reports that there is no further page.

    Args:
        state: Path segment appended to the browse URL.

    Returns:
        Sorted list of the unique profile URLs that were visited.

    Raises:
        Whatever ``wait.until`` raises when no ``mat-card`` shows up on a
        listing page within the 10 second wait. The driver is shut down in
        that case as well.
    """
    driver = get_driver()

    # try/finally guarantees the browser process is released even when the
    # page load or the un-guarded wait below raises.
    try:
        wait = WebDriverWait(driver, 10)

        start_url = f"{BASE}/search/browse/{state}"
        driver.get(start_url)

        officer_links = set()
        page_number = 1

        while True:
            print(f"Scanning browse page {page_number}...")

            # Wait for cards to load
            wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "mat-card")))

            card_count = len(driver.find_elements(By.CSS_SELECTOR, "mat-card"))

            for i in range(card_count):
                try:
                    # Re-fetch on every iteration: element handles from before
                    # the previous click/back round trip are not reused.
                    cards = driver.find_elements(By.CSS_SELECTOR, "mat-card")
                    card = cards[i]

                    driver.execute_script("arguments[0].scrollIntoView();", card)
                    time.sleep(1)

                    driver.execute_script("arguments[0].click();", card)

                    # Wait for profile page to load (URL contains /officer/)
                    wait.until(EC.url_contains("/officer/"))
                    officer_links.add(driver.current_url)
                    print("  Found:", driver.current_url)

                    driver.back()
                    wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "mat-card")))
                    time.sleep(2)

                except Exception as e:
                    # Best-effort per card: report and move on to the next one.
                    print("  Skipping card due to error:", e)

            # Try to go to next page using Angular Material paginator
            if click_next_page(driver, wait):
                page_number += 1
            else:
                break

        return sorted(officer_links)
    finally:
        driver.quit()



In [6]:
def scrape_officer_profile(driver, url):
    """Load one officer profile page and extract its fields into a dict.

    Args:
        driver: Selenium WebDriver used to load ``url``.
        url: Profile page address; stored verbatim under ``"source_url"``.

    Returns:
        Dict with the keys, in this order: ``source_url``, ``name``, ``bio``,
        ``age``, ``tour``, ``badge``, ``cause``, ``end_of_watch``,
        ``incident_details``. A field that cannot be located is ``None``.

    Raises:
        Whatever ``wait.until`` raises when no ``h1`` appears within 10 seconds.
    """
    # Local import: the notebook's import cell only brings in BeautifulSoup.
    from bs4 import NavigableString

    def get_value_after_strong(soup, label):
        """Return the stripped text node that directly follows a <strong> whose
        text contains ``label`` (case-insensitive), or None if there is none."""
        strong_tags = soup.find_all("strong", string=lambda s: s and label.lower() in s.lower())
        for tag in strong_tags:
            next_node = tag.next_sibling
            if isinstance(next_node, NavigableString):
                value = next_node.strip()
                if value:
                    return value
        return None

    wait = WebDriverWait(driver, 10)
    driver.get(url)

    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "h1")))
    soup = BeautifulSoup(driver.page_source, "lxml")

    # Insertion order matters: run() uses these keys as the CSV header.
    data = {"source_url": url}

    h1 = soup.find("h1")
    data["name"] = h1.get_text(strip=True) if h1 else None

    # Bio (main memorial narrative): the first paragraph longer than 200
    # characters wins; otherwise fall back to a dedicated bio container.
    bio_section = soup.find("div", class_="bio") or soup.find("section", id="bio")
    long_paragraphs = [
        p.get_text(" ", strip=True)
        for p in soup.find_all("p")
        if len(p.get_text(strip=True)) > 200
    ]
    if long_paragraphs:
        data["bio"] = long_paragraphs[0]
    elif bio_section:
        data["bio"] = bio_section.get_text(" ", strip=True)
    else:
        data["bio"] = None

    data["age"] = get_value_after_strong(soup, "Age:")
    data["tour"] = get_value_after_strong(soup, "Tour:")
    data["badge"] = get_value_after_strong(soup, "Badge:")
    data["cause"] = get_value_after_strong(soup, "Cause:")

    # First try a text node that holds both the label and the value. If that
    # node is only the label, stripping it leaves an empty string, so fall
    # back to the label/value layout used by the fields above.
    eow_node = soup.find(string=lambda s: s and "End of Watch:" in s)
    end_of_watch = eow_node.replace("End of Watch:", "").strip() if eow_node else None
    if not end_of_watch:
        end_of_watch = get_value_after_strong(soup, "End of Watch:")
    data["end_of_watch"] = end_of_watch

    # Incident details: every <p> sibling after the matching <h2>, up to the
    # next <h2>.
    incident_header = soup.find("h2", string=lambda s: s and "Incident Details" in s)
    incident_text = []

    if incident_header:
        for sib in incident_header.find_next_siblings():
            if sib.name == "h2":
                break
            if sib.name == "p":
                incident_text.append(sib.get_text(" ", strip=True))

    data["incident_details"] = " ".join(incident_text) if incident_text else None

    print("Scraped:", data["name"])
    return data

In [7]:
def run(target_names=None, output_file=None):
    """Collect profile URLs, scrape each profile, and save the fuzzy matches.

    Every scraped officer name is compared against the target names with
    ``is_fuzzy_match``; the first target that matches is recorded and the
    officer is kept. Matches are written to a CSV file.

    Args:
        target_names: Names to look for. Defaults to the module-level
            ``TARGET_NAMES`` when omitted.
        output_file: CSV path to write. Defaults to the module-level
            ``OUTPUT_FILE`` when omitted.

    Raises:
        NameError: Immediately, if an argument is omitted and the matching
            module-level name has not been defined.
    """
    # Resolve configuration before any scraping starts. Looked up lazily, a
    # missing TARGET_NAMES would be swallowed by the per-URL handler below and
    # reported as a scrape error for every profile.
    names = TARGET_NAMES if target_names is None else target_names
    out_path = OUTPUT_FILE if output_file is None else output_file

    print("\nCollecting officer profile URLs...\n")
    officer_urls = collect_officer_urls_selenium(STATE)
    print(f"\nCollected {len(officer_urls)} officer URLs\n")

    driver = get_driver()
    results = []

    # try/finally so the browser is released even if the loop is interrupted.
    try:
        for url in officer_urls:
            try:
                officer = scrape_officer_profile(driver, url)

                if officer["name"]:
                    for target in names:
                        match, score = is_fuzzy_match(officer["name"], target)

                        if match:
                            officer["matched_input_name"] = target
                            officer["fuzzy_score"] = score
                            results.append(officer)
                            print(f"✔ Fuzzy Match ({score}): {officer['name']}  <--  {target}")
                            break  # keep only the first matching target

                time.sleep(1)  # fixed pause between profile requests
            except Exception as e:
                # Best-effort per profile: report and continue with the next URL.
                print("Error scraping:", url, e)
    finally:
        driver.quit()

    if results:
        # Every result dict is built the same way, so the first one's keys
        # serve as the header for all rows.
        keys = results[0].keys()
        with open(out_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            writer.writeheader()
            writer.writerows(results)

        print(f"\nSaved {len(results)} matched officers to {out_path}")
    else:
        print("\nNo matches found.")

In [8]:
def load_names_from_file(filepath):
    """Read one name per line from a UTF-8 text file.

    Surrounding whitespace is stripped from each line and lines that are empty
    after stripping are skipped.

    Args:
        filepath: Path of the text file to read.

    Returns:
        List of non-empty, stripped lines in file order.
    """
    # "utf-8-sig" reads plain UTF-8 as well, but drops a leading byte-order
    # mark. With plain "utf-8" the BOM stays glued to the first name, because
    # str.strip() does not treat U+FEFF as whitespace.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        stripped_lines = (line.strip() for line in f)
        return [name for name in stripped_lines if name]

In [11]:
# Entry cell: defines the module-level names that run() reads, then starts the scrape.
if __name__ == "__main__":
    # NOTE(review): absolute, machine-specific path — consider making it configurable.
    NAMES_FILE = "/Users/d/Capstone/california/cleaned_names.txt"
    TARGET_NAMES = load_names_from_file(NAMES_FILE)
    
    # CSV file that run() writes when at least one officer matches.
    OUTPUT_FILE = "odmp_ca_officers.csv"
    run()


Collecting officer profile URLs...

Scanning browse page 1...
  Found: https://www.odmp.org/officer/27573-police-officer-alec-sanders
  Found: https://www.odmp.org/officer/27557-deputy-sheriff-andrew-nuez
  Found: https://www.odmp.org/officer/27553-police-officer-lauren-craven
  Found: https://www.odmp.org/officer/27528-police-officer-ray-barrantes
  Found: https://www.odmp.org/officer/27445-detective-victor-lemus
  Found: https://www.odmp.org/officer/27446-detective-william-osborn
  Found: https://www.odmp.org/officer/27444-detective-joshua-kelley-eklund
  Found: https://www.odmp.org/officer/27443-parole-agent-joshua-lemont-byrd
  Found: https://www.odmp.org/officer/27439-officer-miguel-cano
  Found: https://www.odmp.org/officer/27419-sergeant-shiou-deng
  Found: https://www.odmp.org/officer/27406-police-officer-samuel-riveros
  Found: https://www.odmp.org/officer/27367-police-officer-osmar-rodarte
  Found: https://www.odmp.org/officer/27354-deputy-sheriff-hector-cuevas-jr
  Found: h