# Slain police officers in the US

> This notebook scrapes and processes an unofficial directory of more than 25,000 line-of-duty deaths among American police officers since 1900 from the [Officer Down Memorial Page](https://www.odmp.org/). 

---

## Config

#### Python tools

In [1]:
import re
import requests
import numpy as np
import pandas as pd
import jupyter_black
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

#### Jupyter options

In [2]:
# Activate the jupyter_black extension for this notebook session
jupyter_black.load()

# Raise pandas' display limits so wide frames, long frames and long cell
# values (max_colwidth=None means no truncation) are shown in full
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

#### Dates

In [3]:
# Take a single snapshot of "now" and derive both the date stamp and the
# current year from it
now = pd.Timestamp("today")
today = now.strftime("%Y-%m-%d")

# Inclusive range of years to request, from start_year through the current year
start_year = 1900
current_year = now.year
years = list(range(start_year, current_year + 1))

#### Headers

In [4]:
# HTTP request headers passed to requests.get() for every year page:
# accept any content type and send a desktop Chrome user-agent string.
headers = {
    "accept": "*/*",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
}

---

## Fetch

#### Loop through each year, extracting officer details into a list of dictionaries

In [5]:
# Scrape one search-results page per year and collect one dictionary per
# officer listing into `officer_data`.

# Seconds to wait on a single request before giving up
request_timeout = 30


def _detail_text(paragraphs, index, prefix=None):
    """Return the text of ``paragraphs[index]``, or None when it is absent.

    If ``prefix`` is given, every occurrence of it is removed from the text
    (used to drop the "EOW: " and "Cause: " labels).
    """
    if index >= len(paragraphs):
        return None
    text = paragraphs[index].text
    return text.replace(prefix, "") if prefix else text


officer_data = []

# Reuse one HTTP session so the connection can be kept alive across the
# per-year requests instead of being re-established on every iteration.
with requests.Session() as session:
    for year in tqdm(years):
        url = f"https://www.odmp.org/search/year/{year}"

        # The timeout keeps one stalled request from hanging the whole scrape.
        response = session.get(url, headers=headers, timeout=request_timeout)
        # Fail loudly on an HTTP error: an error page would otherwise parse to
        # zero articles and the year would silently go missing from the data.
        response.raise_for_status()
        html_content = BeautifulSoup(response.text, "html.parser")

        articles = html_content.find_all("article", class_="officer-profile-condensed")

        for article in articles:
            # Get the officer's profile URL (None if the listing has no link)
            link = article.find("a")
            officer_page_url = link.get("href") if link is not None else None

            # Get the officer's photo URL (None if the listing has no image)
            image = article.find("img")
            officer_image_url = image.get("src") if image is not None else None

            # Listings without a details container are skipped
            details = article.find("div", class_="officer-short-details")
            if details is None:
                continue

            # The detail paragraphs are read by position:
            # name, agency, end-of-watch date, cause
            detail_text = details.find_all("p")

            # Store the data
            officer_data.append(
                {
                    "name": _detail_text(detail_text, 0),
                    "url": officer_page_url,
                    "photo_url": officer_image_url,
                    "agency": _detail_text(detail_text, 1),
                    "eow": _detail_text(detail_text, 2, prefix="EOW: "),
                    "cause": _detail_text(detail_text, 3, prefix="Cause: "),
                }
            )

  0%|          | 0/125 [00:00<?, ?it/s]

#### Convert list to a Pandas DataFrame

In [6]:
# One row per scraped officer dictionary; the dictionary keys become the columns
df = pd.DataFrame(officer_data)

#### How many?

In [7]:
# Display the number of scraped records (rows in df)
len(df)

25015

---

## Process

#### Split the department name and location

In [8]:
# Split "agency" once, from the right, on the last ", " so that everything
# before it is the department name and the trailing piece is the state
agency_parts = df["agency"].str.rsplit(", ", n=1, expand=True)
df[["department_name", "state_abbreviation"]] = agency_parts

#### Process the end-of-watch dates

In [9]:
# Drop any remaining "EOW: " label, then parse the text into datetimes
eow_text = df["eow"].str.replace("EOW: ", "")
eow_parsed = pd.to_datetime(eow_text)

# Store the parsed date plus its year and weekday name
df["eow_date"] = eow_parsed
df["eow_year"] = eow_parsed.dt.year
df["eow_weekday"] = eow_parsed.dt.day_name()

#### Clean up stray characters

In [10]:
# Trim leading/trailing whitespace from the free-text columns
for column in ("name", "department_name"):
    df[column] = df[column].str.strip()

In [11]:
# Flag records whose name contains "K9" (taken here to indicate a police dog).
# str.contains treats the pattern as a regular expression by default; "K9" has
# no special characters, so it matches literally. Rows with a missing name
# produce NaN rather than False.
df["police_dog"] = df["name"].str.contains("K9")

#### Read sample officer titles list to help split names/titles

In [12]:
# Load the officer titles, one per line, with surrounding whitespace removed.
# The encoding is stated explicitly so the result does not depend on the
# platform default. Blank lines are skipped: an empty entry would become an
# empty alternative in a regex alternation built from this list, and an empty
# alternative matches everywhere.
with open("titles.txt", "r", encoding="utf-8") as file:
    titles = [line.strip() for line in file if line.strip()]

#### Split the names/titles

In [13]:
# Create a regex pattern to match the titles
pattern = r"\b(" + "|".join(titles) + r")\b"

# Extract the title using the pattern
df["title"] = df["name"].str.extract(pattern)

# Replace the title in the name with an empty string and strip any leading/trailing spaces
df["officer_name"] = df["name"].str.replace(pattern, "", regex=True).str.strip()

#### Keep the columns we want, in the order we want

In [14]:
# Columns retained in the exported dataset, listed in their output order
keep = [
    "officer_name",
    "title",
    "department_name",
    "state_abbreviation",
    "cause",
    "eow_date",
    "eow_year",
    "eow_weekday",
    "police_dog",
    "url",
    "photo_url",
]

In [15]:
# Discard the raw scraped columns, then take an independent copy of the
# remaining columns in the order given by `keep`
without_raw = df.drop(columns=["name", "eow", "agency"])
officers_df = without_raw[keep].copy()

---

## Export

#### CSV

In [16]:
# Write the processed table to CSV without the DataFrame index.
# NOTE(review): the "1900_2024" suffix is fixed text, while the scraped year
# range is derived from today's date — confirm whether the name should track it.
csv_path = "data/processed/us_slain_police_officers_1900_2024.csv"
officers_df.to_csv(csv_path, index=False)

#### JSON

In [17]:
# Write the processed table to JSON as a list of row objects, indented 4 spaces.
# NOTE(review): the "1900_2024" suffix is fixed text, while the scraped year
# range is derived from today's date — confirm whether the name should track it.
json_path = "data/processed/us_slain_police_officers_1900_2024.json"
officers_df.to_json(json_path, orient="records", indent=4)