# Report Part 2: Coverage Indicators by State


This report looks at the extent to which each state is covered by NIBRS reporting. 

This report uses the "Universe" file, which lists all agencies for each state and whether or not each one is listed as NIBRS, to identify agencies that should be reporting. In addition, it uses the Summary Reporting System (SRS) crime counts to compare with the incidents reported for each agency.

Note that in all cases, we are only looking at eligible agencies. Agency eligibility is identified using the Missing Months report data. An eligible agency is one that has been identified as active, not covered by a different agency, and not dormant.

In [None]:
from datetime import datetime
import os
# Report header: author, the data year being processed, and the run date.
print("Author: Automated Pipeline")

# DATA_YEAR selects which year's artifacts and database rows the report uses.
# Check it explicitly so a missing variable produces an actionable message
# instead of the opaque TypeError raised by int(None).
data_year_env = os.getenv('DATA_YEAR')
if data_year_env is None:
    raise RuntimeError(
        "The DATA_YEAR environment variable must be set to the report year (e.g. 2020)."
    )
year = int(data_year_env)
print("Generating reports for year:",year)
print("Report date:", datetime.now().strftime("%m/%d/%y"))

In [None]:
from utils import *
from dictionaries import *
from pathlib import Path
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from IPython.display import display, Markdown
from datetime import datetime as dt

# Locate the pipeline folders from the environment; QC outputs are written
# under QC_output_files (created on demand) and inputs are read from
# initial_tasks_output.
output_folder = Path(os.getenv("OUTPUT_PIPELINE_DIR"))

years = get_available_years()
output_dir = output_folder / "QC_output_files"
output_dir.mkdir(parents=True, exist_ok=True)
input_dir = output_folder / "initial_tasks_output"

engine_database = connect_to_database()

# Inverse lookup of us_state_abbrev (its values are the state names used
# in the "state_name" columns below).
state_name_to_abbrev = dict(zip(us_state_abbrev.values(), us_state_abbrev.keys()))

print("--------------------------------")
print(" Loading datasets, please wait. ")

artifacts_dir = output_folder / "artifacts"
reta_frame = pd.read_csv(artifacts_dir / f"missing_months_{year}.csv")
universe_frame = pd.read_csv(input_dir / f"ref_agency_{year}.csv")

# NOTE: this should be the srs frame merged with reta-mm
srs_frame = pd.read_csv("../compute_weights/Data/srs2016_2020_smoothed.csv")

# Agencies flagged is_nibrs for the report year, with their NIBRS start date.
sql_query = f"""
 SELECT DISTINCT 
    ref_agency.ori,
    ref_agency.nibrs_start_date,
    ref_agency_status.data_year
   FROM ucr_prd.ref_agency_yearly ref_agency_yearly
     LEFT JOIN ucr_prd.ref_agency USING (agency_id)
     LEFT JOIN ucr_prd.ref_agency_status USING (agency_id, data_year)
  WHERE ref_agency_status.data_year IS NOT NULL AND ref_agency_yearly.is_nibrs IS TRUE
    AND ref_agency_status.data_year = {year}
"""
nibrs_frame = pd.read_sql(sql_query, engine_database)

# Keep only agencies whose NIBRS start date falls before January 1st of the
# following year (rows without a parsable start date compare False and drop out).
next_year = dt(year + 1, 1, 1)
started_before_next_year = pd.to_datetime(nibrs_frame["nibrs_start_date"]) < next_year
nibrs_frame = nibrs_frame[started_before_next_year]

# Normalise every column name to lower case; the SRS file's ORI column is
# additionally renamed so all frames share the key "ori".
srs_frame = srs_frame.rename(columns=str.lower).rename(columns={"ori_universe": "ori"})
universe_frame = universe_frame.rename(columns=str.lower)
reta_frame = reta_frame.rename(columns=str.lower)

# Get the total crime counts for srs
srs_frame = srs_frame.assign(total_crime=srs_frame[["totcrime"]].sum(axis=1))

# Subset the universe by eligible agencies according to reta-mm
eligible_agencies = get_elegible_agency_list(reta_frame)

universe_frame = universe_frame.astype({"population": int})

nibrs_frame_el = nibrs_frame[nibrs_frame["ori"].isin(eligible_agencies)]
srs_frame_el = srs_frame[srs_frame["ori"].isin(eligible_agencies)]

# Eligible universe agencies, with territories removed (only rows whose
# state_name appears among the us_state_abbrev values are kept).
universe_is_eligible = universe_frame["ori"].isin(eligible_agencies)
universe_is_state = universe_frame["state_name"].isin(us_state_abbrev.values())
universe_frame_el = universe_frame[universe_is_eligible & universe_is_state]

print("              done              ")
print("--------------------------------")

## Part 2A Proportion of agencies in the Universe file which were indicated to be NIBRS

Note: states in these charts are suppressed if there were 20 or fewer eligible agencies in the Universe file.

In [None]:
import plotly.graph_objects as go
import pandas as pd


# Part 2A: for each state, the share of eligible Universe agencies whose
# reporting_type is "I", measured three ways: agency count, SRS crimes, and
# population. Results are written to CSV and drawn as three side-by-side maps.

# A state's portions are suppressed (left missing) when it has this many
# eligible agencies or fewer.
SUPPRESSION_THRESHOLD = 20

# Three-stop colour ramp shared by every map (0 -> 0.5 -> 1).
COVERAGE_COLORSCALE = [
    [0, "rgba(239,138,98, 0.85)"],
    [0.5, "rgba(230, 230, 0, 0.85)"],
    [1, "rgba(103,169,207, 0.85)"],
]


def _coverage_portion(part, whole, suppressed):
    """Return part / whole rounded to 4 places.

    The result is NaN where `suppressed` is True or where `whole` is zero, so
    a state with no recorded crimes/population cannot abort the report with a
    ZeroDivisionError.
    """
    ratio = part / whole.where(whole != 0)
    return ratio.mask(suppressed).round(4)


def _srs_crimes_by_state(oris):
    """Sum the SRS total_crime per state_name over the given ORIs."""
    selected = srs_frame_el.loc[srs_frame_el["ori"].isin(oris)]
    return selected.groupby("state_name")["total_crime"].sum()


uni_typei = universe_frame_el.loc[universe_frame_el["reporting_type"] == "I"]

all_counts = universe_frame_el.groupby("state_name")["ori"].count().to_dict()
typei_counts = uni_typei.groupby("state_name")["ori"].count().to_dict()

all_population = universe_frame_el.groupby("state_name")["population"].sum().to_dict()
typei_population = uni_typei.groupby("state_name")["population"].sum().to_dict()

# States with no type I agencies are absent from the type I dicts; they
# become 0 after fillna.
count_frame = pd.DataFrame({"All Agencies": all_counts,
                            "Type I Agencies": typei_counts,
                            "Total Population": all_population,
                            "Type I Total Population": typei_population}
                          ).fillna(0).astype(int)
count_frame["state"] = [state_name_to_abbrev[name] for name in count_frame.index]

suppressed = count_frame["All Agencies"] <= SUPPRESSION_THRESHOLD

count_frame["Portion type I Agencies"] = _coverage_portion(
    count_frame["Type I Agencies"], count_frame["All Agencies"], suppressed)

# Get the SRS crime counts for type I agencies versus all agencies. The
# assignment aligns on state_name, so states missing from SRS are NaN until
# filled below.
count_frame["Type I Total Crimes"] = _srs_crimes_by_state(uni_typei["ori"].tolist())
count_frame["Total Crimes"] = _srs_crimes_by_state(universe_frame_el["ori"].tolist())
crime_columns = ["Type I Total Crimes", "Total Crimes"]
count_frame[crime_columns] = count_frame[crime_columns].fillna(0).astype(int)

count_frame["Portion Type I Crimes"] = _coverage_portion(
    count_frame["Type I Total Crimes"], count_frame["Total Crimes"], suppressed)

count_frame["Portion Type I Population"] = _coverage_portion(
    count_frame["Type I Total Population"], count_frame["Total Population"], suppressed)

coverage_path = output_dir / f"type_I_universe_coverage_{year}.csv"
count_frame.to_csv(coverage_path)

df = count_frame

# One entry per map, left to right:
# (geo key, trace name, portion column, hover label, numerator column, denominator column)
map_specs = [
    ("geo", "Agency Counts", "Portion type I Agencies",
     "Agencies type I: ", "Type I Agencies", "All Agencies"),
    ("geo2", "Crime Counts", "Portion Type I Crimes",
     "Crimes from Type I: ", "Type I Total Crimes", "Total Crimes"),
    ("geo3", "Population Counts", "Portion Type I Population",
     "Population from Type I: ", "Type I Total Population", "Total Population"),
]

data = []
layout = dict(
    title = 'Type I (NIBRS) Coverage of the Universe of Agencies: Agencies (L), Crimes (M), and Population (R)'
)

map_count = len(map_specs)
for position, (geo_key, trace_name, portion_col, label, part_col, whole_col) in enumerate(map_specs):
    data.append(
        dict(
            type = 'choropleth',
            geo = geo_key,
            name = trace_name,
            locations=df["state"],  # Spatial coordinates
            z=df[portion_col].astype(float),  # Data to be color-coded
            locationmode="USA-states",  # set of locations match entries in `locations`
            text=label + df[part_col].astype(str) + "/" + df[whole_col].astype(str),
            colorscale=[list(stop) for stop in COVERAGE_COLORSCALE],
            zmin=0,
            zmax=1
        )
    )
    # Each map takes an equal horizontal slice of the figure and its full height.
    layout[geo_key] = dict(
        scope = 'usa',
        showland = True,
        showcountries = False,
        domain = dict(
            x = [position / map_count, (position + 1) / map_count],
            y = [0.0, 1.0],
        ),
    )

print("On the left: Number of agencies that were in the Universe file as type I (NIBRS)"
      " / Number of agencies that were in the Universe File")

print("\nIn the middle: Number of SRS Crimes associated with agencies that were in the Universe file as type I"
      " / Number of SRS Crimes associated with agencies that were in the Universe File")

print("\nOn the right: Total population that was covered by agencies that were in the Universe file as type I"
      " / Total population that was covered by agencies that were in the Universe File")

fig = go.Figure(data=data, layout=layout)
fig.show()

print("Coverage outputted as file to", f"{coverage_path}\n")

## Part 2B Proportion of agencies that are in NIBRS database over agencies in the universe file 

Note: states in these charts are suppressed if there were 20 or fewer eligible agencies in the Universe file.

In [None]:
import plotly.graph_objects as go
import pandas as pd

# Part 2B: for each state, the share of eligible Universe agencies that are
# also present in the NIBRS database, measured three ways: agency count, SRS
# crimes, and population. Results are written to CSV and drawn as three maps.

# A state's portions are suppressed (left missing) when it has this many
# eligible agencies or fewer.
SUPPRESSION_THRESHOLD = 20

# Three-stop colour ramp shared by every map (0 -> 0.5 -> 1).
COVERAGE_COLORSCALE = [
    [0, "rgba(239,138,98, 0.85)"],
    [0.5, "rgba(230, 230, 0, 0.85)"],
    [1, "rgba(103,169,207, 0.85)"],
]


def _coverage_portion(part, whole, suppressed):
    """Return part / whole rounded to 4 places.

    The result is NaN where `suppressed` is True or where `whole` is zero, so
    a state with no recorded crimes/population cannot abort the report with a
    ZeroDivisionError.
    """
    ratio = part / whole.where(whole != 0)
    return ratio.mask(suppressed).round(4)


def _srs_crimes_by_state(oris):
    """Sum the SRS total_crime per state_name over the given ORIs."""
    selected = srs_frame_el.loc[srs_frame_el["ori"].isin(oris)]
    return selected.groupby("state_name")["total_crime"].sum()


all_counts = universe_frame_el.groupby("state_name")["ori"].count().to_dict()

# get the oris that are in the universe, and the subset which are also in NIBRS
nibrs_oris = nibrs_frame_el["ori"].tolist()
universe_in_nibrs = universe_frame_el.loc[universe_frame_el["ori"].isin(nibrs_oris)]
nibrs_counts = universe_in_nibrs.groupby("state_name")["ori"].count().to_dict()

all_population = universe_frame_el.groupby("state_name")["population"].sum().to_dict()
nibrs_population = universe_in_nibrs.groupby("state_name")["population"].sum().to_dict()

# States with no NIBRS agencies are absent from the NIBRS dicts; they become
# 0 after fillna.
count_frame2 = pd.DataFrame({"All Agencies": all_counts,
                             "Agencies in NIBRS": nibrs_counts,
                             "Total Population": all_population,
                             "In NIBRS Total Population": nibrs_population
                            }).fillna(0).astype(int)
count_frame2["state"] = [state_name_to_abbrev[name] for name in count_frame2.index]

suppressed = count_frame2["All Agencies"] <= SUPPRESSION_THRESHOLD

count_frame2["Portion Agencies in NIBRS"] = _coverage_portion(
    count_frame2["Agencies in NIBRS"], count_frame2["All Agencies"], suppressed)

# Get the SRS crime counts for Universe agencies found in NIBRS versus all
# Universe agencies. The numerator is limited to ORIs that are in BOTH the
# Universe file and NIBRS, so it is always a subset of the denominator and
# matches the agency/population numerators and the printed definition.
count_frame2["In NIBRS Total Crimes"] = _srs_crimes_by_state(universe_in_nibrs["ori"].tolist())
count_frame2["Total Crimes"] = _srs_crimes_by_state(universe_frame_el["ori"].tolist())
crime_columns = ["In NIBRS Total Crimes", "Total Crimes"]
count_frame2[crime_columns] = count_frame2[crime_columns].fillna(0).astype(int)

count_frame2["Portion in NIBRS Crimes"] = _coverage_portion(
    count_frame2["In NIBRS Total Crimes"], count_frame2["Total Crimes"], suppressed)

count_frame2["Portion in NIBRS Population"] = _coverage_portion(
    count_frame2["In NIBRS Total Population"], count_frame2["Total Population"], suppressed)

coverage_path = output_dir / f"universe_in_NIBRS_coverage_{year}.csv"
count_frame2.to_csv(coverage_path)
df = count_frame2

# One entry per map, left to right:
# (geo key, trace name, portion column, hover label, numerator column, denominator column)
map_specs = [
    ("geo", "Agency Counts", "Portion Agencies in NIBRS",
     "NIBRS Agencies: ", "Agencies in NIBRS", "All Agencies"),
    ("geo2", "Crime Counts", "Portion in NIBRS Crimes",
     "NIBRS Crimes: ", "In NIBRS Total Crimes", "Total Crimes"),
    ("geo3", "Population", "Portion in NIBRS Population",
     "NIBRS Population: ", "In NIBRS Total Population", "Total Population"),
]

data = []
layout = dict(
    title = 'NIBRS Coverage of the Universe of Agencies: Agencies (L), Crimes (M), and Population (R)'
)

map_count = len(map_specs)
for position, (geo_key, trace_name, portion_col, label, part_col, whole_col) in enumerate(map_specs):
    data.append(
        dict(
            type = 'choropleth',
            geo = geo_key,
            name = trace_name,
            locations=df["state"],  # Spatial coordinates
            z=df[portion_col].astype(float),  # Data to be color-coded
            locationmode="USA-states",  # set of locations match entries in `locations`
            text=label + df[part_col].astype(str) + "/" + df[whole_col].astype(str),
            colorscale=[list(stop) for stop in COVERAGE_COLORSCALE],
            zmin=0,
            zmax=1
        )
    )
    # Each map takes an equal horizontal slice of the figure and its full height.
    layout[geo_key] = dict(
        scope = 'usa',
        showland = True,
        showcountries = False,
        domain = dict(
            x = [position / map_count, (position + 1) / map_count],
            y = [0.0, 1.0],
        ),
    )

print("On the left: Number of agencies that were in the Universe file and the NIBRS database"
      " / Number of agencies that were in the Universe File")

print("\nIn the middle: Number of SRS Crimes associated with agencies that were in the Universe file and the NIBRS database"
      " / Number of SRS Crimes associated with agencies that were in the Universe File")

print("\nOn the right: Size of population associated with agencies that were in the Universe file and the NIBRS database"
      " / Size of population associated with agencies that were in the Universe File")

fig = go.Figure(data=data, layout=layout)
fig.show()

print("Coverage outputted as file to", f"{coverage_path}\n")

## Part 2C Portion of agencies with reporting type='I' in the Universe Data File which were also found in the NIBRS database

Note: states in these charts are suppressed if there were 20 or fewer type I eligible agencies in the Universe file.

In [None]:
import plotly.graph_objects as go
import pandas as pd

# Part 2C: for each state, the share of eligible Universe agencies with
# reporting_type "I" that are also present in the NIBRS database, measured
# three ways: agency count, SRS crimes, and population. Results are written
# to CSV and drawn as three side-by-side maps.

# A state's portions are suppressed (left missing) when it has this many
# type I eligible agencies or fewer.
SUPPRESSION_THRESHOLD = 20

# Three-stop colour ramp shared by every map (0 -> 0.5 -> 1).
COVERAGE_COLORSCALE = [
    [0, "rgba(239,138,98, 0.85)"],
    [0.5, "rgba(230, 230, 0, 0.85)"],
    [1, "rgba(103,169,207, 0.85)"],
]


def _coverage_portion(part, whole, suppressed):
    """Return part / whole rounded to 4 places.

    The result is NaN where `suppressed` is True or where `whole` is zero, so
    a state with no recorded crimes/population cannot abort the report with a
    ZeroDivisionError.
    """
    ratio = part / whole.where(whole != 0)
    return ratio.mask(suppressed).round(4)


def _srs_crimes_by_state(oris):
    """Sum the SRS total_crime per state_name over the given ORIs."""
    selected = srs_frame_el.loc[srs_frame_el["ori"].isin(oris)]
    return selected.groupby("state_name")["total_crime"].sum()


typei_frame = universe_frame_el.loc[universe_frame_el["reporting_type"] == "I"]
typei_in_nibrs = typei_frame.loc[typei_frame["ori"].isin(nibrs_frame_el["ori"].tolist())]

# get the oris that are in the universe of type I, and the subset which are in NIBRS
typei_counts = typei_frame.groupby("state_name")["ori"].count().to_dict()
nibrs_counts = typei_in_nibrs.groupby("state_name")["ori"].count().to_dict()

typei_population = typei_frame.groupby("state_name")["population"].sum().to_dict()
nibrs_population = typei_in_nibrs.groupby("state_name")["population"].sum().to_dict()

# States with no type I agencies in NIBRS are absent from the NIBRS dicts;
# they become 0 after fillna.
count_frame3 = pd.DataFrame({"Type I Agencies": typei_counts,
                             "Type I Agencies in NIBRS": nibrs_counts,
                             "Total Type I Population": typei_population,
                             "Type I in NIBRS Total Population": nibrs_population
                            }).fillna(0).astype(int)

count_frame3["state"] = [state_name_to_abbrev[name] for name in count_frame3.index]

suppressed = count_frame3["Type I Agencies"] <= SUPPRESSION_THRESHOLD

count_frame3["Portion Type I Agencies in NIBRS"] = _coverage_portion(
    count_frame3["Type I Agencies in NIBRS"], count_frame3["Type I Agencies"], suppressed)

# Get the SRS crime counts for all type I agencies versus type I agencies
# found in NIBRS. The assignment aligns on state_name, so states missing from
# SRS only appear as NaN AFTER assignment; both columns must therefore be
# filled afterwards (filling the groupby result beforehand has no effect).
count_frame3["Total Type I Crimes"] = _srs_crimes_by_state(typei_frame["ori"].tolist())
count_frame3["Type I in NIBRS Total Crimes"] = _srs_crimes_by_state(typei_in_nibrs["ori"].tolist())
crime_columns = ["Total Type I Crimes", "Type I in NIBRS Total Crimes"]
count_frame3[crime_columns] = count_frame3[crime_columns].fillna(0).astype(int)

count_frame3["Portion Type I in NIBRS Crimes"] = _coverage_portion(
    count_frame3["Type I in NIBRS Total Crimes"], count_frame3["Total Type I Crimes"], suppressed)

count_frame3["Portion Type I in NIBRS Population"] = _coverage_portion(
    count_frame3["Type I in NIBRS Total Population"], count_frame3["Total Type I Population"], suppressed)

coverage_path = output_dir / f"type_I_in_NIBRS_coverage_{year}.csv"
count_frame3.to_csv(coverage_path)

df = count_frame3

# One entry per map, left to right:
# (geo key, trace name, portion column, hover label, numerator column, denominator column)
map_specs = [
    ("geo", "Agency Counts", "Portion Type I Agencies in NIBRS",
     "NIBRS Agencies: ", "Type I Agencies in NIBRS", "Type I Agencies"),
    ("geo2", "Crime Counts", "Portion Type I in NIBRS Crimes",
     "NIBRS Crimes: ", "Type I in NIBRS Total Crimes", "Total Type I Crimes"),
    ("geo3", "Population", "Portion Type I in NIBRS Population",
     "NIBRS Population: ", "Type I in NIBRS Total Population", "Total Type I Population"),
]

data = []
layout = dict(
    title = 'NIBRS Coverage of the Type I Agencies in the Universe: Agencies (L), Crimes (M), and Population (R)'
)

map_count = len(map_specs)
for position, (geo_key, trace_name, portion_col, label, part_col, whole_col) in enumerate(map_specs):
    data.append(
        dict(
            type = 'choropleth',
            geo = geo_key,
            name = trace_name,
            locations=df["state"],  # Spatial coordinates
            z=df[portion_col].astype(float),  # Data to be color-coded
            locationmode="USA-states",  # set of locations match entries in `locations`
            text=label + df[part_col].astype(str) + "/" + df[whole_col].astype(str),
            colorscale=[list(stop) for stop in COVERAGE_COLORSCALE],
            zmin=0,
            zmax=1
        )
    )
    # Each map takes an equal horizontal slice of the figure and its full height.
    layout[geo_key] = dict(
        scope = 'usa',
        showland = True,
        showcountries = False,
        domain = dict(
            x = [position / map_count, (position + 1) / map_count],
            y = [0.0, 1.0],
        ),
    )

print("On the left: Number of agencies that were type I in the Universe file and the NIBRS database"
      " / Number of agencies that were type I in the Universe File")

print("\nIn the middle: Number of SRS Crimes associated with agencies that were type I in the Universe file and the NIBRS database"
      " / Number of SRS Crimes associated with agencies that were type I in the Universe File")

print("\nOn the right: Population associated with agencies that were type I in the Universe file and the NIBRS database"
      " / Population associated with agencies that were type I in the Universe File")

fig = go.Figure(data=data, layout=layout)
fig.show()

print("Coverage outputted as file to", f"{coverage_path}\n")

## Datasets Used:

`Missing months datafile`: missing_months_<year>.csv (reta missing months)
* **Source**: NIBRS database
* **Description**: All law enforcement agencies in the US, whether or not they should be reporting crimes, and what months they reported incidents. Lists eligible agencies and whether or not they reported for different months.
* **Typical data**: 23 columns of ORI, state, status flags, population information, and indicators for if they reported crimes for each month.


`Universe datafile`: ref_agency_<year>.csv
* **Source**: FBI CJIS
* **Description**: Annual Snapshot List of all agencies and meta-data, regardless of NIBRS reporting status.
* **Typical data**: 66 columns of ORI, population region and officer meta-data. This includes both NIBRS and non-NIBRS agencies.
   * Agency Population loaded from column (POPULATION)
   * Agency Officer Count loaded from columns (PE_MALE_OFFICER_COUNT + PE_FEMALE_OFFICER_COUNT)

`SRS datafile (historic)`: srs2016_2020_smoothed.csv
* **Source**: FBI CJIS
* **Description**: Summary Reporting System (SRS) Crime data smoothed across four years.
* **Typical data**: Several hundred columns of crime counts by month/category. For NIBRS agencies, the SRS crime counts should reflect the subset of incidents reported to NIBRS which are relevant.
   * SRS incident count is sum of all monthly total columns (v95,v213,v331,v449,v567,v685,v803,v921,v1039,v1157,v1275,v1393)

`NIBRS datafile`: Amazon Web Services database
* **Source**: FBI CJIS
* **Description**: Incident/Offender/Victim dataset of crimes published by FBI
* **Typical data**: Incident level data can be retrieved in various ways (e.g. incident, victim, offender, or agency centric viewpoints)
   * Eligible ORIs selected from reta-mm have
     * AGENCY_STATUS is 'Active' or 'Federal' (reject 'LEOKA', blanks)
     * COVERED_FLAG is 'N' (reject 'Y')
     * DORMANT_FLAG is 'N' (reject 'Y')