In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
import json
import os
from dotenv import load_dotenv
import time
from datetime import date, datetime, timedelta

In [6]:
# Load the FBI Crime Data Explorer API key from the environment.
load_dotenv() # Looks for a .env file and loads it into the process environment

api_key = os.getenv("FBI_KEY")
if not api_key:
    # os.getenv returns None for an unset variable; without this check the
    # request URLs below would be built with the literal text "API_KEY=None".
    raise RuntimeError("FBI_KEY is not set; add it to the environment or to a .env file")


In [7]:
# Smoke-test the API: request the agency listing for a single state.
state = "UT"
url = f"https://api.usa.gov/crime/fbi/cde/agency/byStateAbbr/{state}?API_KEY={api_key}"
x = requests.get(url, timeout=30)  # a timeout keeps a stalled connection from hanging the kernel
if not x.ok:
    # Report only the status code: the URL embeds the API key, which should
    # not end up in saved notebook output.
    raise RuntimeError(f"Agency lookup for {state} failed: HTTP {x.status_code}")
print(x.text[:1000])  # preview only; the full payload is available via x.json()

{"IRON":[{"ori":"UT0110000","counties":"IRON","is_nibrs":true,"latitude":37.716972,"longitude":-113.0595,"state_abbr":"UT","state_name":"Utah","agency_name":"Iron County Sheriff's Office","agency_type_name":"County","nibrs_start_date":"2019-09-01"},{"ori":"UT0110100","counties":"IRON","is_nibrs":true,"latitude":37.677555,"longitude":-113.061646,"state_abbr":"UT","state_name":"Utah","agency_name":"Cedar City Police Department","agency_type_name":"City","nibrs_start_date":"2020-01-01"},{"ori":"UT0110300","counties":"IRON","is_nibrs":true,"latitude":37.882727,"longitude":-113.290059,"state_abbr":"UT","state_name":"Utah","agency_name":"Parowan Police Department","agency_type_name":"City","nibrs_start_date":"2019-09-01"},{"ori":"UT0110400","counties":"IRON","is_nibrs":true,"latitude":37.882727,"longitude":-113.290059,"state_abbr":"UT","state_name":"Utah","agency_name":"Southern Utah University","agency_type_name":"University or College","nibrs_start_date":"2011-10-01"},{"ori":"UT0110600","c

In [9]:
# Flatten the {county: [agency, ...]} mapping from the response into one row
# per agency, tagging every record with the county key it was listed under.
data = x.json()
rows = []
for county in data:
    for agency in data[county]:
        agency['county'] = county
    rows.extend(data[county])

df = pd.DataFrame(rows)
df.head()

Unnamed: 0,ori,counties,is_nibrs,latitude,longitude,state_abbr,state_name,agency_name,agency_type_name,nibrs_start_date,county
0,UT0110000,IRON,True,37.716972,-113.0595,UT,Utah,Iron County Sheriff's Office,County,2019-09-01,IRON
1,UT0110100,IRON,True,37.677555,-113.061646,UT,Utah,Cedar City Police Department,City,2020-01-01,IRON
2,UT0110300,IRON,True,37.882727,-113.290059,UT,Utah,Parowan Police Department,City,2019-09-01,IRON
3,UT0110400,IRON,True,37.882727,-113.290059,UT,Utah,Southern Utah University,University or College,2011-10-01,IRON
4,UT0110600,IRON,True,37.882727,-113.290059,UT,Utah,Brian Head Police Department,City,2020-01-01,IRON


In [10]:
# Show the last rows of the agency table built in the previous cell.
df.tail()

Unnamed: 0,ori,counties,is_nibrs,latitude,longitude,state_abbr,state_name,agency_name,agency_type_name,nibrs_start_date,county
138,UTLED0100,NOT SPECIFIED,True,40.667883,-111.92424,UT,Utah,Wildlife Resources,Other State Agency,2010-07-01,NOT SPECIFIED
139,UTUHP0000,NOT SPECIFIED,True,40.67866,-111.95802,UT,Utah,Utah Highway Patrol,State Police,2011-06-01,NOT SPECIFIED
140,UTDI01700,NOT SPECIFIED,True,40.467753,-113.123979,UT,Utah,Goshute Tribal,Tribal,2021-01-01,NOT SPECIFIED
141,UT0181200,"SALT LAKE, UTAH",True,40.525146,-111.86235,UT,Utah,Draper Police Department,City,2007-01-01,"SALT LAKE, UTAH"
142,UT0220500,"SUMMIT, WASATCH",True,40.87206,-110.968486,UT,Utah,Park City Police Department,City,2021-01-01,"SUMMIT, WASATCH"


In [47]:
# Read the lookup lists used by the download loops further down. Each file
# wraps its list in a single top-level key, which is unwrapped here.
with open("states.json", 'r') as f:
    states = json.load(f)['states']

with open("crime_abbr.json", 'r') as f:
    crimejson = json.load(f)
crime_abbrs = crimejson['offenses']

with open("state_names.json", 'r') as f:
    statenames = json.load(f)['states']
print(statenames)

['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']


In [15]:
# Query window passed as the from/to parameters of the summarized endpoint,
# written as "MM-YYYY" strings.
start_time = "01-1985" # how far back the FBI data explorer goes
end_time = "10-2025" # last month when this was written; hard-coded, so update it before re-running

In [21]:
# Fetch a short window of summarized data for Utah (offense code "V") and
# flatten the nested {series name: {"MM-YYYY": rate}} mapping into long form.
url = f"https://api.usa.gov/crime/fbi/cde/summarized/state/UT/V?from=08-2025&to={end_time}&API_KEY={api_key}"
response = requests.get(url, timeout=30)  # a timeout keeps a stalled connection from hanging the kernel
if not response.ok:
    # Report only the status code: the URL embeds the API key.
    raise RuntimeError(f"Summarized request for UT/V failed: HTTP {response.status_code}")
resp_json = response.json()
rates = resp_json['offenses']['rates']

# A comprehension keeps the loop variables local, so the notebook-level
# `state` variable is not overwritten by the last series name.
# "date" is left as an "MM-YYYY" string here.
rows = [
    {"state": series_name, "date": date_str, "rate": rate}
    for series_name, vals in rates.items()
    for date_str, rate in vals.items()
]

df = pd.DataFrame(rows)
df.head()

Unnamed: 0,state,date,rate
0,Utah,08-2025,20.8
1,Utah,09-2025,18.75
2,Utah,10-2025,2.86
3,United States,08-2025,28.31
4,United States,09-2025,25.33


In [29]:
def flatten_crime_json_with_clearances(data, crime, states=None):
    """
    Flatten one FBI CDE "summarized" JSON response into a monthly DataFrame.

    Parameters
    ----------
    data : dict
        Parsed JSON response. Must contain ``data["offenses"]["rates"]``,
        ``data["offenses"]["actuals"]``, ``data["populations"]["population"]``
        and ``data["populations"]["participated_population"]``. Each of these
        maps a series name (e.g. ``"Utah"`` or ``"Utah Clearances"``) to a
        ``{"MM-YYYY": value}`` dict.
    crime : str
        Offense code; used only as the prefix of the four value columns.
    states : iterable of str, optional
        Series names to extract. When omitted or empty, every key of the
        rates block is used except ``"United States"`` and any key containing
        ``"Clearances"``.

    Returns
    -------
    pandas.DataFrame
        One row per (state, month) found in that state's rates series, with
        columns ``state``, ``month_year`` (datetime), ``{crime}_rate``,
        ``{crime}_actual``, ``{crime}_clearance_rate``,
        ``{crime}_clearance_actual``, ``population`` and
        ``participated_population``. Values missing from the response are
        left empty. The columns are present even when no rows are produced,
        so callers can always merge on / drop them.

    Raises
    ------
    KeyError
        If one of the four required blocks is missing from ``data``.
    """
    # Extract blocks
    offenses = data["offenses"]
    populations = data["populations"]
    rates_all = offenses["rates"]
    actuals_all = offenses["actuals"]
    population_all = populations["population"]
    participated_all = populations["participated_population"]

    columns = [
        "state",
        "month_year",
        f"{crime}_rate",
        f"{crime}_actual",
        f"{crime}_clearance_rate",
        f"{crime}_clearance_actual",
        "population",
        "participated_population",
    ]

    # Filter states: ignore US totals and clearance series, keep real states
    if not states:
        states = [s for s in rates_all if "Clearances" not in s and s != "United States"]

    rows = []
    for state in states:
        # `or {}` covers both an absent series and one that is present but null.
        rates = rates_all.get(state) or {}
        actuals = actuals_all.get(state) or {}
        rates_clear = rates_all.get(f"{state} Clearances") or {}
        actuals_clear = actuals_all.get(f"{state} Clearances") or {}
        pop = population_all.get(state) or {}
        part_pop = participated_all.get(state) or {}

        # The rates series decides which months exist for this state.
        for month_year, rate in rates.items():
            rows.append({
                "state": state,
                "month_year": month_year,
                f"{crime}_rate": rate,
                f"{crime}_actual": actuals.get(month_year),
                f"{crime}_clearance_rate": rates_clear.get(month_year),
                f"{crime}_clearance_actual": actuals_clear.get(month_year),
                "population": pop.get(month_year),
                "participated_population": part_pop.get(month_year),
            })

    # Passing `columns` guarantees the schema even when `rows` is empty.
    df = pd.DataFrame(rows, columns=columns)
    # Parse all dates in one vectorised call instead of once per row.
    df["month_year"] = pd.to_datetime(df["month_year"], format="%m-%Y")
    return df

In [30]:
# Sanity check: flatten the sample response fetched above, keeping only Utah.
df_utah = flatten_crime_json_with_clearances(
    data=resp_json,
    crime="V",
    states=["Utah"],
)
print(df_utah)

  state month_year  V_rate  V_actual  V_clearance_rate  V_clearance_actual  \
0  Utah 2025-08-01   20.80       674             11.73                 380   
1  Utah 2025-09-01   18.75       572              9.51                 290   
2  Utah 2025-10-01    2.86        15              1.33                   7   

   population  participated_population  
0     3503613                  3240938  
1     3503613                  3049993  
2     3503613                   524864  


In [31]:
# Rich display of the first rows of the flattened Utah sample.
df_utah.head()

Unnamed: 0,state,month_year,V_rate,V_actual,V_clearance_rate,V_clearance_actual,population,participated_population
0,Utah,2025-08-01,20.8,674,11.73,380,3503613,3240938
1,Utah,2025-09-01,18.75,572,9.51,290,3503613,3049993
2,Utah,2025-10-01,2.86,15,1.33,7,3503613,524864


In [48]:
# loop to get all the data
# NOTE(review): this first attempt does not run to completion; its recorded
# output is a pandas MergeError. Every response is merged into ONE wide frame
# keyed on state/month/population, so once another state is processed the
# `{crime}_*` columns already exist in `df`. They are not merge keys, so pandas
# disambiguates them with _x/_y suffixes, and a later merge then collides with
# those suffixed names. The next cell (merge per state, then concatenate) is
# the working version; this cell is kept only as a record of the failure.
df = pd.DataFrame()
first = True
# states = ['UT']
# crime_abbrs = ['V', "ROB"]
for st, state_name in zip(states, statenames):
    for crime in crime_abbrs:
        url = f"https://api.usa.gov/crime/fbi/cde/summarized/state/{st}/{crime}?from={start_time}&to={end_time}&API_KEY={api_key}"
        response = requests.get(url)
        response_json = response.json()
        df_state = flatten_crime_json_with_clearances(response_json, crime, states=[state_name])
        if first:
            # The very first response seeds the accumulator.
            df = df_state.copy()
            first = False
        else:
            df_combined = pd.merge(
                df,
                df_state,
                on=["state", "month_year", "population", "participated_population"],
                how="outer"  # use outer if some months exist in one but not the other
            )
            df = df_combined.copy()

        time.sleep(5)


        

MergeError: Passing 'suffixes' which cause duplicate columns {'V_clearance_actual_x', 'V_rate_x', 'V_actual_x', 'V_clearance_rate_x'} is not allowed.

In [51]:
# Download every (state, offense) series and assemble one wide table: one row
# per state and month, plus a group of `{crime}_*` columns per offense code.
# NOTE: the loop sleeps 5 s after every request, so a full run over all states
# and offense codes takes a long time.
key_cols = ["state", "month_year"]
pop_cols = ["population", "participated_population"]
state_frames = []  # one wide frame per state; concatenated once at the end

for st, state_name in zip(states, statenames):
    df_state = None  # will hold all crimes for this state

    for crime in crime_abbrs:
        url = f"https://api.usa.gov/crime/fbi/cde/summarized/state/{st}/{crime}?from={start_time}&to={end_time}&API_KEY={api_key}"
        response = requests.get(url, timeout=60)  # a timeout keeps a stalled connection from hanging the kernel
        time.sleep(5)  # avoid hitting API too fast
        if not response.ok:
            # Report only the status code: the URL embeds the API key.
            raise RuntimeError(f"FBI CDE request failed for {st}/{crime}: HTTP {response.status_code}")
        response_json = response.json()

        # Flatten into columns prefixed with this crime's code
        df_crime = flatten_crime_json_with_clearances(response_json, crime=crime, states=[state_name])
        if df_crime.empty:
            # Nothing to merge. Skipping here also ensures the population
            # columns come from the first offense that actually has rows.
            continue

        if df_state is None:
            df_state = df_crime
        else:
            # Population columns are already present from the first non-empty
            # offense; drop them so the merge does not create _x/_y duplicates.
            df_state = pd.merge(
                df_state,
                df_crime.drop(columns=pop_cols),
                on=key_cols,  # merge only on state + month
                how="outer",
            )

    if df_state is not None:
        state_frames.append(df_state)

if not state_frames:
    raise RuntimeError("No data was returned for any state/offense combination")

# Concatenate once (instead of growing the frame inside the loop), then sort
df = (
    pd.concat(state_frames, ignore_index=True)
    .sort_values(key_cols)
    .reset_index(drop=True)
)


In [45]:
# Row count of the combined frame.
# NOTE(review): this cell's execution count (45) is lower than that of the
# download loop above (51), so the recorded value probably comes from an
# earlier, smaller run. Re-run it after the loop or remove the cell.
len(df)

490

In [46]:
# Last rows of the combined frame.
# NOTE(review): executed (count 46) before the download loop above (count 51);
# the recorded output shows only Utah with V and ROB columns, presumably from
# a trial run on a subset. Re-run it after the loop or remove the cell.
df.tail()

Unnamed: 0,state,month_year,V_rate,V_actual,V_clearance_rate,V_clearance_actual,population,participated_population,ROB_rate,ROB_actual,ROB_clearance_rate,ROB_clearance_actual
485,Utah,2025-06-01,20.26,663,11.37,372,3503613,3272553,2.05,67,0.79,26
486,Utah,2025-07-01,21.14,685,11.79,382,3503613,3240713,2.13,69,1.02,33
487,Utah,2025-08-01,20.8,674,11.73,380,3503613,3240938,2.62,85,0.99,32
488,Utah,2025-09-01,18.75,572,9.51,290,3503613,3049993,2.13,65,0.62,19
489,Utah,2025-10-01,2.86,15,1.33,7,3503613,524864,0.0,0,0.0,0


In [52]:
# First rows of the combined frame after the download loop.
df.head()

Unnamed: 0,state,month_year,V_rate,V_actual,V_clearance_rate,V_clearance_actual,population,participated_population,ASS_rate,ASS_actual,...,ROB_clearance_rate,ROB_clearance_actual,ARS_rate,ARS_actual,ARS_clearance_rate,ARS_clearance_actual,P_rate,P_actual,P_clearance_rate,P_clearance_actual
0,Alabama,1985-01-01,34.54,1372.0,18.76,745.0,4021000,3971935.0,22.46,892.0,...,2.79,111.0,2.44,97.0,0.38,15.0,296.58,11780.0,56.04,2226.0
1,Alabama,1985-02-01,32.81,1301.0,14.98,594.0,4021000,3965438.0,20.73,822.0,...,2.35,93.0,2.22,88.0,0.35,14.0,261.86,10384.0,50.49,2002.0
2,Alabama,1985-03-01,40.06,1580.0,17.83,703.0,4021000,3943841.0,27.87,1099.0,...,2.74,108.0,2.51,99.0,0.33,13.0,288.37,11373.0,55.43,2186.0
3,Alabama,1985-04-01,35.63,1405.0,18.62,734.0,4021000,3942841.0,24.98,985.0,...,2.23,88.0,3.04,120.0,0.46,18.0,265.32,10461.0,57.12,2252.0
4,Alabama,1985-05-01,38.23,1507.0,17.43,687.0,4021000,3942123.0,26.84,1058.0,...,2.16,85.0,2.49,98.0,0.48,19.0,281.24,11087.0,55.58,2191.0


In [53]:
# Row count of the combined frame after the download loop.
len(df)

24500

In [56]:
# Persist the combined frame as a Parquet file, using a path relative to the
# working directory.
df.to_parquet("summarized_fbi_data.parquet")