# FBI background checks

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import tabula

In [3]:
import pandas as pd
import geopandas as gpd
import altair as alt
import datetime as dt
import numpy as np

In [4]:
# Widen pandas' display limits so wide/long frames and long cell values
# are shown in full in the notebook output.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

In [5]:
# Today's date as "MM_YYYY"; used as a suffix for the downloaded PDF's name.
month_year_updated = f"{dt.date.today():%m_%Y}"

---

In [6]:
# Location of the FBI's NICS firearm-checks PDF.
# NOTE: "month_year" is part of the literal file name; nothing in this
# notebook substitutes a date into it.
url = (
    "https://www.fbi.gov/file-repository/"
    "nics_firearm_checks_-_month_year.pdf"
)

### Download the latest version of the FBI PDF

In [7]:
# Save the PDF at `url` to data/raw/latest_<MM_YYYY>.pdf; --quiet suppresses
# wget's output.
# NOTE(review): with --quiet a failed download is not reported in this cell —
# confirm the file exists and is non-empty before parsing it.
!wget -O data/raw/latest_{month_year_updated}.pdf {url} --quiet

### Read PDF file and use stream argument to keep rows/columns in place

In [8]:
# Parse page 1 of the downloaded PDF in tabula's stream mode.
# The result is indexed like a list of tables (see `table[0]` below).
pdf_path = f"data/raw/latest_{month_year_updated}.pdf"
table = tabula.read_pdf(pdf_path, pages=1, stream=True)

### Make a dataframe from the first (and, in this case, only) table in the list

In [9]:
# Take the first parsed table and drop the rows labelled 0 and 24.
# In the outputs shown further down, labels 1..23 carry the years 1999..2021.
# NOTE(review): these labels are hard-coded to the PDF layout at the time of
# writing; presumably 0 and 24 are a partial/header row and a footer row.
# Re-check them whenever the PDF gains a new year row — TODO confirm.
src = table[0].drop([0, 24])

### Clean up

In [10]:
# Replace missing cells with 0 so the later integer cast cannot hit NaN.
src = src.fillna(0)

In [11]:
# Lower-case the column labels; later cells address the columns as
# "year", "jan" ... "dec" and "totals".
src.columns = src.columns.str.lower()

In [12]:
# Keep the year as text: it is used as a label and as the merge key against
# the population file (which is also read with year as str).
# The original cell ran this identical cast twice; once is sufficient.
src["year"] = src["year"].astype(str)

In [13]:
# Strip every comma inside string cells (thousands separators) so the
# count columns can be cast to integers.
src = src.replace(",", "", regex=True)

In [14]:
# Cast the twelve monthly columns and the yearly total to integers.
# The column list is defined once so the selection on the left- and
# right-hand side of the assignment cannot drift apart.
count_columns = [
    "jan",
    "feb",
    "mar",
    "apr",
    "may",
    "jun",
    "jul",
    "aug",
    "sep",
    "oct",
    "nov",
    "dec",
    "totals",
]
src[count_columns] = src[count_columns].astype(int)

### Copy the dataframe

In [15]:
# Work on an independent (deep) copy so the cleaned source frame stays intact.
df = src.copy(deep=True)

---

### How many per year?

In [16]:
# Yearly totals only, newest year first (year is a string, so this is a
# lexicographic sort).
annual = df.loc[:, ["year", "totals"]].sort_values(by="year", ascending=False)

In [17]:
# Preview the five most recent years.
annual.head(n=5)

Unnamed: 0,year,totals
23,2021,35778134
22,2020,39695315
21,2019,28369750
20,2018,26181936
19,2017,25235215


In [18]:
# Frame holding only the twelve monthly count columns; iterating over it
# yields the month column names.
months = df.loc[
    :,
    [
        "jan",
        "feb",
        "mar",
        "apr",
        "may",
        "jun",
        "jul",
        "aug",
        "sep",
        "oct",
        "nov",
        "dec",
    ],
]

In [19]:
# For every month column, add "<month>_share": that month's checks as a
# percentage of the year's total, rounded to two decimals.
for month in months:
    monthly_pct = df[month] / df["totals"] * 100
    df[f"{month}_share"] = monthly_pct.round(2)

In [20]:
# Year plus the twelve "<month>_share" columns, in calendar order.
df_share = df[
    ["year"]
    + [
        f"{abbr}_share"
        for abbr in (
            "jan",
            "feb",
            "mar",
            "apr",
            "may",
            "jun",
            "jul",
            "aug",
            "sep",
            "oct",
            "nov",
            "dec",
        )
    ]
]

In [21]:
# Whole-percent view of the monthly shares with an orange gradient.
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=...) is the supported replacement (pandas >= 1.3).
df_share.round(0).style.background_gradient(cmap="Oranges").format(precision=0)

Unnamed: 0,year,jan_share,feb_share,mar_share,apr_share,may_share,jun_share,jul_share,aug_share,sep_share,oct_share,nov_share,dec_share
1,1999,6,8,8,7,6,6,6,8,9,10,11,14
2,2000,7,8,9,7,6,6,6,8,9,10,11,12
3,2001,7,8,8,7,6,6,6,8,10,12,11,12
4,2002,8,8,8,7,7,6,6,8,9,10,10,12
5,2003,8,8,9,7,7,6,6,8,9,10,10,12
6,2004,8,8,8,7,6,6,6,8,9,10,10,12
7,2005,8,8,9,7,6,6,6,8,9,10,10,13
8,2006,8,8,8,7,6,6,6,8,9,10,10,12
9,2007,8,8,9,8,7,7,7,8,8,9,10,11
10,2008,7,8,8,7,7,6,7,8,8,9,12,12


In [22]:
# Same table without rounding: one gradient across the whole table
# (axis=None), every cell rendered through the "{:.4}" format string.
(
    df_share.style
    .background_gradient(cmap="Oranges", axis=None)
    .format("{:.4}")
)

Unnamed: 0,year,jan_share,feb_share,mar_share,apr_share,may_share,jun_share,jul_share,aug_share,sep_share,oct_share,nov_share,dec_share
1,1999,6.47,7.62,8.24,7.08,6.31,6.23,6.45,7.7,8.85,10.35,10.99,13.72
2,2000,7.49,8.28,8.62,7.23,6.31,6.44,6.35,7.99,9.15,9.9,10.52,11.72
3,2001,7.19,7.58,8.19,6.67,6.1,6.07,6.05,7.94,9.7,11.56,11.03,11.93
4,2002,7.88,8.22,8.45,7.43,6.73,6.13,6.34,8.2,8.57,10.05,10.5,11.52
5,2003,7.71,8.35,8.69,7.34,6.69,6.24,6.29,8.06,8.71,10.1,9.94,11.89
6,2004,8.0,8.33,8.5,7.4,6.24,6.29,6.47,7.67,8.52,9.97,10.25,12.36
7,2005,7.66,8.3,8.58,7.36,6.22,6.21,6.27,7.67,8.84,9.52,10.36,13.01
8,2006,7.73,8.18,8.42,6.98,6.24,6.14,6.29,8.3,9.16,9.66,10.41,12.49
9,2007,8.0,8.19,8.73,7.52,7.18,7.09,6.78,8.21,8.45,9.17,9.66,11.01
10,2008,7.42,8.03,8.19,7.4,6.97,6.45,7.01,7.53,7.66,9.31,12.04,11.99


In [23]:
# Write the monthly-share table to disk without the index column.
df_share.to_csv(path_or_buf="data/processed/df_share.csv", index=False)

---

### Normalize by population

In [24]:
# Load the population table; year is read as text so it matches the
# string year column used as the merge key.
pop = pd.read_csv(
    filepath_or_buffer="data/raw/population.csv",
    dtype={"year": str},
)

In [25]:
# Strip commas from string cells so the population figures can be cast to int.
pop = pop.replace(",", "", regex=True)

In [26]:
# Population as integers, ready for the per-capita division.
pop = pop.astype({"population": int})

In [27]:
# Join yearly totals with population on year. This is an inner join (the
# default), so years missing from either table are dropped.
norm_df = annual.merge(pop, on="year")

In [28]:
# Background checks per 1,000 residents, rounded to a whole number.
norm_df["per_thousand"] = (
    norm_df["totals"].div(norm_df["population"]).mul(1000).round()
)

In [29]:
# Preview the first five rows of the normalized table.
norm_df.head(n=5)

Unnamed: 0,year,totals,population,per_thousand
0,2021,35778134,332992659,107.0
1,2020,39695315,329484123,120.0
2,2019,28369750,328329953,86.0
3,2018,26181936,326838199,80.0
4,2017,25235215,325122128,78.0


In [30]:
# Bar chart of the per-1,000 rate by year, 650 px wide.
# NOTE(review): "year" is a string column here, so Altair will presumably
# treat the x axis as categorical and tickCount may have no visible effect —
# confirm in the rendered chart.
alt.Chart(norm_df).mark_bar().encode(
    # x: one bar per year, axis titled "Year"
    x=alt.X("year", axis=alt.Axis(tickCount=9), title="Year"),
    # y: checks per 1,000 residents, axis titled "Rate"
    y=alt.Y("per_thousand", axis=alt.Axis(tickCount=5, title="Rate")),
).properties(width=650, title="FBI firearm background checks per 1,000 Americans")

---

### Export

In [31]:
# Write the yearly totals, population and per-1,000 rate to disk without
# the index column.
norm_df.to_csv(
    path_or_buf="data/processed/fbi_annual_background_checks.csv",
    index=False,
)