# Arrests/charges stemming from the Jan. 6 riot at the U.S. Capitol

### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import jenkspy
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup

In [3]:
%matplotlib inline
import json
import numpy as np
import altair as alt
import altair_latimes as lat

In [4]:
alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")

ThemeRegistry.enable('latimes')

In [5]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()
pd.set_option("display.max_colwidth", None)

### Local Reports

In [6]:
# https://www.uscp.gov/media-center/weekly-arrest-summary

In [7]:
# https://github.com/wpinvestigative/us_capitol_police_reports

In [8]:
# https://mpdc.dc.gov/page/may-2020-january-2021-unrest-related-arrests-and-persons-interest

In [9]:
# https://mpdc.dc.gov/sites/default/files/dc/sites/mpdc/publication/attachments/Unrest-Related%20Arrest%20Data%20as%20of%20January%207%202021.pdf

---

### Justice Department list

In [10]:
justice_url = "https://www.justice.gov/opa/investigations-regarding-violence-capitol"

In [11]:
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
}
response_justice = requests.get(justice_url, headers=headers)

### Create a dataframe with the html table, and clean up the headers

In [12]:
justice = pd.read_html(response_justice.text, attrs={"class": "tablesaw"})[0]
strings = [",", "/", "(", ")", "_"]

In [13]:
justice.columns = (
    justice.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace(":", "", regex=False)
    .str.replace("/", "_", regex=False)
    .str.replace(",", "_", regex=False)
)

In [14]:
# justice.rename(columns=lambda x: x.replace(" ", "_").lower(), inplace=True)

### Make a copy of the dataframe so we can improve the original

In [15]:
justice_df = pd.DataFrame(justice.rename(columns={"name": "fullname"}).copy())

In [16]:
justice_df["fullname"] = justice_df["fullname"].str.replace(
    "BETANCUR, Bryan (aka Bryan Clooney, aka Maximo Clooney)",
    "BETANCUR, Bryan",
    regex=False,
)

In [17]:
justice_df.tail()

Unnamed: 0,case_number,fullname,charge(s),associated_documents,location_of_arrest,case_status,entry_last_updated*
250,1:21-mj-229,"WILSON, Zachary John",Knowingly entering or remaining in any restricted building,Wilson - Complaint,,,"March 1, 2021"
251,1:21-mj-160,"WINN, Dana Joe",Unlawful Entry on Restricted Building or Grounds; Violent entry and disorderly conduct on Capitol Grounds,Pert & Winn - Complaint & Statement of Facts,"Florida, Middle District",Arrested 1/26/21 in the Middle District of Florida. Initial appearance 1/26/21 at 2:30pm. Released on Conditions.,"February 4, 2021"
252,1:21-mj-26,"WRIGLEY, Andrew","Entering and Remaining in a Restricted Building; Disorderly and Disruptive Conduct in a Restricted Building; Violent Entry and Disorderly Conduct in a Capitol Building; Parading, Demonstrating, or Picketing in a Capitol Building",Wrigley - Complaint Wrigley - Statement of Facts Wrigley - Information,"Pennsylvania, Middle District",Arrested on 1/15/21 in the Middle District of Pennsylvania. Initial appearance via video on 1/25/21 in federal court in the District of Columbia.,"March 1, 2021"
253,,"YOUNG, Graydon",Conspiracy; Obstruction of an Official Proceeding and Aiding and Abetting; Destruction of Government Property and Aiding and Abetting; Entering and Remaining in a Restricted Building or Grounds; Tampering with Documents or Proceedings,"Meggs Steele Young - Complaint & Affidavit Caldwell, et al – Indictment","Florida, Middle District",Arrested 2/15/21 in the Middle District of Florida,"February 19, 2021"
254,1:21-mj-193,"ZINK, Ryan Scott",Obstruction of an Official Proceeding; Knowingly Entering or Remaining in any Restricted Building or Grounds Without Lawful Authority and engages in any act of physical violence against any person or property in any restricted building or grounds.,Zink - Complaint & Statement of Facts,Pennsylvania,,"March 1, 2021"


In [18]:
justice_df["fullname"] = justice_df["fullname"].str.strip()

In [19]:
justice_df = justice_df[
    [
        "case_number",
        "fullname",
        "location_of_arrest",
        "charge(s)",
        "associated_documents",
        "case_status",
        "entry_last_updated*",
    ]
]

### Get out HTML table so we can parse it

In [20]:
response_justice = requests.get(justice_url)
soup = BeautifulSoup(response_justice.text, "html.parser")
table = soup.find("table")

### Get the defendant document links from the table into a list

In [21]:
# links = []
# for tr in table.findAll("tr")[1:]:
#     trs = tr.findAll("td")

#     this_row_links = []

#     for link in trs[3].findAll("a"):
#         this_row_links.append(link["href"])

#     links.append(this_row_links)

In [22]:
# justice_df["links"] = links

### Get the link descriptions from the table into a list

In [23]:
# press_releases = []
# for tr in table.findAll("tr")[1:]:
#     trs = tr.findAll("td")

#     this_row_text = []

#     for p in trs[3].findAll("p"):
#         this_row_text.append(p)

#     press_releases.append(this_row_text)

In [24]:
# justice_df["press_links"] = press_releases

### We no longer need this column

In [25]:
# justice_df.drop(["links_to_press_releases__charging_documents"], axis=1, inplace=True)

### How's the dataframe look? 

In [26]:
justice_df.head()

Unnamed: 0,case_number,fullname,location_of_arrest,charge(s),associated_documents,case_status,entry_last_updated*
0,1:21-cr-93,"ANDRIES, John D.",Maryland,Knowlingly Entering or Remaining in any Restricted Building or Grounds Without Lawful Authority; Violent Entry and Disorderly Conduct on Capitol Grounds,"Andries - Complaint, Statement of Facts, & Information",,"February 27, 2021"
1,1:21-cr-43,"ABUAL-RAGHEB, Rasha N.",New Jersey,"Entering and Remaining in a Restricted Building; Disorderly and Disruptive Conduct in a Restricted Building; Violent Entry and Disorderly Conduct in a Capitol Building; Parading, Demonstrating, or Picketing in a Capitol Building",Abual-Ragheb - Complaint Abual-Ragheb - Affidavit Abual-Ragheb - Indictment,Arrested 1/19/21 in the District of New Jersey,"February 4, 2021"
2,1:21-cr-84,"ADAMS, Daniel Page","Texas, Eastern District","Civil Disorder; Obstruction of an Official Proceeding; Assaulting, Resisting or Impeding Certain Officers; Entering and Remaining in a Restricted Building or Grounds; Disorderly and Disruptive Conduct in a Restricted Building or Grounds; Disorderly Conduct in a Capitol Building; Impeding Passage Through the Capitol Grounds or Buildings; Parading, Demonstrating, or Picketing in a Capitol Building","Adams Connell - Complaint Adams Connell - Statement of Facts Adams, Connell - Amended Statement of Facts Adams, Connell - Amended Complaint Adams, Connell - Indictment",Arrested 1/16/21 in the Eastern District of Texas,"February 10, 2021"
3,1:21-mj-165,"ALAM, Zachary Jordan","Pennsylvania, Eastern District","Assault on a Federal Officer with a Dangerous or Deadly Weapon; Destruction of Government Property over $1,000; Obstruction of an Official Proceeding; Unlawful Entry on Restricted Building or Grounds; Violent Entry and Disorderly Conduct","Alam, Zachary - Complaint & Affidavit",Arrested 1/30/21 in the Eastern District of Pennsylvania,"February 4, 2021"
4,1:21-cr-26,"ALBERTS, Christopher Michael",Maryland,Unlawful Possession of a Firearm on Capitol Grounds or Buildings; Unlawful Entry or Remaining on Restricted Grounds without Lawful Authoirty; Carrying a Pistol without a License Outside Home or Place of Business; Possession of a Large Capacity Ammunition Feeding Device,"Alberts - Complaint Alberts - Statement of Facts Alberts, Christopher - Indictment",Arrested on 1/7/21. Initial appearance / detention hearing on 1/7/21. Defendant released. Preliminary Hearing set for 1/28/21 at 2:00 p.m.,"February 4, 2021"


In [27]:
phrases = [
    ", Middle District",
    ", Southern District",
    ", Central District",
    ", Western District",
]

In [28]:
justice_df.to_json("processed/justice_df.json", orient="records", indent=2)

In [29]:
len(justice_df)

255

---

### Ingrest data from the Program on Extremism at GW

In [30]:
# https://extremism.gwu.edu/Capitol-Hill-Cases

In [31]:
!wget 'https://extremism.gwu.edu/sites/g/files/zaxdzs2191/f/CapitolHillTracker11521.xlsx' --directory-prefix="raw/" --output-document="raw/latest_extremism.xlsx"

--2021-03-16 07:38:32--  https://extremism.gwu.edu/sites/g/files/zaxdzs2191/f/CapitolHillTracker11521.xlsx
Resolving extremism.gwu.edu (extremism.gwu.edu)... 2606:4700::6812:825, 2606:4700::6812:925, 104.18.8.37, ...
Connecting to extremism.gwu.edu (extremism.gwu.edu)|2606:4700::6812:825|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11830 (12K) [application/vnd.openxmlformats-officedocument.spreadsheetml.sheet]
Saving to: ‘raw/latest_extremism.xlsx’


2021-03-16 07:38:34 (4.20 MB/s) - ‘raw/latest_extremism.xlsx’ saved [11830/11830]



In [32]:
gw_df = pd.read_excel("raw/latest_extremism.xlsx", skiprows=1).dropna(how="all", axis=1)

In [33]:
gw_df.columns = (
    gw_df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace(":", "", regex=False)
    .str.replace("/", "_", regex=False)
    .str.replace(",", "_", regex=False)
)

In [34]:
gw_df["datecharged"] = (pd.to_datetime(gw_df["datecharged"]).dt.date).fillna("")

In [35]:
gw_df.head(10)

Unnamed: 0,name,agearrest,gender,state,datecharged
0,"ALBERTS, Christopher Michael",33,M,MD,2021-01-07
1,"BARANYI, Thomas",28,M,NJ,2021-01-10
2,"BARNETT, Richard",60,M,AR,2021-01-07
3,"BAUER, Robert",Unknown,M,KY,
4,"BLACK, Joshua Matthew",44,M,AL,2021-01-13
5,"BLEDSOE, Matthew",Unknown,M,Unknown,
6,"BROCK, Larry Rendell",54,M,TX,2021-01-09
7,"BROWN, Terry",69,M,PA,2021-01-07
8,"CAMARGO, Samuel",Unknown,M,FL,2021-01-15
9,"CHANSLEY, Jacob Anthony",33,M,AZ,2021-01-11


In [36]:
gw_df.name = gw_df.name.str.strip()

In [37]:
len(gw_df)

58

In [38]:
gw_df.state.value_counts()

VA         7
Unknown    6
NY         5
TX         5
FL         5
CO         2
PA         2
AL         2
IL         2
MD         2
AR         2
DE         2
ID         1
HI         1
NH         1
OH         1
WI         1
KY         1
WA         1
DC         1
WV         1
MO         1
LA         1
NJ         1
UT         1
IA         1
TN         1
AZ         1
Name: state, dtype: int64

### Merge with Justice data

In [39]:
# justice_gw_merge = pd.merge(
#     justice_df, gw_df, right_on="name", left_on="fullname", how="right"
# )

In [40]:
# len(justice_gw_merge)

In [41]:
# justice_gw_merge.head()

---

### Read data from the [Prosecution Project](https://theprosecutionproject.org/2021/01/14/prosecution-data-capitol-siege-of-january-6-2021/)

In [42]:
url2 = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQ-NJiMr9_MVxsqTSB1sYkzOZSfg59m6ViR7qvjXef3O4txMuWYxh7TlTVcQAxzduCjhLxKP3dlXUhX/pub?output=csv"

In [43]:
cases = pd.read_csv(url2, dtype={"age": int})

In [44]:
cases.columns = (
    cases.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace(":", "", regex=False)
    .str.replace("/", "_", regex=False)
    .str.replace("-", "_", regex=False)
)

In [45]:
cases = cases.dropna(how="all")

In [46]:
cases["charges"] = cases["charges"].str.upper()

In [47]:
cases[["full_legal_name", "charges"]].head()

Unnamed: 0,full_legal_name,charges
0,Henry Tarrio,DC CODE 7:2506.01(B) POSSESSION OF LARGE CAPACITY AMMUNITION FEEDING DEVICE [2 COUNTS]
1,Matthew Ross Council,18:1752(A)(1) KNOWINGLY ENTERING OR REMAINING IN ANY RESTRICTED BUILDING OR GROUNDS WITHOUT LAWFUL AUTHORITY; 40:5104(E)(2) VIOLENT ENTRY AND DISORDERLY CONDUCT ON CAPITOL GROUNDS
2,Kash Lee Kelly,18:1752(A)(1) AND (2); 40:5104(E)(2)(A) AND (G)
3,Thomas Alexander Gronek,22 DC CODE 4504(A-1) CARRYING A RIFLE OR SHOTGUN (OUTSIDE HOME OR PLACE OF BUSINESS
4,Harlan Timothy Boen,22 DC CODE 4504(A) CARRYING A PISTOL WITHOUT A LICENSE (OUTSIDE HOME OR PLACE OF BUSINESS)


### Mean age of all arrestees related to the Capitol seige

In [48]:
cases.dtypes

date                   object
date_descriptor        object
case_id                object
group_identifier       object
full_legal_name        object
first_name             object
family_name            object
other_names_aliases    object
co_offenders           object
name_of_case           object
jurisdiction           object
location_country       object
location_state         object
location_city          object
charges                object
additional_details     object
age                    object
gender                 object
racial_ethnic_group    object
short_narrative        object
dtype: object

In [49]:
cases["age"] = cases["age"].fillna("0").str.replace("Bledsoe", "0", regex=False)

In [50]:
cases["age"] = cases["age"].astype(int)

ValueError: invalid literal for int() with base 10: '#'

In [None]:
cases["age"].mean()

### Number of cases so far

In [None]:
len(cases)

### Gender counts for those charged

In [None]:
round(cases.gender.value_counts("normalize") * 100)

### Race/ethnicity of those charged?

In [None]:
round(cases.racial_ethnic_group.value_counts("normalize") * 100)

In [None]:
cases_race = (
    cases.groupby(["racial_ethnic_group"]).agg({"case_id": "size"}).reset_index()
).rename(columns={"case_id": "count"})

In [None]:
alt.Chart(cases_race).mark_bar().encode(
    x=alt.X("racial_ethnic_group:N", title="Race/ethnicity"),
    y=alt.Y("count:Q", title="Count"),
).properties(width=800, height=400, title="Cases by race/ethnicity")

---

### Cases that only involve illegally entering the Capitol

In [None]:
capitol_entry_cases = cases[
    cases["charges"].fillna("").str.contains("RESTRICTED BUILDING OR GROUNDS")
]

In [None]:
len(capitol_entry_cases)

In [None]:
round(capitol_entry_cases.racial_ethnic_group.value_counts("normalize") * 100)

In [None]:
round(capitol_entry_cases.location_state.value_counts("normalize") * 100)

In [None]:
round(capitol_entry_cases.gender.value_counts("normalize") * 100)

---

### Export

In [None]:
cases.to_csv("raw/cases_prosecution_project.csv", index=False)