# DOJ arrests/charges data from the Jan. 6 riot at the U.S. Capitol

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [3]:
import altair as alt
import altair_latimes as lat
import numpy as np

In [4]:
alt.themes.register("latimes", lat.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('latimes')

In [5]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()
pd.options.display.max_colwidth = None

### Justice Department list

In [6]:
justice_url = "https://www.justice.gov/opa/investigations-regarding-violence-capitol"

In [7]:
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
}
response_justice = requests.get(justice_url, headers=headers)

### Create a dataframe with the html table, and clean up the headers

In [8]:
justice = pd.read_html(response_justice.text, attrs={"class": "tablesaw"})[0]
strings = [",", "/", "(", ")", "_"]

### Clean up the headers

In [9]:
justice.columns = (
    justice.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace(":", "", regex=False)
    .str.replace("/", "_", regex=False)
    .str.replace(",", "_", regex=False)
    .str.replace("*", "", regex=False)
    .str.replace("(s)", "s", regex=False)
)

### Make a copy of the dataframe so we can improve the original

In [10]:
doj = pd.DataFrame(justice.rename(columns={"name": "fullname"}).copy())

In [11]:
doj["fullname"] = doj["fullname"].str.replace(
    "BETANCUR, Bryan (aka Bryan Clooney, aka Maximo Clooney)",
    "BETANCUR, Bryan",
    regex=False,
)

In [12]:
doj["fullname"] = doj["fullname"].str.strip()

In [13]:
doj = doj[
    [
        "case_number",
        "fullname",
        "location_of_arrest",
        "charges",
        "associated_documents",
        "case_status",
        "entry_last_updated",
    ]
]

### Get out HTML table so we can parse it

In [14]:
response_justice = requests.get(justice_url)
soup = BeautifulSoup(response_justice.text, "html.parser")
table = soup.find("table")

### Clean up locations

In [15]:
doj["location_of_arrest"] = (
    doj["location_of_arrest"]
    .str.strip()
    .str.title()
    .str.replace(", Middle District", "", regex=False)
    .str.replace(", Southern District", "", regex=False)
    .str.replace(", Central District", "", regex=False)
    .str.replace(", Western District", "", regex=False)
    .str.replace(", Eastern District", "", regex=False)
    .str.replace(", Northern District", "", regex=False)
)

In [16]:
doj["location_of_arrest"] = doj["location_of_arrest"].fillna("Not listed")

In [17]:
new = doj["location_of_arrest"].str.split(", ", n=1, expand=True)
doj["location_of_arrest_state"] = new[0]
doj["location_of_arrest_other"] = new[1]
doj.drop(columns=["location_of_arrest"], inplace=True)

### Get the defendant document links from the table into a list

In [18]:
links = []

for tr in table.findAll("tr")[1:]:
    trs = tr.findAll("td")

    this_row_links = []

    for link in trs[3].findAll("a"):
        this_row_links.append("https://www.justice.gov" + link["href"])

    links.append(this_row_links)

In [19]:
doj["links"] = links

In [20]:
links = doj["links"].apply(pd.Series).fillna("")

In [21]:
df = pd.merge(doj, links, left_index=True, right_index=True)

In [22]:
# df.drop(["links"], axis=1, inplace=True)

In [23]:
df.rename(
    columns={
        0: "link_0",
        1: "link_1",
        2: "link_2",
        3: "link_3",
        4: "link_4",
        5: "link_5",
    },
    inplace=True,
)

---

### Parse the defendant names

In [24]:
df["fullname"] = df["fullname"].str.replace(" (aka", ", (aka", regex=False)

In [25]:
df[["last_name", "first_name", "rest", "rest2", "rest3"]] = (
    df["fullname"].str.title().str.strip().str.split(", ", expand=True).fillna("")
)

In [26]:
df[["first_name", "middle_name", "other_name"]] = (
    df["first_name"].str.title().str.split(" ", expand=True)
)

In [27]:
df.drop(["rest", "rest2", "other_name"], axis=1, inplace=True)

In [28]:
df = df[
    [
        "case_number",
        "fullname",
        "first_name",
        "middle_name",
        "last_name",
        "charges",
        "case_status",
        "entry_last_updated",
        "location_of_arrest_state",
        "location_of_arrest_other",
        "associated_documents",
        "link_0",
        "link_1",
        "link_2",
        "link_3",
        "link_4",
        "link_5",
        "links",
    ]
]

---

### How's the dataframe look? 

In [29]:
df.head(1)

Unnamed: 0,case_number,fullname,first_name,middle_name,last_name,charges,case_status,entry_last_updated,location_of_arrest_state,location_of_arrest_other,associated_documents,link_0,link_1,link_2,link_3,link_4,link_5,links
0,1:21-cr-212,"ADAMS, Jared Hunter",Jared,Hunter,Adams,"Entering and Remaining in a Restricted Building; Disorderly and Disruptive Conduct in a Restricted Building; Violent Entry and Disorderly Conduct in a Capitol Building; Parading, Demonstrating, or Picketing in a Capitol Building","Arrest date & location: 3/9/2021 in Hilliard, Ohio","March 22, 2021",Ohio,Hilliard,Adams & Jared - Information Adams & Jared - Statement of Facts Adams & Jared - Complaint,https://www.justice.gov/usao-dc/case-multi-defendant/file/1378326/download,https://www.justice.gov/usao-dc/case-multi-defendant/file/1378331/download,https://www.justice.gov/usao-dc/case-multi-defendant/file/1378336/download,,,,"[https://www.justice.gov/usao-dc/case-multi-defendant/file/1378326/download, https://www.justice.gov/usao-dc/case-multi-defendant/file/1378331/download, https://www.justice.gov/usao-dc/case-multi-defendant/file/1378336/download]"


### How many cases?

In [30]:
len(df)

362

### Arrest locations

In [31]:
df.location_of_arrest_state.value_counts().head(10)

Not listed      46
Florida         33
Pennsylvania    33
Texas           30
New York        24
Virginia        17
Ohio            17
California      13
New Jersey      12
Missouri        11
Name: location_of_arrest_state, dtype: int64

### California cases

In [32]:
df_ca = df[df["location_of_arrest_state"] == "California"]

In [33]:
df_ca[["fullname", "case_status", "entry_last_updated", "charges"]]

Unnamed: 0,fullname,case_status,entry_last_updated,charges
7,"ALLAN, Tommy Frederick",Arrested 1/22/21 in the Eastern District of California,"February 10, 2021","Theft of Government Property; Entering and Remaining in a Restricted Building or Grounds; Disorderly and Disruptive Conduct in a Restricted Building or Grounds; Entering and Remaining on the Floor of Congress; Disorderly Conduct in a Capitol Building; Parading, Demonstrating, or Picketing in a Capitol Building"
30,"BISIGNANO, Gina Michelle",Arrested 1/19/21 in the Central District of California Ordered detained. Indicted 1/29/21.,"February 4, 2021",Obstruction of an Official Proceeding; Aiding and Abetting; Civil Disorder; Destruction of Government Property; Entering and Remaining in a Restricted Building or Grounds; Disorderly and Disruptive Conduct in a Restricted Building or Grounds; Engaging in Physical Violence in a Restricted Building or Grounds; Disorderly Conduct in a Capitol Building
68,"CORDON, Kevin Francisco","Arrested on 03/09/2021 in Alhambra, California","April 7, 2021","Obstruction of an Official Proceeding; Entering and Remaining in a Restricted Building or Grounds; Disorderly and Disruptive Conduct in a Restricted Building or Grounds; Disorderly Conduct in a Capitol Building; Parading, Demonstrating, or Picketing in a Capitol Building"
86,"EHMKE, Hunter Allen",Arrested 1/13/21 in California. Initial Appearance in federal court in the Central District of California 1/14/21 at 1pm PST. Defendant was released on bond with home detention and electronic home monitoring. Initial appearance in federal court in the District of Columbia scheduled for 1/21/21 at 1pm EST.,"February 4, 2021","Destruction of Government Property; Obstruction of an Official Proceeding; Disorderly Conduct in a Capitol Building; Parading, Demonstrating, or Picketing in a Capitol Building"
87,"EHRKE, Valerie Elaine",Arrested 1/19/21 in the Eastern District of California,"February 10, 2021","Entering and Remaining in a Restricted Building; Disorderly and Disruptive Conduct in a Restricted Building; Violent Entry and Disorderly Conduct in a Capitol Building; Parading, Demonstrating, or Picketing in a Capitol Building"
109,"GOLD, Simone Melissa",Arrested 1/18/21 in the Central District of California Released on conditions. Virtual hearing in federal court in the District of Columbia on 1/21/21 at 1:00pm.,"February 8, 2021",Restricted Building or Grounds; Violent Entry and Disorderly Conduct
177,"LEWIS, Jacob",Arrested 1/27/2021 in the Central District of California. Initial appearance 1/27/21 at 2:00pm PST. Released on conditions. Virtual initial appearance in federal court in the District of Columbia on 2/10/21 at 1:00pm,"February 10, 2021","Entering and Remaining in a Restricted Building; Disorderly and Disruptive Conduct in a Restricted Building; Violent Entry and Disorderly Conduct in a Capitol Building; Parading, Demonstrating, or Picketing in a Capitol Building"
262,"RILEY, Jorge A.",Arrested 1/19/21 in the Eastern District of California,"February 10, 2021","Obstructing of an Official Proceeding; Aiding and Aiding; Entering and Remaining in a Restricted Building or Grounds; Disorderly and Disruptive Conduct in a Restricted Building or Grounds; Disorderly Conduct in a Capitol Building; Parading, Demonstrating, or Picketing in a Capitol Building"
288,"SECOR, Christian",Arrested 2/16/21 in the Central District of California,"March 24, 2021","Obstruction of an Official Proceeding; Civil Disorder; Assaulting, Resisting, or Impeding Certain Officers; Entering and Remaining in a Restricted Building or Grounds; Disorderly and Disruptive Conduct in a Restricted Building or Grounds; Entering and Remaining on the Floor of Congress; Entering and Remaining in the Gallery of Congress; Entering and Remaining in Certain Rooms in the Capitol Building; Disorderly Conduct in a Capitol Building; Parading, Demonstrating, or Picketing in a Capitol Building"
298,"SIMON, Mark",Arrested 1/28/21 in the Central District of California. Initial appearance 1/28/21 at 2:00pm PST in the Central District of California. Released on bond. Next hearing (virtual) on 2/3/21 in federal court in the District of Columbia.,"March 19, 2021","Entering and Remaining in a Restricted Building or Grounds; Disorderly and Disruptive Conduct in a Restricted Building or Grounds; Disorderly Conduct in a Capitol Building or Grounds; Parading, Demonstrating, or Picketing in a Capitol Building"


In [34]:
len(df_ca)

13

---

### Parse and categorize each defendant's list of charges

In [35]:
df["charges_list"] = df["charges"].str.upper().str.split(";")

### Create a flat table from the one-to-many relationship between defendants and charges

In [36]:
df_long = df.explode("charges_list")

### Charge strings and categories

In [37]:
data_list = {
    "charge": [
        "INTERSTATE",
        "VIOLENCE",
        "PARADING, DEMONSTRATING, OR PICKETING",
        "PARADE, DEMONSTRATE, OR PICKET",
        "ENTERING",
        "DISORDERLY CONDUCT",
        "DISORDERLY AND DISRUPTIVE CONDUCT",
        "OBSTRUCTION OF AN OFFICIAL",
        "CIVIL DISORDER",
        "OBSTRUCTION OF AN OFFICIAL",
        "KNOWINGLY ENTERING OR REMAINING",
        "VIOLENT ENTRY",
        "DISORDERLY AND DISRUPTIVE CONDUCT",
        "REMAINING IN A RESTRICTED BUILDING",
        "VIOLENT ENTRY",
        "AIDING AND ABETTING",
        "PHYSICAL VIOLENCE",
        "CONSPIRACY",
        "ASSAULTING, RESISTING, OR IMPEDING",
        "RESTRICTED BUILDING OR GROUNDS",
        "ENTERING AND REMAINING IN A RESTRICTED BUILDING OR GROUNDS WITH A DEADLY OR DANGEROUS WEAPON",
        "ASSAULTING, RESISTING, OR IMPEDING CERTAIN OFFICERS USING A DANGEROUS WEAPON",
        "IMPEDING PASSAGE THROUGH THE CAPITOL GROUNDS OR BUILDINGS",
        "TAMPERING",
        "THEFT",
        "OBSTRUCTION",
        "DESTRUCTION",
        "DEMONSTRATING",
        "DISRUPTIVE CONDUCT",
        "PROPERTY",
        "ASSAULTING",
        "UNLAWFUL ENTRY",
        "IMPEDING",
        "PISTOL",
        "FIREARM",
        "OBSTRUCTING",
        "UNLAWFUL ACTIVITIES ON CAPITOL GROUNDS",
        "INTERFERED WITH A FEDERAL AGENT",
        "UNLAWFUL POSSESSION",
        "BODILY INJURY",
        "THREATENING A FEDERAL OFFICER",
        "ASSAULT",
        "PARADING",
        "DISORDERLLY CONDUCT",
        "DI$ORDERLY CONDUCT",
        "OBSTRUCT, IMPEDE, OR INTERFERE",
        "DISRUPTING THE ORDERLY CONDUCT",
        "AMMUNITION",
        "DI$ORDERLY",
        "DISORDELRY CONDUCT",
        "OFFICIAL PROCEEDING",
        "AIDING AND AIDING",
    ],
    "category": [
        "Other",
        "Other",
        "Demonstrating in the Capitol",
        "Demonstrating in the Capitol",
        "Entering/remaining in resricted area",
        "Disruptive/disorderly conduct",
        "Disruptive/disorderly conduct",
        "Obstructing a proceeding",
        "Disruptive/disorderly conduct",
        "Obstructing a proceeding",
        "Entering/remaining in resricted area",
        "Violent entry",
        "Disruptive/disorderly conduct",
        "Entering/remaining in resricted area",
        "Violent entry",
        "Aiding and abetting",
        "Physical violence in the Capitol",
        "Conspiracy",
        "Assaulting/resisting/impeding officers",
        "Entering/remaining in resricted area",
        "Weapons charge",
        "Assaulting/resisting/impeding officers w/ weapon",
        "Impeding passage in the Capitol",
        "Tampering",
        "Theft",
        "Obstructing a proceeding",
        "Destroying property",
        "Demonstrating in the Capitol",
        "Disruptive/disorderly conduct",
        "Property damage",
        "Assaulting/resisting/impeding officers",
        "Entering/remaining in resricted area",
        "Assaulting/resisting/impeding officers",
        "Weapons charge",
        "Weapons charge",
        "Obstructing a proceeding",
        "Unlawful activities on Capitol grounds",
        "Assaulting/resisting/impeding officers",
        "Weapons charge",
        "Assaulting/resisting/impeding officers",
        "Assaulting/resisting/impeding officers",
        "Assaulting/resisting/impeding officers",
        "Demonstrating in the Capitol",
        "Disruptive/disorderly conduct",
        "Disruptive/disorderly conduct",
        "Assaulting/resisting/impeding officers",
        "Disruptive/disorderly conduct",
        "Weapons charge",
        "Disruptive/disorderly conduct",
        "Disruptive/disorderly conduct",
        "Obstructing a proceeding",
        "Aiding and abetting",
    ],
}

### Loop and assign categories

In [38]:
for charge, category in zip(data_list["charge"], data_list["category"]):
    df_long.loc[
        df_long["charges_list"].str.contains(f"{charge}", case=False), "category"
    ] = f"{category}"

---

### How many charges does each defendant face? 

In [39]:
cases_by_defendant = (
    df_long.groupby(["case_number", "fullname"])
    .agg({"charges_list": "count"})
    .reset_index()
).sort_values("charges_list", ascending=False)

In [40]:
cases_by_defendant.sort_values("charges_list", ascending=False).head()

Unnamed: 0,case_number,fullname,charges_list
13,1:21-cr-107,"CUA, Bruno Joseph",12
26,1:21-cr-120,"FAIRLAMB, Scott Kevin",11
31,1:21-cr-127,"BLACK, Joshua Matthew",10
22,1:21-cr-117,"NICHOLS, Ryan",10
110,1:21-cr-206,"MELLIS, Jonathan Gennaro, (aka, Jon Gennaro)",10


### Which charges are most common (many imperfect strings here)?

In [41]:
charges_by_charges = (
    df_long.groupby(
        [
            "charges_list",
        ]
    )
    .agg({"case_number": "count"})
    .reset_index()
).sort_values("case_number", ascending=False)

In [42]:
charges_by_charges.sort_values("case_number", ascending=False).head(10)

Unnamed: 0,charges_list,case_number
140,"PARADING, DEMONSTRATING, OR PICKETING IN A CAPITOL BUILDING",119
49,DISORDERLY CONDUCT IN A CAPITOL BUILDING,93
41,DISORDERLY AND DISRUPTIVE CONDUCT IN A RESTRICTED BUILDING OR GROUNDS,91
80,ENTERING AND REMAINING IN A RESTRICTED BUILDING OR GROUNDS,72
205,ENTERING AND REMAINING IN A RESTRICTED BUILDING,48
39,DISORDERLY AND DISRUPTIVE CONDUCT IN A RESTRICTED BUILDING,45
247,OBSTRUCTION OF AN OFFICIAL PROCEEDING,43
175,VIOLENT ENTRY AND DISORDERLY CONDUCT IN A CAPITOL BUILDING,38
179,VIOLENT ENTRY AND DISORDERLY CONDUCT ON CAPITOL GROUNDS,38
123,OBSTRUCTION OF AN OFFICIAL PROCEEDING,38


### Which categories of charges are most common? 

In [43]:
charges_by_category = (
    df_long.groupby(
        [
            "category",
        ]
    )
    .agg({"case_number": "count"})
    .reset_index()
).sort_values("case_number", ascending=False)

In [44]:
charges_by_category.sort_values("case_number", ascending=False).head(10)

Unnamed: 0,category,case_number
4,Disruptive/disorderly conduct,442
5,Entering/remaining in resricted area,368
6,Obstructing a proceeding,184
3,Demonstrating in the Capitol,182
12,Violent entry,112
1,Assaulting/resisting/impeding officers,110
9,Property damage,60
8,Physical violence in the Capitol,41
13,Weapons charge,33
0,Aiding and abetting,29


In [45]:
charges_by_category.case_number.sum()

1608

---

### Parse dates from the "case status" column

In [46]:
df_long_new = df_long.copy()

In [47]:
df_long_new["case_status_helper"] = df_long_new["case_status"].fillna("").astype(str)

### Remove alphabetical characters to expose dates in sentences

In [48]:
df_long_new["arrested_date"] = (
    df_long_new["case_status_helper"]
    .str.replace("April ", "04/", regex=False)
    .str.replace("March ", "03/", regex=False)
    .str.replace("January ", "01/", regex=False)
    .str.replace("February ", "01/", regex=False)
    .str.replace(", 2021 ", "/21", regex=False)
    .str.replace("[^0-9,-,/.]", "", regex=True)
)

In [61]:
df_long_new["arrested_date"].value_counts()

                                                   380
1/19/21                                             56
1/18/21                                             44
2/11/21                                             30
1/26/21                                             30
2/1/21                                              22
1/27/21                                             22
03/09/2021,                                         21
1/25/21                                             20
02/19/2021,                                         18
02/25/2021                                          18
2/4/21                                              16
1/15/21                                             15
1/17/21.                                            15
03/04/2021,                                         14
03/12/2021,                                         14
1/20/21                                             13
2/6/21                                              12
.         

In [62]:
df_long_new["arrested_date"] = (
    df_long_new["arrested_date"]
    .str.replace(",", "|", regex=False)
    .str.replace("/21", "/21|", regex=False)
    .str.replace(".", "|", regex=False)
)

In [63]:
df_long_new[
    ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"]
] = df_long_new["arrested_date"].str.split("|", expand=True)

In [64]:
df_long_new["arrested_date"] = df_long_new["arrested_date"] + ("/21")

In [65]:
df_long_new["arrested_date"].head()

0                3/9/2021|/21
0                3/9/2021|/21
0                3/9/2021|/21
0                3/9/2021|/21
1    2/12/21||2/12/21|200|/21
Name: arrested_date, dtype: object

In [66]:
df_long_new.drop(
    [
        "1",
        "2",
        "3",
        "4",
        "5",
        "6",
        "7",
        "8",
        "9",
        "10",
        "11",
        "12",
        "13",
        "arrested_date",
    ],
    axis=1,
    inplace=True,
)

In [67]:
df_long_new.rename(
    columns={"charge": "all_charges", "charges_list": "charge", "0": "arrest_date"},
    inplace=True,
)

In [68]:
df_long_new["arrest_date"] = (
    df_long_new["arrest_date"]
    .str.replace("1/19/20212/11/2021200", "1/19/2021", regex=False)
    .str.replace("2021", "21", regex=False)
)

In [69]:
df_long_new["arrest_date"] = df_long_new["arrest_date"].str.strip()

In [70]:
df_clean = df_long_new[
    [
        "case_number",
        "arrest_date",
        "fullname",
        "first_name",
        "middle_name",
        "last_name",
        "charge",
        "case_status",
        "entry_last_updated",
        "location_of_arrest_state",
        "location_of_arrest_other",
        "associated_documents",
        "links",
    ]
]

In [72]:
df_clean["arrest_date"].value_counts()

              417
1/19/21        90
1/18/21        70
1/26/21        49
1/16/21        44
2/4/21         43
1/15/21        43
1/20/21        41
1/14/21        41
2/11/21        40
1/22/21        40
1/28/21        35
1/17/21        33
03/09/21       33
1/25/21        33
1/27/21        30
1/21           27
2/1/21         25
02/25/21       24
03/04/21       23
03/17/21       23
1/12/21        22
1/7/21         22
1/13/21        19
03/12/21       18
02/19/21       18
1/29/21        17
03/18/21       14
1/30/21        13
2/6/21         12
3/15/21        11
03/05/21       10
1/8/21         10
2/18/21        10
2/16/21        10
02/18/21       10
2/12/21         8
03/11/21        8
1/12            8
2/25/21         8
02/12/21        7
2/22/21         7
04/6/21         7
2/19/21         7
41/15/21        7
03/24           6
3/10/21         6
2/10/21         6
03/16/21        5
2/3/21          5
04/7            5
2/9/21          5
2/17/21         5
2/15/21         5
02/10/21        5
02/05/21  

## Export

### All cases

In [73]:
df_clean.to_json("processed/cases.json", orient="records", indent=2)

In [74]:
df_clean.to_csv("processed/cases.csv", index=False)

### California cases

In [75]:
df_clean[df_clean["location_of_arrest_state"] == "California"].to_json(
    "processed/california-cases.json", orient="records", indent=2
)

In [76]:
df_clean[df_clean["location_of_arrest_state"] == "California"].to_csv(
    "processed/california-cases.csv", index=False
)

### Charges by category

In [77]:
charges_by_category.to_csv("processed/charges_by_category.csv", index=False)

In [78]:
cases_by_defendant.to_csv("processed/cases_by_defendant.csv", index=False)

In [79]:
charges_by_charges.to_csv("processed/charges_by_charges.csv", index=False)