# Counting U.S. Capitol breach cases listed by DOJ via Wayback Machine

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
import altair as alt
import altair_latimes as lat
import numpy as np

In [4]:
# Register the theme supplied by altair_latimes under the name "latimes",
# then enable it so the charts built below pick it up.
alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")

ThemeRegistry.enable('latimes')

In [5]:
# Display limits for this notebook: up to 100 columns and 1,000 rows per frame.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
# Lift Altair's row cap so the full scraped dataset can be charted.
alt.data_transformers.disable_max_rows()
# Never truncate long cell text (charge descriptions are lengthy).
pd.set_option("display.max_colwidth", None)

### Headers

In [6]:
# Browser-like request headers sent with every snapshot request.
# The long values are split across lines with implicit string concatenation.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.102 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.9"
    ),
}

### Get the Wayback Machine feed of captures

In [7]:
# CDX search query: every Wayback Machine capture of the DOJ
# Capitol breach cases page, returned as JSON.
url = (
    "http://web.archive.org/cdx/search/cdx"
    "?url=https://www.justice.gov/usao-dc/capitol-breach-cases"
    "&output=json"
)

In [8]:
# Load the capture feed and discard row 0; column names are assigned by hand
# in the next cell (row 0 is presumably the feed's own header row).
urls = pd.read_json(url, orient="records").drop(index=[0])

In [9]:
# Label the seven feed columns by position.
urls.columns = "urlkey timestamp original mimetype statuscode digest length".split()

In [10]:
# Wayback timestamps are compact 14-digit strings (YYYYMMDDhhmmss, e.g.
# 20210401183845). Parse them with an explicit format instead of relying on
# pandas' format inference, so a misread can never shift the dates silently.
urls["datetime"] = pd.to_datetime(urls["timestamp"], format="%Y%m%d%H%M%S")

In [11]:
# Calendar date of each capture, used below to pick one capture per day.
urls = urls.assign(date=urls["datetime"].dt.date)

### Limit the update urls to the last one of each day

In [12]:
# Order captures chronologically, then keep only the final capture of each date.
daily_urls = (
    urls.sort_values(by=["datetime", "date"])
    .drop_duplicates(subset="date", keep="last")
)

In [13]:
# Keep successful (HTTP 200) captures *before* choosing the last capture of
# each day. Filtering after the per-day dedup would discard a whole day whenever
# its final capture was a redirect or error, even if an earlier capture that
# day succeeded.
urls_success = (
    urls[urls["statuscode"] == "200"]
    .sort_values("datetime")
    .drop_duplicates("date", keep="last")
)

In [14]:
len(urls_success)

65

### Loop through the snapshot URLs and create a dataframe from each Wayback capture

In [15]:
# Sample url: 'https://web.archive.org/web/20210401183845/https://www.justice.gov/usao-dc/capitol-breach-cases'

In [16]:
content = []  # not referenced by any visible cell; kept in case a later cell expects it
pages = []

for t, u in zip(urls_success.timestamp, urls_success.original):
    # Snapshot URL pattern: https://web.archive.org/web/<timestamp>/<original url>
    response_justice = requests.get(
        "https://web.archive.org/web/" + t + "/" + u, headers=headers, timeout=60
    )
    # Fail on an HTTP error here rather than handing an error page to the parser,
    # which would only report that no matching table was found.
    response_justice.raise_for_status()
    # Pass the markup as a file-like object: newer pandas releases deprecate
    # giving read_html a literal HTML string.
    snapshot_table = pd.read_html(
        StringIO(response_justice.text), attrs={"class": "tablesaw"}
    )[0]
    # Tag every row with the capture timestamp it was scraped from.
    pages.append(snapshot_table.assign(timestamp=t))

### Create one large dataframe from the list of timestamped snapshot dataframes

In [17]:
# Stack the per-snapshot tables into one frame; each row carries the
# timestamp of the capture it was scraped from.
df = pd.concat(pages)

In [18]:
# Drop the asterisked variant of the "Entry Last Updated" header.
# errors="ignore" keeps this cell re-runnable and tolerant of scrapes in which
# no snapshot uses that header; the in-place drop raised KeyError in both cases.
df = df.drop(columns=["Entry Last Updated*"], errors="ignore")

### Clean up dates

In [19]:
# Capture timestamps are 14-digit strings (YYYYMMDDhhmmss); parse them with an
# explicit format rather than depending on pandas' format inference.
df["datetime"] = pd.to_datetime(df["timestamp"], format="%Y%m%d%H%M%S")

In [20]:
# Calendar date of the capture each row came from.
df = df.assign(date=df["datetime"].dt.date)

In [21]:
# The raw timestamp string is redundant now that datetime and date exist.
df = df.drop(columns=["timestamp"])

In [22]:
df.dtypes

Case Number                     object
Name                            object
Charge(s)                       object
Associated Documents            object
Location of Arrest              object
Case Status                     object
Entry Last Updated              object
datetime                datetime64[ns]
date                            object
dtype: object

### Clean up the headers

In [23]:
# Normalise header text: trim, lower-case, then apply literal replacements
# in this fixed order.
header_replacements = [
    (" ", "_"),
    (":", ""),
    ("/", "_"),
    (",", "_"),
    ("*", ""),
    ("(s)", "s"),
]

clean_headers = df.columns.str.strip().str.lower()
for old_text, new_text in header_replacements:
    clean_headers = clean_headers.str.replace(old_text, new_text, regex=False)

df.columns = clean_headers

In [24]:
# Write the combined snapshot table to disk without the index column.
# NOTE(review): assumes the raw/ directory already exists — confirm.
df.to_csv("raw/cases-internet-archive-all.csv", index=False)

---

### How many cases each day?

In [25]:
# Number of rows (listed cases) in each capture, keyed by capture datetime.
cases_by_day = (
    df.groupby(["datetime"])
    .agg({"case_number": "size"})
    .reset_index()
)

In [26]:
cases_by_day.dtypes

datetime       datetime64[ns]
case_number             int64
dtype: object

In [27]:
# The line and the shaded area share identical axes, so the encodings are
# described once and reused by both layers.
defendants_x = alt.X(
    "datetime",
    title="",
    axis=alt.Axis(format="%b. %-d", tickCount=10, grid=False),
)
defendants_y = alt.Y(
    "case_number",
    title=" ",
    stack=None,
    axis=alt.Axis(
        tickSize=0,
        domainOpacity=0,
        tickCount=6,
        offset=4,
        gridWidth=0.6,
        gridColor="#dddddd",
    ),
    scale=alt.Scale(domain=(100, 400)),
)
defendants_base = alt.Chart(cases_by_day).encode(x=defendants_x, y=defendants_y)

area_chart = defendants_base.mark_area(opacity=0.2)
line_chart = defendants_base.mark_line()

# Layer the line over the translucent area and apply the shared styling.
defendants_layers = line_chart + area_chart
chart_defendants_area = (
    defendants_layers.properties(
        width=800,
        height=600,
        title="U.S. Capitol breach defendants, by day",
    )
    .configure_view(strokeOpacity=0)
    .configure_legend(orient="top", symbolType="square")
    .configure_axis(labelFontSize=13)
)

chart_defendants_area

In [28]:
# Export the defendants time series as a PNG, rendered with scale_factor=2.
chart_defendants_area.save("capitol_defendants_timeseries.png", scale_factor=2)

---

In [29]:
df.head()

Unnamed: 0,case_number,name,charges,associated_documents,location_of_arrest,case_status,entry_last_updated,datetime,date
0,1:21-mj-186,"SCHWARTZ, Peter","Forcibly Assaulting, Resisting, or Impeding Certain Officers or Employees; Knowingly Entering or Remaining in any Restricted Building or Grounds Without Lawful Authority; Violent Entry and Disorderly Conduct on Capitol Grounds","Schwartz, Peter - Complaint & Statement of Facts","Pennsylvania, Western District",Arrested 2/4/21 in the Western District of Pennsylvania,"February 4, 2021",2021-02-04 22:22:04,2021-02-04
1,1:21-mj-178,"GRACE, Jeffrey",Knowingly Entering or Remaining in any Restricted Building or Grounds Without Lawful Authority,"Grace, Jeffrey - Complaint & Statement of Facts",Oregon,Arrested 2/4/21 in Oregon,"February 4, 2021",2021-02-04 22:22:04,2021-02-04
2,1:21-mj-179,"SCHWAB, Katherine Staveley (aka, Katie)",Knowingly Entering or Remaining in any Restricted Building or Grounds Without Lawful Authority; Disorderly Conduct on Capitol Grounds,"Schwab, Katherine - Complaint & Statement of Facts","Texas, Northern District",Arrested 2/1/21 in the Northern District of Texas,"February 4, 2021",2021-02-04 22:22:04,2021-02-04
3,1:21-mj-181,"Hyland, Jason Lee",Knowingly Entering or Remaining in any Restricted Building or Grounds Without Lawful Authority; Disorderly Conduct on Capitol Grounds,"Hyland, Jason - Complaint & Statement of Facts","Texas, Eastern District",Arrested 2/2/21 in the Eastern District of Texas,"February 4, 2021",2021-02-04 22:22:04,2021-02-04
4,1:21-cr-53,"LANG, Edward Jacob","Civil Disorder; Assaulting, Resisting, or Impeding Certain Officers; Assaultin, Resisting, or Impeding Certain Officers Using a Dangerous Weapon; Obstruction of an Official Proceeding; Aiding and Abetting; Disorderly and Disruptive Conduct in a Restricted Building or Grounds with a Deadly or Dangerous Weapon; Disorderly Conduct in a Capitol Building; Act of Physical Violence in the Capitol Grounds or Buildings",Lang - Affidavit in Support Lang - Complaint Lang - Indictment,New York,Arrested on 1/16/21 in New York.,"February 4, 2021",2021-02-04 22:22:04,2021-02-04


In [30]:
df.shape

(16580, 9)

---

### Parse and categorize each defendant's list of charges

In [31]:
# Upper-case the raw charges text, then break it into a list on semicolons.
upper_charges = df["charges"].str.upper()
df["charges_list"] = upper_charges.str.split(";")

### Create a flat table from the one-to-many relationship between defendants and charges

In [32]:
# One row per (defendant snapshot, charge): unpack each charges list into rows.
df_long = df.explode(column="charges_list")

In [33]:
# Count the non-null charge entries recorded on each capture date.
charges_by_day = (
    df_long.groupby(["date"])
    .agg({"charges_list": "count"})
    .reset_index()
)

### Charge strings and categories

In [34]:
# (search string, category) pairs, in the order they are applied.
# Order matters to the assignment loop that consumes this table: when several
# search strings match the same charge, the pair listed last wins.
charge_category_pairs = [
    ("INTERSTATE", "Other"),
    ("VIOLENCE", "Violent entry"),
    ("PARADING, DEMONSTRATING, OR PICKETING", "Demonstrating in the Capitol"),
    ("PARADE, DEMONSTRATE, OR PICKET", "Demonstrating in the Capitol"),
    ("ENTERING", "Entering restricted area"),
    ("DISORDERLY CONDUCT", "Disruptive/disorderly"),
    ("DISORDERLY AND DISRUPTIVE CONDUCT", "Disruptive/disorderly"),
    ("OBSTRUCTION OF AN OFFICIAL", "Obstructing a proceeding"),
    ("CIVIL DISORDER", "Disruptive/disorderly"),
    ("OBSTRUCTION OF AN OFFICIAL", "Obstructing a proceeding"),
    ("KNOWINGLY ENTERING OR REMAINING", "Entering restricted area"),
    ("VIOLENT ENTRY", "Violent entry"),
    ("DISORDERLY AND DISRUPTIVE CONDUCT", "Disruptive/disorderly"),
    ("REMAINING IN A RESTRICTED BUILDING", "Entering restricted area"),
    ("VIOLENT ENTRY", "Violent entry"),
    ("AIDING AND ABETTING", "Aiding and abetting"),
    ("PHYSICAL VIOLENCE", "Physical violence"),
    ("CONSPIRACY", "Conspiracy"),
    ("ASSAULTING, RESISTING, OR IMPEDING", "Assaulting/resisting/impeding"),
    ("RESTRICTED BUILDING OR GROUNDS", "Entering restricted area"),
    (
        "ENTERING AND REMAINING IN A RESTRICTED BUILDING OR GROUNDS WITH A DEADLY OR DANGEROUS WEAPON",
        "Weapons charge",
    ),
    (
        "ASSAULTING, RESISTING, OR IMPEDING CERTAIN OFFICERS USING A DANGEROUS WEAPON",
        "Assault/resist/impede officer",
    ),
    (
        "IMPEDING PASSAGE THROUGH THE CAPITOL GROUNDS OR BUILDINGS",
        "Impeding passage in the Capitol",
    ),
    ("TAMPERING", "Tampering"),
    ("THEFT", "Theft"),
    ("OBSTRUCTION", "Obstructing a proceeding"),
    ("DESTRUCTION", "Destroying property"),
    ("DEMONSTRATING", "Demonstrating in the Capitol"),
    ("DISRUPTIVE CONDUCT", "Disruptive/disorderly"),
    ("PROPERTY", "Property damage"),
    ("ASSAULTING", "Assault/resist/impede officer"),
    ("UNLAWFUL ENTRY", "Entering restricted area"),
    ("IMPEDING", "Assault/resist/impede officer"),
    ("PISTOL", "Weapons charge"),
    ("FIREARM", "Weapons charge"),
    ("OBSTRUCTING", "Obstructing a proceeding"),
    ("UNLAWFUL ACTIVITIES ON CAPITOL GROUNDS", "Unlawful activities"),
    ("INTERFERED WITH A FEDERAL AGENT", "Assault/resist/impede officer"),
    ("UNLAWFUL POSSESSION", "Weapons charge"),
    ("BODILY INJURY", "Assault/resist/impede officer"),
    ("THREATENING A FEDERAL OFFICER", "Assault/resist/impede officer"),
    ("ASSAULT", "Assault/resist/impede officer"),
    ("PARADING", "Demonstrating in the Capitol"),
    ("DISORDERLLY CONDUCT", "Disruptive/disorderly"),
    ("DI$ORDERLY CONDUCT", "Disruptive/disorderly"),
    ("OBSTRUCT, IMPEDE, OR INTERFERE", "Assault/resist/impede officer"),
    ("DISRUPTING THE ORDERLY CONDUCT", "Disruptive/disorderly"),
    ("AMMUNITION", "Weapons charge"),
    ("DI$ORDERLY", "Disruptive/disorderly"),
    ("DISORDELRY CONDUCT", "Disruptive/disorderly"),
    ("OFFICIAL PROCEEDING", "Obstructing a proceeding"),
    ("AIDING AND AIDING", "Aiding and abetting"),
]

# Expose the pairs as two parallel lists under the keys "charge" and "category".
data_list = {
    "charge": [search_text for search_text, _ in charge_category_pairs],
    "category": [label for _, label in charge_category_pairs],
}

### Loop and assign categories

In [35]:
# Tag each exploded charge with a category by case-insensitive substring match.
#
# regex=False is essential: str.contains treats its pattern as a regular
# expression by default, so search strings containing "$" (e.g. "DI$ORDERLY")
# were read as end-of-string anchors and could never match.
#
# Pairs are applied in list order, so when several search strings match the same
# charge the last matching pair decides the category. Charges that match nothing
# are left without a category (NaN).
charge_text = df_long["charges_list"].fillna("")  # loop-invariant; computed once

for charge, category in zip(data_list["charge"], data_list["category"]):
    df_long.loc[
        charge_text.str.contains(charge, case=False, regex=False),
        "category",
    ] = category

In [36]:
df_long.head()

Unnamed: 0,case_number,name,charges,associated_documents,location_of_arrest,case_status,entry_last_updated,datetime,date,charges_list,category
0,1:21-mj-186,"SCHWARTZ, Peter","Forcibly Assaulting, Resisting, or Impeding Certain Officers or Employees; Knowingly Entering or Remaining in any Restricted Building or Grounds Without Lawful Authority; Violent Entry and Disorderly Conduct on Capitol Grounds","Schwartz, Peter - Complaint & Statement of Facts","Pennsylvania, Western District",Arrested 2/4/21 in the Western District of Pennsylvania,"February 4, 2021",2021-02-04 22:22:04,2021-02-04,"FORCIBLY ASSAULTING, RESISTING, OR IMPEDING CERTAIN OFFICERS OR EMPLOYEES",Assault/resist/impede officer
0,1:21-mj-186,"SCHWARTZ, Peter","Forcibly Assaulting, Resisting, or Impeding Certain Officers or Employees; Knowingly Entering or Remaining in any Restricted Building or Grounds Without Lawful Authority; Violent Entry and Disorderly Conduct on Capitol Grounds","Schwartz, Peter - Complaint & Statement of Facts","Pennsylvania, Western District",Arrested 2/4/21 in the Western District of Pennsylvania,"February 4, 2021",2021-02-04 22:22:04,2021-02-04,KNOWINGLY ENTERING OR REMAINING IN ANY RESTRICTED BUILDING OR GROUNDS WITHOUT LAWFUL AUTHORITY,Entering restricted area
0,1:21-mj-186,"SCHWARTZ, Peter","Forcibly Assaulting, Resisting, or Impeding Certain Officers or Employees; Knowingly Entering or Remaining in any Restricted Building or Grounds Without Lawful Authority; Violent Entry and Disorderly Conduct on Capitol Grounds","Schwartz, Peter - Complaint & Statement of Facts","Pennsylvania, Western District",Arrested 2/4/21 in the Western District of Pennsylvania,"February 4, 2021",2021-02-04 22:22:04,2021-02-04,VIOLENT ENTRY AND DISORDERLY CONDUCT ON CAPITOL GROUNDS,Violent entry
1,1:21-mj-178,"GRACE, Jeffrey",Knowingly Entering or Remaining in any Restricted Building or Grounds Without Lawful Authority,"Grace, Jeffrey - Complaint & Statement of Facts",Oregon,Arrested 2/4/21 in Oregon,"February 4, 2021",2021-02-04 22:22:04,2021-02-04,KNOWINGLY ENTERING OR REMAINING IN ANY RESTRICTED BUILDING OR GROUNDS WITHOUT LAWFUL AUTHORITY,Entering restricted area
2,1:21-mj-179,"SCHWAB, Katherine Staveley (aka, Katie)",Knowingly Entering or Remaining in any Restricted Building or Grounds Without Lawful Authority; Disorderly Conduct on Capitol Grounds,"Schwab, Katherine - Complaint & Statement of Facts","Texas, Northern District",Arrested 2/1/21 in the Northern District of Texas,"February 4, 2021",2021-02-04 22:22:04,2021-02-04,KNOWINGLY ENTERING OR REMAINING IN ANY RESTRICTED BUILDING OR GROUNDS WITHOUT LAWFUL AUTHORITY,Entering restricted area


--- 

### Chart the number of charges over time

In [37]:
# Convert the date column to datetime64 for charting on a temporal axis.
charges_by_day["date"] = pd.to_datetime(charges_by_day["date"])

In [38]:
charges_by_day.dtypes

date            datetime64[ns]
charges_list             int64
dtype: object

In [39]:
# Encodings shared by the line and the shaded area. Defining them once keeps
# the two layers on the same y-domain: the area layer previously carried the
# (100, 400) domain copied from the defendants chart while the line layer
# used (100, 1600).
charges_x = alt.X(
    "date:T",
    title="",
    axis=alt.Axis(format="%b. %-d", tickCount=10, grid=False),
)
charges_y = alt.Y(
    "charges_list:Q",
    title=" ",
    stack=None,
    axis=alt.Axis(
        tickSize=0,
        domainOpacity=0,
        tickCount=6,
        offset=4,
        gridWidth=0.6,
        gridColor="#dddddd",
    ),
    scale=alt.Scale(domain=(100, 1600)),
)
charges_base = alt.Chart(charges_by_day).encode(x=charges_x, y=charges_y)

area_chart = charges_base.mark_area(opacity=0.2)
line_chart = charges_base.mark_line()

# Layer the line over the translucent area. The title now names charges; it
# was copy-pasted from the defendants chart.
chart_charges_area = (
    (line_chart + area_chart)
    .properties(
        width=800,
        height=600,
        title="U.S. Capitol breach charges, by day",
    )
    .configure_view(strokeOpacity=0)
    .configure_legend(orient="top", symbolType="square")
    .configure_axis(labelFontSize=13)
)

chart_charges_area

In [40]:
# Export the charges time series as a PNG, rendered with scale_factor=2.
chart_charges_area.save("capitol_charges_timeseries.png", scale_factor=2)

### Group charge categories by day

In [41]:
# Count the rows with a non-null name for every (capture date, category) pair.
by_date_and_category = df_long.groupby(["date", "category"])
categories_by_day = by_date_and_category.agg({"name": "count"}).reset_index()

In [42]:
# Category totals for the most recent capture date only, largest first.
latest_date = df_long["date"].max()
latest_rows = df_long[df_long["date"] == latest_date]

categories = (
    latest_rows.groupby(["category"])
    .agg({"name": "count"})
    .reset_index()
    .sort_values("name", ascending=False)
)

### Bar chart of most-recent count of charges by category

In [44]:
# Axis styling for the count (x) and category (y) axes.
count_axis = alt.Axis(
    domainOpacity=0,
    tickSize=0,
    tickCount=6,
    offset=4,
    gridWidth=0.6,
    gridColor="#dddddd",
)
category_axis = alt.Axis(
    tickSize=0,
    domainOpacity=0,
    tickCount=6,
    offset=4,
    gridWidth=0.6,
    gridColor="#dddddd",
    grid=False,
)

# Horizontal bars, one per category, ordered by descending count.
bar_categories = (
    alt.Chart(categories)
    .mark_bar()
    .encode(
        x=alt.X("name", title="Number of charges", axis=count_axis),
        y=alt.Y("category", title=" ", sort="-x", axis=category_axis),
    )
)

# Numeric label placed just past the end of each bar.
text = bar_categories.mark_text(
    align="left", baseline="middle", dx=4, fontSize=13
).encode(text="name:Q")

bars_with_labels = bar_categories + text
chart_charges_bar = (
    bars_with_labels.properties(
        width=800,
        height=450,
        title="U.S. Capitol charges, by category",
    )
    .configure_view(strokeOpacity=0)
    .configure_legend(orient="top", symbolType="square")
    .configure_axis(labelFontSize=13)
)

chart_charges_bar

In [45]:
# Export the category bar chart as a PNG, rendered with scale_factor=2.
chart_charges_bar.save("capitol_categories_chart.png", scale_factor=2)

### Cumulative area chart with categories over time

In [46]:
# Convert the date column to datetime64 for charting on a temporal axis.
categories_by_day["date"] = pd.to_datetime(categories_by_day["date"])

In [47]:
# Area chart of charge counts per category over time, coloured by category.
categories_x = alt.X(
    "date",
    title="",
    axis=alt.Axis(format="%b. %-d", tickCount=10, grid=False),
)
categories_y = alt.Y(
    "name",
    title=" ",
    axis=alt.Axis(
        tickSize=0,
        domainOpacity=0,
        tickCount=6,
        offset=4,
        gridWidth=0.6,
        gridColor="#dddddd",
    ),
    scale=alt.Scale(domain=(0, 1500)),
)
categories_color = alt.Color("category", scale=alt.Scale(scheme="category20"))

categories_chart = (
    alt.Chart(categories_by_day)
    .mark_area(opacity=0.6)
    .encode(x=categories_x, y=categories_y, color=categories_color)
    .properties(
        width=800,
        height=600,
        title="Cumulative charge categories in U.S. Capitol breach, by day",
    )
    .configure_view(strokeOpacity=0)
    .configure_legend(orient="right", symbolType="square")
    .configure_axis(labelFontSize=13)
)

categories_chart

In [48]:
# Export the per-category time series as a PNG, rendered with scale_factor=2.
categories_chart.save("capitol_charges_by_category_timeseries.png", scale_factor=2)