# Counting U.S. Capitol breach cases listed by DOJ via Wayback Machine

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [3]:
import altair as alt
import altair_latimes as lat
import numpy as np

In [4]:
alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")

ThemeRegistry.enable('latimes')

In [5]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()
pd.options.display.max_colwidth = None

### Headers

In [6]:
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
}

### Get the wayback machine feed of changes

In [7]:
url = "http://web.archive.org/cdx/search/cdx?url=https://www.justice.gov/usao-dc/capitol-breach-cases&output=json"

In [8]:
urls = pd.read_json(url, orient="records").drop([0], axis=0)

In [9]:
urls.columns = [
    "urlkey",
    "timestamp",
    "original",
    "mimetype",
    "statuscode",
    "digest",
    "length",
]

In [10]:
urls["datetime"] = pd.to_datetime(urls["timestamp"])

In [11]:
urls["date"] = urls["datetime"].dt.date

### Limit the update urls to the last one of each day

In [12]:
daily_urls = urls.sort_values(["datetime", "date"], ascending=True).drop_duplicates(
    "date", keep="last"
)

In [13]:
urls_success = daily_urls[daily_urls["statuscode"] == "200"].copy()

In [14]:
len(urls_success)

114

### Loop through urls and create Wayback create dataframes

In [15]:
# Sample url: 'https://web.archive.org/web/20210401183845/https://www.justice.gov/usao-dc/capitol-breach-cases'

In [16]:
content = []
pages = []

for t, u in zip(urls_success.timestamp, urls_success.original):
    response_justice = requests.get(
        "https://web.archive.org/web/" + t + "/" + u, headers=headers
    )
    #     justice_url = "https://web.archive.org/web/" + t + "/" + u
    pages.append(
        (pd.read_html(response_justice.text, attrs={"class": "tablesaw"})[0]).assign(
            timestamp=t
        )
    )

ConnectionError: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210314031714/https://www.justice.gov/usao-dc/capitol-breach-cases (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x1238453a0>: Failed to establish a new connection: [Errno 60] Operation timed out'))

### Create a large dataframe from a list of update date dataframes, and add a timestamp

In [None]:
df = pd.concat(pages)

In [None]:
df.drop(["Entry Last Updated*"], axis=1, inplace=True)

### Clean up dates

In [None]:
df["datetime"] = pd.to_datetime(df["timestamp"])

In [None]:
df["date"] = df["datetime"].dt.date

In [None]:
df.drop(["timestamp"], axis=1, inplace=True)

In [None]:
df.dtypes

### Clean up the headers

In [None]:
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace(":", "", regex=False)
    .str.replace("/", "_", regex=False)
    .str.replace(",", "_", regex=False)
    .str.replace("*", "", regex=False)
    .str.replace("(s)", "s", regex=False)
)

In [None]:
df.to_csv("raw/cases-internet-archive-all.csv", index=False)

---

### How many cases each day?

In [None]:
cases_by_day = df.groupby(["datetime"]).agg({"case_number": "size"}).reset_index()

In [None]:
cases_by_day.dtypes

In [None]:
area_chart = (
    alt.Chart(cases_by_day)
    .mark_area(opacity=0.2)
    .encode(
        x=alt.X(
            "datetime",
            title="",
            axis=alt.Axis(format="%b. %-d", tickCount=3, grid=False),
        ),
        y=alt.Y(
            "case_number",
            title=" ",
            stack=None,
            axis=alt.Axis(
                tickSize=0,
                domainOpacity=0,
                tickCount=6,
                offset=4,
                gridWidth=0.6,
                gridColor="#dddddd",
            ),
            scale=alt.Scale(domain=(100, 500)),
        ),
    )
)

line_chart = (
    alt.Chart(cases_by_day)
    .mark_line()
    .encode(
        x=alt.X(
            "datetime",
            title="",
            axis=alt.Axis(format="%b. %-d", tickCount=3, grid=False),
        ),
        y=alt.Y(
            "case_number",
            title=" ",
            stack=None,
            axis=alt.Axis(
                tickSize=0,
                domainOpacity=0,
                tickCount=6,
                offset=4,
                gridWidth=0.6,
                gridColor="#dddddd",
            ),
            scale=alt.Scale(domain=(100, 500)),
        ),
    )
)


chart_defendants_area = (
    (line_chart + area_chart)
    .properties(
        width=800,
        height=600,
        title="Cumulative U.S. Capitol breach defendants, by day",
    )
    .configure_view(strokeOpacity=0)
    .configure_legend(orient="top", symbolType="square")
    .configure_axis(labelFontSize=13)
)

chart_defendants_area

In [None]:
chart_defendants_area.save("capitol_defendants_timeseries.png", scale_factor=1)

---

In [None]:
df.head()

In [None]:
df.shape

---

### Parse and categorize each defendant's list of charges

In [None]:
df["charges_list"] = df["charges"].str.upper().str.split(";")

### Create a flat table from the one-to-many relationship between defendants and charges

In [None]:
df_long = df.explode("charges_list")

In [None]:
charges_by_day = df_long.groupby(["date"]).agg({"charges_list": "count"}).reset_index()

### Charge strings and categories

In [None]:
data_list = {
    "charge": [
        "INTERSTATE",
        "VIOLENCE",
        "PARADING, DEMONSTRATING, OR PICKETING",
        "PARADE, DEMONSTRATE, OR PICKET",
        "ENTERING",
        "DISORDERLY CONDUCT",
        "DISORDERLY AND DISRUPTIVE CONDUCT",
        "OBSTRUCTION OF AN OFFICIAL",
        "CIVIL DISORDER",
        "OBSTRUCTION OF AN OFFICIAL",
        "KNOWINGLY ENTERING OR REMAINING",
        "VIOLENT ENTRY",
        "DISORDERLY AND DISRUPTIVE CONDUCT",
        "REMAINING IN A RESTRICTED BUILDING",
        "VIOLENT ENTRY",
        "AIDING AND ABETTING",
        "PHYSICAL VIOLENCE",
        "CONSPIRACY",
        "ASSAULTING, RESISTING, OR IMPEDING",
        "RESTRICTED BUILDING OR GROUNDS",
        "ENTERING AND REMAINING IN A RESTRICTED BUILDING OR GROUNDS WITH A DEADLY OR DANGEROUS WEAPON",
        "ASSAULTING, RESISTING, OR IMPEDING CERTAIN OFFICERS USING A DANGEROUS WEAPON",
        "IMPEDING PASSAGE THROUGH THE CAPITOL GROUNDS OR BUILDINGS",
        "TAMPERING",
        "THEFT",
        "OBSTRUCTION",
        "DESTRUCTION",
        "DEMONSTRATING",
        "DISRUPTIVE CONDUCT",
        "PROPERTY",
        "ASSAULTING",
        "UNLAWFUL ENTRY",
        "IMPEDING",
        "PISTOL",
        "FIREARM",
        "OBSTRUCTING",
        "UNLAWFUL ACTIVITIES ON CAPITOL GROUNDS",
        "INTERFERED WITH A FEDERAL AGENT",
        "UNLAWFUL POSSESSION",
        "BODILY INJURY",
        "THREATENING A FEDERAL OFFICER",
        "ASSAULT",
        "PARADING",
        "DISORDERLLY CONDUCT",
        "DI$ORDERLY CONDUCT",
        "OBSTRUCT, IMPEDE, OR INTERFERE",
        "DISRUPTING THE ORDERLY CONDUCT",
        "AMMUNITION",
        "DI$ORDERLY",
        "DISORDELRY CONDUCT",
        "OFFICIAL PROCEEDING",
        "AIDING AND AIDING",
    ],
    "category": [
        "Other",
        "Violent entry",
        "Demonstrating in the Capitol",
        "Demonstrating in the Capitol",
        "Entering restricted area",
        "Disruptive/disorderly",
        "Disruptive/disorderly",
        "Obstructing a proceeding",
        "Disruptive/disorderly",
        "Obstructing a proceeding",
        "Entering restricted area",
        "Violent entry",
        "Disruptive/disorderly",
        "Entering restricted area",
        "Violent entry",
        "Aiding and abetting",
        "Physical violence",
        "Conspiracy",
        "Assaulting/resisting/impeding",
        "Entering restricted area",
        "Weapons charge",
        "Assault/resist/impede officer",
        "Impeding passage in the Capitol",
        "Tampering",
        "Theft",
        "Obstructing a proceeding",
        "Destroying property",
        "Demonstrating in the Capitol",
        "Disruptive/disorderly",
        "Property damage",
        "Assault/resist/impede officer",
        "Entering restricted area",
        "Assault/resist/impede officer",
        "Weapons charge",
        "Weapons charge",
        "Obstructing a proceeding",
        "Unlawful activities",
        "Assault/resist/impede officer",
        "Weapons charge",
        "Assault/resist/impede officer",
        "Assault/resist/impede officer",
        "Assault/resist/impede officer",
        "Demonstrating in the Capitol",
        "Disruptive/disorderly",
        "Disruptive/disorderly",
        "Assault/resist/impede officer",
        "Disruptive/disorderly",
        "Weapons charge",
        "Disruptive/disorderly",
        "Disruptive/disorderly",
        "Obstructing a proceeding",
        "Aiding and abetting",
    ],
}

### Loop and assign categories

In [None]:
for charge, category in zip(data_list["charge"], data_list["category"]):
    df_long.loc[
        df_long["charges_list"].fillna("").str.contains(f"{charge}", case=False),
        "category",
    ] = f"{category}"

In [None]:
df_long.head()

--- 

### Chart the number of charges over time

In [None]:
charges_by_day.date = pd.to_datetime(charges_by_day.date)

In [None]:
charges_by_day.dtypes

In [None]:
area_chart = (
    alt.Chart(charges_by_day)
    .mark_area(opacity=0.2)
    .encode(
        x=alt.X(
            "date:T",
            title="",
            axis=alt.Axis(format="%b. %-d", tickCount=6, grid=False),
        ),
        y=alt.Y(
            "charges_list:Q",
            title=" ",
            stack=None,
            axis=alt.Axis(
                tickSize=0,
                domainOpacity=0,
                tickCount=6,
                offset=4,
                gridWidth=0.6,
                gridColor="#dddddd",
            ),
            scale=alt.Scale(domain=(100, 1800)),
        ),
    )
)

line_chart = (
    alt.Chart(charges_by_day)
    .mark_line()
    .encode(
        x=alt.X(
            "date:T",
            title="",
            axis=alt.Axis(format="%b. %-d", tickCount=6, grid=False),
        ),
        y=alt.Y(
            "charges_list:Q",
            title=" ",
            stack=None,
            axis=alt.Axis(
                tickSize=0,
                domainOpacity=0,
                tickCount=6,
                offset=4,
                gridWidth=0.6,
                gridColor="#dddddd",
            ),
            scale=alt.Scale(domain=(100, 1800)),
        ),
    )
)


chart_charges_area = (
    (line_chart + area_chart)
    .properties(
        width=800,
        height=600,
        title="Cumulative U.S. Capitol breach charges, by day",
    )
    .configure_view(strokeOpacity=0)
    .configure_legend(orient="top", symbolType="square")
    .configure_axis(labelFontSize=13)
)

chart_charges_area

In [None]:
chart_charges_area.save("capitol_charges_timeseries.png", scale_factor=1)

### Group charge categories by day

In [None]:
categories_by_day = (
    df_long.groupby(["date", "category"]).agg({"name": "count"}).reset_index()
)

In [None]:
categories = (
    df_long[df_long["date"] == df_long["date"].max()]
    .groupby(["category"])
    .agg({"name": "count"})
    .reset_index()
).sort_values("name", ascending=False)

### Bar chart of most-recent count of charges by category

In [None]:
bar_categories = (
    alt.Chart(categories)
    .mark_bar()
    .encode(
        x=alt.X(
            "name",
            title="Number of charges",
            axis=alt.Axis(
                domainOpacity=0,
                tickSize=0,
                tickCount=6,
                offset=4,
                gridWidth=0.6,
                gridColor="#dddddd",
            ),
        ),
        y=alt.Y(
            "category",
            title=" ",
            sort="-x",
            axis=alt.Axis(
                tickSize=0,
                domainOpacity=0,
                tickCount=6,
                offset=4,
                gridWidth=0.6,
                gridColor="#dddddd",
                grid=False,
            ),
        ),
    )
)

text = bar_categories.mark_text(
    align="left", baseline="middle", dx=4, fontSize=13
).encode(text="name:Q")


chart_charges_bar = (
    (bar_categories + text)
    .properties(
        width=800,
        height=450,
        title="U.S. Capitol charges, by category",
    )
    .configure_view(strokeOpacity=0)
    .configure_legend(orient="top", symbolType="square")
    .configure_axis(labelFontSize=13)
)

chart_charges_bar

In [None]:
chart_charges_bar.save("capitol_categories_chart.png", scale_factor=1)

### Cumulative area chart with categories over time

In [None]:
categories_by_day.date = pd.to_datetime(categories_by_day.date)

In [None]:
categories_chart = (
    alt.Chart(categories_by_day)
    .mark_area(opacity=0.6)
    .encode(
        x=alt.X(
            "date",
            title="",
            axis=alt.Axis(format="%b. %-d", tickCount=10, grid=False),
        ),
        y=alt.Y(
            "name",
            title=" ",
            axis=alt.Axis(
                tickSize=0,
                domainOpacity=0,
                tickCount=6,
                offset=4,
                gridWidth=0.6,
                gridColor="#dddddd",
            ),
            scale=alt.Scale(domain=(0, 1800)),
        ),
        color=alt.Color("category", scale=alt.Scale(scheme="category20")),
    )
    .properties(
        width=800,
        height=600,
        title="Cumulative charge categories in U.S. Capitol breach, by day",
    )
    .configure_view(strokeOpacity=0)
    .configure_legend(orient="right", symbolType="square")
).configure_axis(labelFontSize=13)

categories_chart

In [None]:
categories_chart.save("capitol_charges_by_category_timeseries.png", scale_factor=1)