# Counting U.S. Capitol breach cases listed by DOJ via Wayback Machine

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [3]:
import altair as alt
import altair_latimes as lat
import numpy as np

In [4]:
# Register the LA Times Altair theme under a single name, then switch to it.
theme_name = "latimes"
alt.themes.register(theme_name, lat.theme)
alt.themes.enable(theme_name)

ThemeRegistry.enable('latimes')

In [5]:
# Widen the pandas display limits so full tables and cell contents are shown,
# and lift Altair's row cap on chart data.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
alt.data_transformers.disable_max_rows()
pd.set_option("display.max_colwidth", None)

### Headers

In [6]:
# HTTP request headers sent with each page fetch: a desktop Chrome user agent
# plus the Accept header such a browser would send.
headers = dict(
    [
        (
            "User-Agent",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/85.0.4183.102 Safari/537.36",
        ),
        (
            "Accept",
            ",".join(
                [
                    "text/html",
                    "application/xhtml+xml",
                    "application/xml;q=0.9",
                    "image/avif",
                    "image/webp",
                    "image/apng",
                    "*/*;q=0.8",
                    "application/signed-exchange;v=b3;q=0.9",
                ]
            ),
        ),
    ]
)

### Get the Wayback Machine feed of changes

In [7]:
# Wayback Machine CDX query listing every capture of the DOJ Capitol breach
# cases page, returned as JSON. Uses HTTPS, matching the scheme used for the
# per-capture page requests, so the capture index is not fetched in plaintext.
url = "https://web.archive.org/cdx/search/cdx?url=https://www.justice.gov/usao-dc/capitol-breach-cases&output=json"

In [8]:
# Load the capture index, then discard the row labelled 0 (the column names
# are assigned explicitly afterwards).
cdx_rows = pd.read_json(url, orient="records")
urls = cdx_rows.drop(index=[0])

In [9]:
# Name the seven columns of the capture index, in positional order.
cdx_fields = "urlkey timestamp original mimetype statuscode digest length".split()
urls.columns = cdx_fields

In [10]:
# Capture timestamps are 14-digit YYYYMMDDhhmmss values (see the sample
# capture URL used for scraping, e.g. 20210401183845). Parse them with an
# explicit format instead of relying on format inference, and cast to str
# first so a numeric column could never be read as an epoch offset.
urls["datetime"] = pd.to_datetime(
    urls["timestamp"].astype(str), format="%Y%m%d%H%M%S"
)

In [11]:
# Calendar day of each capture, used to group captures by day.
capture_days = urls["datetime"].dt.date
urls["date"] = capture_days

### Limit the update URLs to the last capture of each day

In [12]:
# Keep only successful (HTTP 200) captures *before* choosing the last capture
# of each day. Picking the last capture first and filtering on status later
# discards a whole day whenever its final capture happens to be a redirect or
# an error, even if an earlier capture that day was good.
successful_captures = urls[urls["statuscode"] == "200"]
daily_urls = successful_captures.sort_values(
    "datetime", ascending=True
).drop_duplicates("date", keep="last")

In [13]:
# Independent copy of the daily captures whose status code is "200".
is_ok = daily_urls["statuscode"].eq("200")
urls_success = daily_urls.loc[is_ok].copy()

In [14]:
# Number of daily captures that will be scraped.
urls_success.shape[0]

60

### Loop through the URLs and create a dataframe from each Wayback capture

In [15]:
# Sample url: 'https://web.archive.org/web/20210401183845/https://www.justice.gov/usao-dc/capitol-breach-cases'

In [16]:
from io import StringIO

# `content` is not used in this cell; it is kept so that any other cell
# referencing the name keeps working.
content = []
# One dataframe per capture, each tagged with the capture's timestamp.
pages = []

# Reuse a single HTTP session so connections to web.archive.org are pooled
# across the per-day requests.
with requests.Session() as session:
    for t, u in zip(urls_success.timestamp, urls_success.original):
        response_justice = session.get(
            "https://web.archive.org/web/" + t + "/" + u,
            headers=headers,
            # Without a timeout a stalled connection would hang the loop forever.
            timeout=60,
        )
        # Fail with a clear HTTP error rather than a confusing "no tables
        # found" parse error when the archive returns an error page.
        response_justice.raise_for_status()
        # Wrap the markup in StringIO: passing literal HTML strings to
        # read_html is deprecated in recent pandas releases.
        case_table = pd.read_html(
            StringIO(response_justice.text), attrs={"class": "tablesaw"}
        )[0]
        pages.append(case_table.assign(timestamp=t))

### Create one large dataframe from the list of per-capture dataframes, each carrying its capture timestamp

In [17]:
# Stack the per-capture tables into one dataframe. ignore_index=True gives the
# result a fresh 0..n-1 index; otherwise every capture's own 0..k row labels
# repeat, and label-based selection or alignment becomes ambiguous.
df = pd.concat(pages, ignore_index=True)

### Clean up dates

In [18]:
# The timestamp column holds the 14-digit YYYYMMDDhhmmss capture stamps added
# while scraping. Parse with an explicit format rather than format inference,
# casting to str so the values are never interpreted as numeric offsets.
df["datetime"] = pd.to_datetime(df["timestamp"].astype(str), format="%Y%m%d%H%M%S")

In [19]:
# Calendar day of the capture each row came from.
row_capture_days = df["datetime"].dt.date
df["date"] = row_capture_days

In [20]:
# The raw timestamp string is no longer needed once datetime/date exist;
# remove the column in place.
del df["timestamp"]

In [21]:
# Display the dtype of every column in the combined dataframe.
df.dtypes

Case Number                     object
Name                            object
Charge(s)                       object
Associated Documents            object
Location of Arrest              object
Case Status                     object
Entry Last Updated              object
Entry Last Updated*             object
datetime                datetime64[ns]
date                            object
dtype: object

### Clean up the headers

In [22]:
# Normalise column names: trim, lower-case, replace spaces, slashes and
# commas with underscores, drop colons and asterisks, and turn "(s)" into "s".
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace(":", "", regex=False)
    .str.replace("/", "_", regex=False)
    .str.replace(",", "_", regex=False)
    .str.replace("*", "", regex=False)
    .str.replace("(s)", "s", regex=False)
)

# Source headings that differ only by punctuation (e.g. "Entry Last Updated"
# and "Entry Last Updated*") collapse to the same name above, which would
# leave duplicate column labels. Coalesce each duplicate into its first
# occurrence (first non-null value wins) and drop the extra column.
# NOTE(review): assumes each row is populated in at most one of the
# duplicate columns — confirm against the scraped pages.
first_occurrence = ~df.columns.duplicated(keep="first")
deduplicated = df.loc[:, first_occurrence].copy()
for position, is_first in enumerate(first_occurrence):
    if is_first:
        continue
    name = df.columns[position]
    primary = deduplicated[name]
    # Go through NumPy arrays so nothing is aligned on the row index.
    fallback = df.iloc[:, position].to_numpy()
    deduplicated[name] = primary.where(primary.notna(), fallback).to_numpy()
df = deduplicated

In [23]:
# Write the combined table of all captures to disk, without the index column.
raw_cases_csv = "raw/cases-internet-archive-all.csv"
df.to_csv(raw_cases_csv, index=False)

---

### How many cases each day?

In [24]:
# Number of rows per capture datetime, as a two-column frame
# (datetime, case_number).
rows_per_capture = df.groupby("datetime")["case_number"].size()
cases_by_day = rows_per_capture.reset_index()

In [25]:
# Display the column dtypes of the per-capture counts.
cases_by_day.dtypes

datetime       datetime64[ns]
case_number             int64
dtype: object

In [33]:
# Horizontal axis: capture datetime, abbreviated month + day labels, no grid.
date_axis = alt.X(
    "datetime",
    title="",
    axis=alt.Axis(format="%b. %-d", tickCount=10, grid=False),
)

# Vertical axis: row count per capture, light grid lines, fixed 100-400 range.
count_axis = alt.Y(
    "case_number",
    title=" ",
    stack=None,
    axis=alt.Axis(
        tickSize=0,
        domainOpacity=0,
        tickCount=6,
        offset=4,
        gridWidth=0.6,
        gridColor="#dddddd",
    ),
    scale=alt.Scale(domain=(100, 400)),
)

# Step line of the counts over time.
step_line = (
    alt.Chart(cases_by_day)
    .mark_line(interpolate="step-before")
    .encode(x=date_axis, y=count_axis)
)

# Size, title and view/legend configuration.
chart = (
    step_line.properties(
        width=600,
        height=335,
        title="U.S. Capitol breach defendants, by day",
    )
    .configure_view(strokeOpacity=0)
    .configure_legend(orient="top", symbolType="square")
)
chart

In [34]:
# Export the chart as a PNG at twice the default scale.
png_path = "timeseries.png"
chart.save(png_path, scale_factor=2)