# White House visitor logs

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import datetime as dt

In [3]:
import altair as alt
import altair_latimes as lat
import numpy as np

In [4]:
# Register the Los Angeles Times Altair theme under one name and switch to it.
theme_name = "latimes"
alt.themes.register(theme_name, lat.theme)
alt.themes.enable(theme_name)

ThemeRegistry.enable('latimes')

In [5]:
# Lift Altair's row cap so charts can be built from the full dataset.
alt.data_transformers.disable_max_rows()

# Show wide frames, long frames and long cell values without truncation.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

In [6]:
# Data source: https://www.whitehouse.gov/disclosures/visitor-logs/

### Read the page to find links to the raw data

In [7]:
# Fetch the disclosures page. The timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status() stops the run on an HTTP error
# instead of parsing an error page that would yield no download links.
r = requests.get("https://www.whitehouse.gov/disclosures/visitor-logs/", timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

### Grab the urls from download buttons

In [8]:
# Collect every <a> tag whose CSS class marks it as a download button.
page = soup.find_all("a", attrs={"class": "wp-block-button__link"})

In [9]:
# One {"url": ...} record per download button, in page order.
links = [{"url": anchor["href"]} for anchor in page]

### Read the CSV behind the first URL

In [10]:
# The first download button on the page is the file loaded here.
first_url = links[0]["url"]
src = pd.read_csv(first_url)

### Clean up the messy file

In [11]:
# Normalise header names to lower case so later cells can refer to them uniformly.
src.columns = src.columns.map(str.lower)

In [12]:
# pandas labels header-less CSV columns "Unnamed: N". Find them by prefix
# instead of listing each label: a hard-coded list raises KeyError as soon as
# the export has fewer filler columns, and silently keeps any extra ones.
# The match is case-insensitive so it works whether or not the headers have
# already been lower-cased.
unnamed_cols = [
    col for col in src.columns if str(col).lower().startswith("unnamed:")
]

# Rebind rather than mutate in place; re-running the cell is a harmless no-op
# because the list is simply empty the second time.
src = src.drop(columns=unnamed_cols)

### Convert date columns to real datetimes

In [14]:
# Columns to parse as datetimes: the four appointment timestamps
# (appt_made_date, appt_start_date, appt_end_date, appt_cancel_date)
# plus the release date.
date_cols = [
    *(f"appt_{stage}_date" for stage in ("made", "start", "end", "cancel")),
    "releasedate",
]

In [15]:
# Parse each date column with a single vectorised pd.to_datetime call.
# This replaces applymap + lambda, which invoked the parser once per cell and
# is deprecated since pandas 2.1 (renamed DataFrame.map).
# NOTE(review): on pandas >= 2.0 column-wise parsing infers one format per
# column; if a column turns out to mix formats, parse it with format="mixed".
src[date_cols] = src[date_cols].apply(pd.to_datetime)

### The dataframe has inconsistent casing. Let's make it all upper case for grouping later.

In [16]:
# Columns whose values are upper-cased so the same name or location always
# groups together. Declared once so the selection and the assignment below
# cannot drift apart.
# NOTE(review): toa, tod and lastentrydate look like timestamps by name —
# confirm whether they belong with the parsed date columns instead.
text_cols = [
    "namelast",
    "namefirst",
    "namemid",
    "uin",
    "access_type",
    "toa",
    "poa",
    "tod",
    "pod",
    "last_updatedby",
    "post",
    "lastentrydate",
    "terminal_suffix",
    "visitee_namelast",
    "visitee_namefirst",
    "meeting_loc",
    "meeting_room",
    "caller_name_last",
    "caller_name_first",
    "description",
]

# astype(str) is kept so any non-string values are stringified before
# upper-casing, but it also turns missing values into the text "nan" (then
# "NAN"). Mask those cells back to missing so the exported CSVs do not carry
# a fake "NAN" value that would be grouped as if it were a real name.
src[text_cols] = src[text_cols].apply(
    lambda col: col.astype(str).str.upper().where(col.notna())
)

In [17]:
# Work on an independent (deep) copy so later edits to df do not touch src.
df = src.copy(deep=True)

---

## Exports

In [18]:
# Date stamp (MM-DD-YYYY) used in the export file names.
today = f"{dt.date.today():%m-%d-%Y}"

In [20]:
# Write a dated snapshot of the cleaned data plus a rolling "latest" copy.
processed_dir = "data/processed"
df.to_csv(f"{processed_dir}/log_{today}.csv", index=False)
df.to_csv(f"{processed_dir}/log_latest.csv", index=False)

# NOTE(review): in the visible cells `src` is cleaned in place before `df` is
# copied from it, so this "raw" file holds the same cleaned data as `df`
# rather than the untouched download — confirm that is intended.
src.to_csv(f"data/raw/log_{today}.csv", index=False)