# White House visitor logs

### Import Python tools and Jupyter configuration

In [1]:
# Load the lab_black Jupyter extension for this kernel session
# (presumably auto-formats code cells with Black — confirm against the installed package).
%load_ext lab_black

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import datetime as dt
import glob

### Read the page to find links to the raw data

In [3]:
# Fetch the visitor-log disclosure page and parse its HTML.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# parsing an error page (which would produce an empty list of CSV links).
r = requests.get("https://www.whitehouse.gov/disclosures/visitor-logs/", timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

In [4]:
# Collect the href of every anchor whose link contains a literal ".csv".
# The dot is escaped: an unescaped "." is a regex wildcard, so ".csv" would
# also match hrefs such as ".../xcsv" and pull non-CSV links into the list.
links = [a.get("href") for a in soup.find_all("a", href=re.compile(r"\.csv"))]

In [5]:
for link in links:
    !wget -P 'data/raw/csv' {link} --quiet -N

In [6]:
# Read every downloaded CSV and stack them into a single dataframe.
path = "data/raw/csv/"

# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the concatenated row order reproducible from run to run.
all_files = sorted(glob.glob(path + "*.csv"))

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the download step produced nothing.
if not all_files:
    raise FileNotFoundError("No CSV files found in " + path)

# One dataframe per file; the first row of each file is the header.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

# Stack the frames row-wise and renumber the index 0..n-1.
src = pd.concat(li, axis=0, ignore_index=True)

### Clean up the columns

In [7]:
# Lower-case every column label; the following cells refer to columns by
# their lower-case names (e.g. "releasedate").
src.columns = [label.lower() for label in src.columns]

In [8]:
# Remove every column whose label contains "unnamed" (regex search on the label).
unnamed_cols = src.filter(regex="unnamed").columns
src = src.drop(columns=unnamed_cols)

In [9]:
# Newest release first. At this point "releasedate" still holds text such as
# "8/30/2021" (it is only parsed into datetimes further down), so a plain sort
# would be lexicographic ("9/1/2021" would rank above "12/1/2021"). Sorting on
# the parsed dates gives true chronological order without changing the stored
# values.
src = src.sort_values("releasedate", ascending=False, key=pd.to_datetime)

In [10]:
# Preview the first five rows of the combined, sorted frame.
src.head(n=5)

Unnamed: 0,namelast,namefirst,namemid,uin,bdgnbr,access_type,toa,poa,tod,pod,...,terminal_suffix,visitee_namelast,visitee_namefirst,meeting_loc,meeting_room,caller_name_last,caller_name_first,caller_room,description,releasedate
15183,POWELL,JEREMY,W,U25295,181014.0,VA,5/27/2021 8:24,K0101,,,...,CC,Cooper,Charles,NEOB,3202,COOPER,CHARLES,,,8/30/2021
14710,SAMSON,WILLIAM,E,U25068,,VA,,,,,...,ET,Teleky,Ed,WH,EW 206,TELEKY,EDWARD,,,8/30/2021
14723,WILCOXSON,MARTHA,L,U25238,,VA,,,,,...,DV,,POTUS,WH,EW206,VIA,DANIEL,,,8/30/2021
14722,WILCOXSON,ANTHONY,L,U25238,,VA,,,,,...,DV,,POTUS,WH,EW206,VIA,DANIEL,,,8/30/2021
14721,WHITMAN,RALPH,E,U25238,,VA,,,,,...,DV,,POTUS,WH,EW206,VIA,DANIEL,,,8/30/2021


### Real dates

In [11]:
# Names of the columns that are converted to datetimes in the next step:
# the four appointment dates plus the release date.
appt_date_cols = ["appt_made_date", "appt_start_date", "appt_end_date", "appt_cancel_date"]
date_cols = appt_date_cols + ["releasedate"]

In [12]:
# Parse the text date columns into real datetimes, one whole column at a time.
# pd.to_datetime is vectorised, so a column-wise apply avoids the per-cell
# Python call that applymap made (applymap is also deprecated in recent pandas).
# NOTE(review): on pandas >= 2 a column that mixes several date formats raises
# here; pass format="mixed" to pd.to_datetime if that turns out to be the case.
src[date_cols] = src[date_cols].apply(pd.to_datetime)

### The dataframe has inconsistent casing. Let's make it all upper for grouping later.

In [13]:
# Upper-case the text columns so the same value in different casing
# (e.g. "Cooper" vs "COOPER") ends up identical for grouping later.
text_cols = [
    "namelast",
    "namefirst",
    "namemid",
    "uin",
    "access_type",
    "toa",
    "poa",
    "tod",
    "pod",
    "last_updatedby",
    "post",
    "lastentrydate",
    "terminal_suffix",
    "visitee_namelast",
    "visitee_namefirst",
    "meeting_loc",
    "meeting_room",
    "caller_name_last",
    "caller_name_first",
    "description",
]


def _upper_keep_missing(column):
    """Return `column` as upper-cased text, leaving missing values missing.

    `astype(str)` on its own turns NaN into the literal string "nan" (then
    "NAN"), which is indistinguishable from a real value such as the surname
    "Nan" and is no longer treated as missing by pandas. Masking with the
    original non-null positions puts NaN back where the value was missing.
    """
    return column.astype(str).str.upper().where(column.notna())


src[text_cols] = src[text_cols].apply(_upper_keep_missing)

In [14]:
# Take an independent (deep) copy of the cleaned frame under the name df.
df = src.copy(deep=True)

---

## Exports

In [15]:
# Today's date formatted as MM-DD-YYYY; used to stamp the export file names.
today = format(dt.date.today(), "%m-%d-%Y")

In [16]:
# Write the dated snapshot, the rolling "latest" file, and the dated file under
# data/raw, all without the index column.
# NOTE(review): src has already been cleaned in place by the cells above, so
# the data/raw export appears to hold the same content as the processed one —
# confirm this is intended.
exports = [
    (df, "data/processed/log_" + today + ".csv"),
    (df, "data/processed/log_latest.csv"),
    (src, "data/raw/log_" + today + ".csv"),
]
for frame, destination in exports:
    frame.to_csv(destination, index=False)