# White House visitor logs

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import datetime as dt
import glob

### Read the page to find links to the raw data

In [3]:
# Fetch the visitor-log disclosures page and parse its HTML.
# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops the run on an HTTP error instead of silently
# parsing an error page (which would yield zero CSV links downstream).
r = requests.get("https://www.whitehouse.gov/disclosures/visitor-logs/", timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

In [4]:
# Collect the href of every anchor that points at a CSV file.
# The dot is escaped so the pattern matches a literal ".csv"; unescaped,
# "." matches any character and would also accept hrefs such as "/foocsv/".
links = [a.get("href") for a in soup.find_all("a", href=re.compile(r"\.csv"))]

In [5]:
# Download each linked CSV into data/raw/csv using wget via the IPython shell escape.
#   -P       directory prefix the files are saved under
#   -N       timestamping: skip files whose local copy is already up to date
#   --quiet  suppress wget's progress output
# NOTE(review): {link} is interpolated into a shell command unquoted; the hrefs
# come from the scraped page, so this assumes they contain no shell metacharacters.
for link in links:
    !wget -P 'data/raw/csv' {link} --quiet -N

In [6]:
# Read every downloaded CSV and stack them into one frame, `src`.
path = "data/raw/csv/"

# glob returns matches in arbitrary filesystem order; sorting makes the row
# order of the combined frame (and of the exported CSVs) reproducible.
all_files = sorted(glob.glob(path + "*.csv"))

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# raise the same exception type with a message that points at the cause.
if not all_files:
    raise ValueError("No CSV files found in " + path + "; run the download cell first.")

# Build the list of per-file frames in one pass and concatenate once.
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
src = pd.concat(
    [pd.read_csv(csv_file, index_col=None, header=0) for csv_file in all_files],
    axis=0,
    ignore_index=True,
)

### Clean up the columns

In [7]:
# Normalise header casing so every column can be referenced by a lowercase name.
lowercase_names = src.columns.str.lower()
src.columns = lowercase_names

In [8]:
# Remove every column whose name contains "unnamed" (pandas labels header-less
# CSV columns "Unnamed: N"; the names were lower-cased in the previous cell).
# drop(columns=...) returns a fresh frame. Selecting with src[...] instead
# leaves a frame flagged as a copy of its parent, so the column assignments
# further down can raise SettingWithCopyWarning.
src = src.drop(columns=src.filter(regex="unnamed").columns)

In [9]:
# Preview the first rows to check the result of the column cleanup.
src.head()

Unnamed: 0,namelast,namefirst,namemid,uin,bdgnbr,access_type,toa,poa,tod,pod,...,terminal_suffix,visitee_namelast,visitee_namefirst,meeting_loc,meeting_room,caller_name_last,caller_name_first,caller_room,description,releasedate
0,SAWYER,LISA,C,U21530,176981.0,VA,3/1/21 0:00,B0401,3/1/2021,B04,...,GW,McEldowney,Nancy,OEOB,206,WOOLFOLK,GARLAND,,,6/29/2021
1,SMITH,JULIANNE,C,U21493,176848.0,VA,3/1/21 7:34,B0401,3/1/2021,B04,...,JD,Lang,Kimberly,WH,WW Room 13,DAVIS,JOVANNA,,,6/29/2021
2,GREENE,JEFFREY,E,U21500,176453.0,VA,3/1/21 8:48,B0401,,,...,JD,Carroll,James,OEOB,311,DAVIS,JOVANNA,,,6/29/2021
3,HENSON,JEFFREY,A,U21443,,VA,,,,,...,LS,Askins,Michael,NEOB,320,SCATLIFFE,LIONEL,,,6/29/2021
4,ROSSETTI,MICHAEL,N,U21494,176356.0,VA,3/1/21 8:42,B0401,3/1/2021,D03,...,AL,Labitzky,Andre,OEOB,21,LABITZKY,ANDRE,,,6/29/2021


### Real dates

In [10]:
# Names of the columns that are converted with pd.to_datetime further down.
date_cols = [
    "appt_made_date",
    "appt_start_date",
    "appt_end_date",
    "appt_cancel_date",
    "releasedate",
]

In [11]:
# Parse the date columns into real datetimes.
# Each value is still parsed on its own (Series.map calls pd.to_datetime per
# element, so mixed date formats within a column keep working and missing
# values become NaT). This replaces DataFrame.applymap, which is deprecated in
# recent pandas, and drops the redundant lambda wrapper around pd.to_datetime.
src[date_cols] = src[date_cols].apply(lambda col: col.map(pd.to_datetime))

### The dataframe has inconsistent casing. Let's make it all uppercase for grouping later.

In [12]:
# Columns whose values are upper-cased so that later grouping is not split by
# differences in casing. Declared once instead of repeating the list on both
# sides of the assignment.
text_cols = [
    "namelast",
    "namefirst",
    "namemid",
    "uin",
    "access_type",
    "toa",
    "poa",
    "tod",
    "pod",
    "last_updatedby",
    "post",
    "lastentrydate",
    "terminal_suffix",
    "visitee_namelast",
    "visitee_namefirst",
    "meeting_loc",
    "meeting_room",
    "caller_name_last",
    "caller_name_first",
    "description",
]

# astype(str) lets .str.upper() run on columns that are not purely strings,
# but it also turns missing values into the text "nan" (upper-cased: "NAN").
# .where(col.notna()) restores those cells to missing, so empty fields stay
# empty in the exports instead of becoming a literal "NAN" value.
src[text_cols] = src[text_cols].apply(
    lambda col: col.astype(str).str.upper().where(col.notna())
)

In [13]:
# Work on an independent (deep) copy so later edits to `df` do not touch `src`.
df = src.copy(deep=True)

---

## Exports

In [14]:
# Today's date as MM-DD-YYYY, used as the suffix of the exported file names.
today = f"{dt.date.today():%m-%d-%Y}"

In [15]:
# Write the processed frame twice: a dated snapshot and a stable "latest" file.
processed_targets = [
    "data/processed/log_" + today + ".csv",
    "data/processed/log_latest.csv",
]
for target in processed_targets:
    df.to_csv(target, index=False)

# Dated snapshot of `src` under data/raw.
# NOTE(review): `src` was cleaned in place by the cells above and `df` is a
# copy of it, so this "raw" file appears to hold the same contents as the
# processed one — confirm that is intended.
src.to_csv("data/raw/log_" + today + ".csv", index=False)