# Parsing VP Kamala Harris' schedules from White House emails

In [123]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [124]:
import pandas as pd
import altair as alt
import altair_latimes as lat
import glob
import os
import re

In [125]:
alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")
pd.options.display.max_columns = 50
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()
pd.set_option("max_colwidth", None)

### Grab all the email files

In [126]:
path = "schedules"
email_files = glob.glob(os.path.join(path, "*.eml"))

### Loop over the list of files

In [159]:
emails = []

for f in email_files:
    emails.append(pd.read_html(f, header=None))

### Grab only the body of the message

In [189]:
email_texts = []

for t in emails:
    s = str(t)
    result = re.search("FOR IMMEDIATE RELEASE (.*)###", s)
    email_texts.append(result.group(1))

### Read that raw text into a frame we can parse

In [190]:
df = pd.DataFrame(email_texts)

In [191]:
df.rename(columns={0: "email_text"}, inplace=True)

In [192]:
df["text_to_parse"] = df["email_text"]

### Does the travel involve travel or the VP's husband?

In [193]:
df["involve_travel"] = df.email_text.str.contains("will travel")

In [194]:
df["involve_los_angeles"] = df.email_text.str.contains("Angeles")

In [195]:
df["involve_oakland"] = df.email_text.str.contains("Oakland")

In [196]:
df["involve_california"] = df.email_text.str.contains("California")

In [197]:
df["involve_second_gentleman"] = df.email_text.str.contains("SECOND GENTLEMAN")

### Parsing characters and phrases for splitting the narrative into event-related columns

In [198]:
df.text_to_parse = (
    df.text_to_parse.str.replace("PRES= IDENT", "PRESIDENT", regex=False)
    .str.replace("</=", "", regex=False)
    .str.replace("= span>", "", regex=False)
    .str.replace("=E2=80=99", "'", regex=False)
    .str.replace("= ", "", regex=False)
    .str.replace("=92", "'", regex=False)
    .str.replace("=C3=A1", "á", regex=False)
    .str.replace(
        "DAILY GUIDANCE FOR THE VICE PRESIDENT AND SECOND GENTLEMAN", "|", regex=False
    )
    .str.replace("DAILY GUIDANCE FOR THE SECOND GENTLEMAN", "|", regex=False)
    .str.replace("DAILY GUIDANCE FOR THE SECOND GENTLEMAN ", "|", regex=False)
    .str.replace("DAILY GUIDANCE FOR THE VICE PRESIDENT", "|", regex=False)
    .str.replace(" At ", "|", regex=False)
    .str.replace(" On ", "|", regex=False)
    .str.replace(", 2021 ", ", 2021|", regex=False)
    .str.replace("| |", "|", regex=False)
    .str.replace("||", "|", regex=False)
)

### Split the events paragraph into columns

In [199]:
df[
    [
        "email_date",
        "events_date",
        "event1",
        "event2",
        "event3",
        "event4",
        "event5",
        "event6",
        "event7",
    ]
] = df.text_to_parse.str.split("|", expand=True)

### How many of the event columns actually have events?

In [200]:
cols = ["event1", "event2", "event3", "event4", "event5", "event6", "event7"]

In [201]:
df["daily_events_count"] = df[cols].count(axis=1)

### Cleaning up dates

In [202]:
df["events_date"] = (
    df["events_date"]
    .str.replace("FOR", "", regex=False)
    .str.strip("")
    .str.title()
    .str.replace(", 2021", "", regex=False)
)

In [203]:
df.head(1)

Unnamed: 0,email_text,text_to_parse,involve_travel,involve_los_angeles,involve_oakland,involve_california,involve_second_gentleman,email_date,events_date,event1,event2,event3,event4,event5,event6,event7,daily_events_count
0,"April 13, 2021 DAILY GUIDANCE FOR THE VICE PRESIDENT FOR WEDNESDAY, APRIL 14 At 10:00AM EDT, the Vice President will convene a virtual roundtable of exp= erts on the Northern Triangle who will offer their assessment and perspecti= ves on the region. There will be a pool spray at the top of this meeting in= the Vice President=E2=80=99s Ceremonial Office.","April 13, 2021| FOR WEDNESDAY, APRIL 14 |10:00AM EDT, the Vice President will convene a virtual roundtable of experts on the Northern Triangle who will offer their assessment and perspectives on the region. There will be a pool spray at the top of this meeting in the Vice President's Ceremonial Office.",False,False,False,False,False,"April 13, 2021","Wednesday, April 14","10:00AM EDT, the Vice President will convene a virtual roundtable of experts on the Northern Triangle who will offer their assessment and perspectives on the region. There will be a pool spray at the top of this meeting in the Vice President's Ceremonial Office.",,,,,,,1


In [204]:
df[["day_of_week", "events_date"]] = df["events_date"].str.split(", ", expand=True)

ValueError: Columns must be same length as key

In [205]:
df["day_of_week"] = df["day_of_week"].str.strip("")

KeyError: 'day_of_week'

In [None]:
df["email_date"] = pd.to_datetime(df["email_date"])

In [None]:
df["events_date"] = pd.to_datetime(df["events_date"] + str(", 2021"))

In [None]:
len(df)

In [None]:
df = df.drop_duplicates()

In [None]:
df.sort_values("email_date", ascending=False).head(1)

---

### How many events involve travel? 

In [None]:
travel = df[df["involve_travel"] == True].copy()

In [None]:
len(travel)

### Rudementary place parser

In [None]:
df["place_travel"] = df["text_to_parse"].str.extract("will travel to ([^.|,]*)")

---

### Get our dataframe in order

In [None]:
df = df[
    [
        "email_date",
        "events_date",
        "daily_events_count",
        "day_of_week",
        "involve_travel",
        "involve_california",
        "involve_los_angeles",
        "involve_oakland",
        "place_travel",
        "involve_second_gentleman",
        "event1",
        "event2",
        "event3",
        "event4",
        "event5",
        "event6",
        "event7",
        "email_text",
        "text_to_parse",
    ]
]

---

## Toplines

In [None]:
california = len(df[df["involve_california"] == True])

In [None]:
la = len(df[df["involve_los_angeles"] == True])

In [None]:
oakland = len(df[df["involve_oakland"] == True])

In [None]:
travel = len(df[df["involve_travel"] == True])

In [None]:
print(
    "\n\nVice President Harris has had at least "
    + str(travel)
    + " events related to travel on her public schedule released by the White House. \n\nAt least "
    + str(la)
    + " have involved Los Angeles. At least "
    + str(la)
    + " have involved Oakland."
)

---

## Charts

### Does she work weekends? 

In [None]:
bar_chart = (
    alt.Chart(df)
    .mark_bar(opacity=1)
    .encode(
        x=alt.X(
            "events_date",
            title="",
            axis=alt.Axis(format="%b. %-d", tickCount=5, grid=False),
        ),
        y=alt.Y(
            "daily_events_count",
            title=" ",
            stack=None,
            axis=alt.Axis(
                tickSize=0,
                domainOpacity=0,
                tickCount=6,
                offset=4,
                gridWidth=0.6,
                gridColor="#dddddd",
            ),
            scale=alt.Scale(domain=(0, 8)),
        ),
    )
)


chart_kamala_events = (
    (bar_chart)
    .properties(
        width=600,
        height=400,
        title="VP Kamala Harris events per day",
    )
    .configure_view(strokeOpacity=0)
    .configure_legend(orient="top", symbolType="square")
    .configure_axis(labelFontSize=13)
)

chart_kamala_events

---

## Exports

In [None]:
chart_kamala_events.save("chart_kamala_events.png", scale_factor=1)

In [None]:
df.sort_values("email_date").to_csv("data/processed/all_events.csv", index=False)

In [None]:
df["email_text"].to_csv("data/raw/email_texts.csv", index=False)