# President Biden polls

### Get Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt
import glob
import os
import numpy as np
import requests
from bs4 import BeautifulSoup

In [3]:
pd.options.display.max_columns = 50
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

---

### Get latest Biden trends from Real Clear Politics

In [4]:
url = "https://www.realclearpolitics.com/epolls/other/president-biden-job-approval-7320.html#polls"
headers = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X "}
r = requests.get(url, headers=headers)

In [5]:
soup = BeautifulSoup(r.text, "html.parser")

In [6]:
tables = soup.findAll("table", attrs={"class": "data"})

In [7]:
src = pd.read_html(str(tables))[0]

In [8]:
src[:1]

Unnamed: 0,Poll,Date,Sample,Approve,Disapprove,Spread
0,RCP Average,12/1 - 12/21,--,43.5,52.6,-9.1


---

In [9]:
wayback = "https://web.archive.org/cdx/search/cdx?url=https://www.realclearpolitics.com/epolls/other/president-biden-job-approval-7320.html&output=json"

In [10]:
urls = pd.read_json(wayback, orient="records").drop([0], axis=0)

In [11]:
urls.columns = [
    "urlkey",
    "timestamp",
    "original",
    "mimetype",
    "statuscode",
    "digest",
    "length",
]

In [12]:
urls["datetime"] = pd.to_datetime(urls["timestamp"])

In [13]:
urls["date"] = urls["datetime"].dt.date

### Limit the update urls to the last one of each day

In [14]:
daily_urls = urls.sort_values(["datetime", "date"], ascending=True).drop_duplicates(
    "date", keep="last"
)

In [15]:
urls_success = daily_urls[daily_urls["statuscode"] == "200"].copy()

In [16]:
len(urls_success)

217

### Loop through urls and create Wayback dataframes

In [17]:
content = []
pages = []

for t, u in zip(urls_success.timestamp, urls_success.original):
    response_polls = requests.get(
        "https://web.archive.org/web/" + t + "/" + u, headers=headers
    )
    pages.append(
        (pd.read_html(response_polls.text, attrs={"class": "data"})[0]).assign(
            timestamp=t
        )
    )

### Create a large dataframe from a list of update date dataframes, and add a timestamp

In [18]:
src = pd.concat(pages)

In [19]:
src.columns = src.columns.str.lower()

In [20]:
df = src[src["poll"].str.contains("RCP")].reset_index(drop=True).copy()

In [21]:
df.head()

Unnamed: 0,poll,date,sample,approve,disapprove,spread,timestamp
0,RCP Average,1/20 - 1/26,--,54.8,36.8,18.0,20210127222301
1,RCP Average,1/20 - 1/27,--,55.7,36.0,19.7,20210129171634
2,RCP Average,1/20 - 1/28,--,55.8,35.5,20.3,20210130212314
3,RCP Average,1/20 - 1/31,--,54.9,35.7,19.2,20210201185557
4,RCP Average,1/20 - 2/2,--,54.2,36.1,18.1,20210203201902


In [22]:
df["wayback_date"] = (
    pd.to_datetime(df["timestamp"])
    .dt.tz_localize("GMT")
    .dt.tz_convert("US/Eastern")
    .dt.date
)
df["wayback_time"] = (
    pd.to_datetime(df["timestamp"])
    .dt.tz_localize("GMT")
    .dt.tz_convert("US/Eastern")
    .dt.time
)

In [23]:
df["spread"] = df["approve"] - df["disapprove"]

In [24]:
df.dtypes

poll             object
date             object
sample           object
approve         float64
disapprove      float64
spread          float64
timestamp        object
wayback_date     object
wayback_time     object
dtype: object

In [25]:
df[["begin", "end"]] = df["date"].astype(str).str.split(" - ", n=1, expand=True)

In [26]:
df.drop(["poll", "date", "sample", "timestamp"], axis=1, inplace=True)

In [27]:
historic_df = df[
    ["wayback_date", "wayback_time", "approve", "disapprove", "spread"]
].copy()

In [28]:
historic_df.head()

Unnamed: 0,wayback_date,wayback_time,approve,disapprove,spread
0,2021-01-27,17:23:01,54.8,36.8,18.0
1,2021-01-29,12:16:34,55.7,36.0,19.7
2,2021-01-30,16:23:14,55.8,35.5,20.3
3,2021-02-01,13:55:57,54.9,35.7,19.2
4,2021-02-03,15:19:02,54.2,36.1,18.1


In [29]:
historic_df.tail()

Unnamed: 0,wayback_date,wayback_time,approve,disapprove,spread
209,2021-12-18,05:02:13,44.1,50.3,-6.2
210,2021-12-19,17:49:08,44.1,50.3,-6.2
211,2021-12-20,16:36:32,44.1,52.1,-8.0
212,2021-12-21,14:37:34,44.1,52.0,-7.9
213,2021-12-22,03:32:30,44.1,52.0,-7.9


In [30]:
historic_df["date"] = pd.to_datetime(historic_df["wayback_date"])

In [31]:
historic_df.dtypes

wayback_date            object
wayback_time            object
approve                float64
disapprove             float64
spread                 float64
date            datetime64[ns]
dtype: object

In [34]:
historic_df.to_csv("data/processed/biden_history.csv", index=False)