# RealClearPolitics: 2022 Senate polls

#### Import Python tools

In [1]:
%load_ext lab_black

In [4]:
import pandas as pd
import geopandas as gpd
import altair as alt
import altair_stiles as altstiles
import numpy as np
import us
import urllib.request, json
import glob
import os
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import os
import time

In [5]:
# Register the custom "stiles" Altair theme and make it the active theme for
# the charts rendered below.
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('stiles')

In [6]:
# Notebook display settings: show up to 1000 rows/columns, never truncate
# cell text, and lift Altair's cap on the number of chart rows.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)
alt.data_transformers.disable_max_rows()

In [7]:
# Run date as an ISO "YYYY-MM-DD" string; used to stamp rows and file names.
today = f"{pd.to_datetime('today'):%Y-%m-%d}"

In [8]:
# Echo the run date as a sanity check.
today

'2022-11-09'

---

## Harvest data 

#### First, get all the poll page URLs from the 2022 landing page

In [9]:
# RealClearPolitics landing page for the latest Senate polls; the per-race
# poll pages are discovered from the links on this page.
url = "https://www.realclearpolitics.com/epolls/latest_polls/senate/"

In [10]:
# Start a Chrome session through a locally installed chromedriver binary.
# NOTE(review): the path is specific to the author's machine.
path = "/Users/stiles/github/chromedriver"
s = Service(executable_path=path)
driver = webdriver.Chrome(service=s)

In [11]:
# Load the landing page in the browser and capture the rendered HTML.
# `driver` is not used again after this cell, so always shut the browser down
# (even if the page load fails) instead of leaking a Chrome process.
url = "https://www.realclearpolitics.com/epolls/latest_polls/senate/"
try:
    driver.get(url)
    html = driver.page_source
finally:
    driver.quit()

In [12]:
# Parse the rendered landing page. The previous `links = soup.findAll("a")`
# was a dead store: `links` is rebuilt from scratch from the race cells below
# before it is ever read, so the full-page anchor scan is dropped.
soup = BeautifulSoup(html, "html.parser")

In [13]:
# Table cells carrying the "lp-race" class; each one holds the link(s) to a
# race's poll page.
data = soup.find_all("td", class_="lp-race")

In [14]:
# Turn every site-relative href inside the race cells into an absolute URL.
base_url = "https://www.realclearpolitics.com"
links = [base_url + a["href"] for td in data for a in td.find_all("a")]

In [15]:
# Drop duplicate race URLs while keeping first-seen order. A plain `set`
# yields a different order on every run (string hashing is randomised), which
# made the scrape order and the resulting row/column order irreproducible.
links = list(dict.fromkeys(links))

In [16]:
# Separate Chrome session used to visit each individual race page.
# NOTE(review): the chromedriver path is specific to the author's machine.
path = "/Users/stiles/github/chromedriver"
s = Service(executable_path=path)
page_driver = webdriver.Chrome(service=s)

In [17]:
# Visit every race page and keep its first HTML table, tagged with the page
# URL in a `race` column.
#
# - The rendered source is read once per page and reused (it was previously
#   fetched twice and also parsed into an unused BeautifulSoup object).
# - The HTML is wrapped in StringIO because passing literal HTML strings to
#   `pd.read_html` is deprecated.
# - The browser is always closed afterwards; `page_driver` is not used again.
from io import StringIO

dfs = []

try:
    for page_link in links:
        page_driver.get(page_link)
        page_html = page_driver.page_source
        tables = pd.read_html(StringIO(page_html))
        dfs.append(tables[0].assign(race=page_link))
finally:
    page_driver.quit()

In [18]:
# Stack all race tables into one frame. Columns are unioned across races, so
# a candidate's column is NaN on rows that came from other races. The index is
# not reset, so row labels repeat across races.
df = pd.concat(dfs)

In [19]:
# Reduce the page URL to its race slug by stripping the common URL prefix and
# the ".html" suffix, then discard the sample-size and margin-of-error columns.
race_prefix = "https://www.realclearpolitics.com/epolls/2022/senate/"
race_slug = df["race"].str.replace(race_prefix, "", regex=False)
df["race"] = race_slug.str.replace(".html", "", regex=False)
df = df.drop(columns=["Sample", "MoE"])

In [20]:
# Keep only the rows whose poll name mentions "RCP" (the site's own average
# rows rather than individual pollsters).
is_rcp_row = df["Poll"].str.contains("RCP")
src = df[is_rcp_row]

In [22]:
# Preview the first rows of the filtered frame.
src.head()

In [23]:
# List the columns: the shared poll fields plus one column per candidate.
src.columns

Index(['Poll', 'Date', 'Britt (R)', 'Boyd (D)', 'Spread', 'race', 'Lee (R) *',
       'McMullin (I)', 'Boozman (R) *', 'James (D)', 'Wyden (D) *',
       'Perkins (R)', 'Vance (R)', 'Ryan (D)', 'Mullin (R)', 'Horn (D)',
       'Schmitt (R)', 'Valentine (D)', 'Thune (R) *', 'Bengs (D)',
       'Blumenthal (D) *', 'Levy (R)', 'Padilla (D) *', 'Meuser (R)',
       'Masters (R)', 'Kelly (D) *', 'Grassley (R) *', 'Franken (D)',
       'Welch (D)', 'Malloy (R)', 'Moran (R) *', 'Holland (D)',
       'Lankford (R) *', 'Laxalt (R)', 'Cortez Masto (D) *', 'Johnson (R) *',
       'Barnes (D)', 'Hassan (D) *', 'Bolduc (R)', 'Oz (R)', 'Fetterman (D)',
       'Budd (R)', 'Beasley (D)', 'Murray (D) *', 'Smiley (R)',
       'Kennedy (R) *', 'Mixon (D)', 'Chambers (D)', 'Steib (D)',
       'Rubio (R) *', 'Demings (D)', 'Walker (R)', 'Warnock (D) *',
       'Duckworth (D) *', 'Salvi (R)', 'Bennet (D) *', 'O'Dea (R)',
       'Schumer (D) *', 'Pinion (R)'],
      dtype='object')

In [25]:
# Reshape from one column per candidate to one row per (race, candidate).
#
# Every column that is not an identifier is a candidate column, so let `melt`
# pick them up automatically instead of maintaining a hand-copied list of
# names: that list raised a KeyError or silently dropped candidates whenever
# the scraped set of races changed. Rows where the candidate does not belong
# to the race (NaN value) are discarded.
id_columns = ["Date", "Poll", "Spread", "race"]
src_melted = src.melt(id_vars=id_columns).dropna(subset=["value"])

In [26]:
# The race slug starts with the state's postal code (e.g. "oh/..."); take the
# part before the first "/" and upper-case it.
race_parts = src_melted["race"].str.split("/")
src_melted["state"] = race_parts.str[0].str.upper()

In [27]:
# Split labels such as "Vance (R)" into the candidate name and the remainder
# ("R)" or "R) *"). The pattern is a regex, so it is written as a raw string:
# the previous non-raw " \(" relied on an invalid escape sequence, which emits
# a DeprecationWarning (SyntaxWarning on Python 3.12+).
src_melted[["candidate", "party"]] = src_melted["variable"].str.split(
    r" \(", expand=True
)

In [28]:
# Split the remainder at ")" into the party letter and whatever follows it
# (" *" marks an incumbent, empty otherwise).
party_parts = src_melted["party"].str.split(")", expand=True)
src_melted["party"] = party_parts[0]
src_melted["incumbent"] = party_parts[1]

In [29]:
# Drop the columns that were only needed to derive state/candidate/party.
src_melted = src_melted.drop(columns=["race", "variable", "Spread", "Poll"])

In [30]:
# Preview the tidy candidate-level frame.
src_melted.head()

Unnamed: 0,Date,value,state,candidate,party,incumbent
104,10/30 - 11/5,51.8,OH,Vance,R,
117,10/30 - 11/5,43.8,OH,Ryan,D,
157,10/24 - 11/1,52.0,MO,Schmitt,R,
170,10/24 - 11/1,41.3,MO,Valentine,D,
262,11/1 - 11/7,48.3,AZ,Masters,R,


In [31]:
# One row per state/date with a column per party. pivot_table's default
# aggregation is the mean, so several candidates of one party in the same
# state are averaged together.
party_table = src_melted.pivot_table(
    values="value", index=["state", "Date"], columns="party"
)
src_wide = party_table.reset_index()

In [32]:
# Normalise all column labels to lower case ("Date" -> "date", "D" -> "d", ...).
src_wide.columns = src_wide.columns.map(str.lower)

In [33]:
# Election year label, stored as a string on every row.
src_wide["year"] = "2022"

In [34]:
# Replace postal abbreviations with full state names; codes missing from the
# lookup become NaN.
postal_to_name = us.states.mapping("abbr", "name")
src_wide = src_wide.assign(state=src_wide["state"].map(postal_to_name))

In [35]:
# Working copy without the RCP date-range column; from here on `df` holds
# the current polling averages.
df = src_wide.drop(columns=["date"]).copy()

In [36]:
# Give the Democratic and Republican party columns descriptive names.
polling_labels = {"d": "dem_polling", "r": "gop_polling"}
df = df.rename(columns=polling_labels)

In [37]:
# Margin from each party's point of view, in percentage points (positive
# means that party leads), rounded to two decimals.
dem_share = df["dem_polling"]
gop_share = df["gop_polling"]
df["dem_polling_margin"] = dem_share.sub(gop_share).round(2)
df["gop_polling_margin"] = gop_share.sub(dem_share).round(2)

In [38]:
# Label the source of these figures on every row.
df["description"] = "RCP polling average"

In [40]:
# Stamp every row with the run date (ISO string computed earlier).
df["date"] = today

---

#### How have things changed? 

In [41]:
# Preview the current polling averages.
df.head()

party,state,dem_polling,gop_polling,year,dem_polling_margin,gop_polling_margin,description,date
0,Arizona,48.0,48.3,2022,-0.3,0.3,RCP polling average,2022-11-09
1,Colorado,50.0,44.3,2022,5.7,-5.7,RCP polling average,2022-11-09
2,Florida,43.6,52.4,2022,-8.8,8.8,RCP polling average,2022-11-09
3,Georgia,47.4,48.8,2022,-1.4,1.4,RCP polling average,2022-11-09
4,Missouri,41.3,52.0,2022,-10.7,10.7,RCP polling average,2022-11-09


In [42]:
# Load the earlier snapshot of the same averages (the Oct. 5 export) to
# compare against today's figures.
old_df = pd.read_csv("data/processed/2022_polling_average_states_RCP_oct_5.csv")

In [43]:
# Set the snapshot's date explicitly so it matches the ISO format used for
# the current rows.
old_df["date"] = "2022-10-05"

In [44]:
# Stack the Oct. 5 snapshot on top of today's rows with a fresh 0..n-1 index.
thennow_df = pd.concat([old_df, df], ignore_index=True)

In [45]:
# Short axis label such as "Oct. 5". "%-d" (day without zero padding) is a
# platform-specific strftime extension that fails on Windows, so format with
# the portable "%d" and strip the leading zero instead.
thennow_df["display_date"] = (
    pd.to_datetime(thennow_df["date"])
    .dt.strftime("%b. %d")
    .str.replace(" 0", " ", regex=False)
)

In [46]:
# Preview the combined then/now frame.
thennow_df.head()

Unnamed: 0,state,dem_polling,gop_polling,year,dem_polling_margin,gop_polling_margin,description,date,display_date
0,Arizona,48.7,44.8,2022,3.9,-3.9,RCP polling average,2022-10-05,Oct. 5
1,Colorado,47.0,38.0,2022,9.0,-9.0,RCP polling average,2022-10-05,Oct. 5
2,Connecticut,53.0,38.7,2022,14.3,-14.3,RCP polling average,2022-10-05,Oct. 5
3,Florida,43.0,47.0,2022,-4.0,4.0,RCP polling average,2022-10-05,Oct. 5
4,Georgia,48.0,44.2,2022,3.8,-3.8,RCP polling average,2022-10-05,Oct. 5


In [47]:
# Line joining each state's Democratic margin at the two snapshot dates.
chart = alt.Chart().mark_line(color="#1851ac").encode(
    x=alt.X("display_date:O", title="", sort="-x"),
    y=alt.Y("dem_polling_margin", title="", axis=alt.Axis()),
)

# Margin value printed just above each point.
text = alt.Chart().mark_text(dx=0, dy=-8, color="black").encode(
    x=alt.X("display_date", sort="-x"),
    y=alt.Y("dem_polling_margin"),
    text=alt.Text("dem_polling_margin"),
)

# Layer both marks over the combined data, then draw one small panel per
# state, seven panels per row. The last expression renders the chart.
layered = alt.layer(chart, text, data=thennow_df).properties(width=100, height=120)
small_multiples = layered.facet(facet=alt.Facet("state", title=" "), columns=7)
small_multiples.properties(
    title="Percentage point change in Democrats' polling average margin since Oct. 5"
)

  for col_name, dtype in df.dtypes.iteritems():


## Exports

In [None]:
# Write today's averages twice: a dated archive copy and the "latest" file.
export_paths = (
    f"data/processed/2022_polling_average_states_RCP_{today}.csv",
    "data/processed/2022_polling_average_states_RCP.csv",
)
for export_path in export_paths:
    df.to_csv(export_path, index=False)