# RealClearPolitics: 2022 Senate polls

#### Import Python tools

In [1]:
%load_ext lab_black

In [35]:
import pandas as pd
import geopandas as gpd
import altair as alt
import altair_grid as altgrid
import numpy as np
import us
import urllib.request, json
import glob
import os
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import os
import time

In [2]:
# Register the altair_grid theme under a name, then make it the active Altair theme.
grid_theme_name = "grid"
alt.themes.register(grid_theme_name, altgrid.theme)
alt.themes.enable(grid_theme_name)

ThemeRegistry.enable('grid')

In [3]:
# Raise the pandas display limits and remove Altair's row cap for this session.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
alt.data_transformers.disable_max_rows()
# None removes the column-width truncation entirely.
pd.set_option("display.max_colwidth", None)

In [4]:
# Current local date as an ISO-style YYYY-MM-DD string.
today = pd.Timestamp.today().strftime("%Y-%m-%d")

---

## Harvest data 

#### First, get all the poll page URLs from the 2022 landing page

In [30]:
# RealClearPolitics landing page for the latest Senate polls; the per-race
# poll page links are harvested from it in the cells that follow.
url = 'https://www.realclearpolitics.com/epolls/latest_polls/senate/'

In [36]:
# Start a Selenium-controlled Chrome session for loading the landing page.
# The chromedriver location can be overridden with the CHROMEDRIVER_PATH
# environment variable so the notebook also runs on other machines; the
# original hard-coded location remains the default.
path = os.environ.get("CHROMEDRIVER_PATH", "/Users/stiles/github/chromedriver")
s = Service(path)
driver = webdriver.Chrome(service=s)

In [37]:
# Load the landing page in the browser and keep the rendered HTML.
url = 'https://www.realclearpolitics.com/epolls/latest_polls/senate/'
try:
    driver.get(url)
    html = driver.page_source
finally:
    # This session is not used again after the HTML is captured, so close the
    # browser here instead of leaving a Chrome process running.
    driver.quit()

In [41]:
# Parse the rendered landing page. The previous `links = soup.findAll("a")`
# assignment was dropped: `links` is rebuilt from scratch from the `lp-race`
# cells before it is ever read, so that value was never used.
soup = BeautifulSoup(html, "html.parser")

In [58]:
# Table cells with the `lp-race` class; their anchors are the race links
# collected next. `find_all` replaces the deprecated `findAll` alias.
data = soup.find_all("td", class_="lp-race")

In [66]:
from urllib.parse import urljoin

# Build an absolute URL for every anchor found inside the race cells.
# - `href=True` skips anchors that carry no href (these raised KeyError before).
# - `urljoin` resolves relative hrefs against the site root and leaves
#   already-absolute hrefs intact, where plain concatenation would mangle them.
base_url = "https://www.realclearpolitics.com"
links = [
    urljoin(base_url, a["href"])
    for td in data
    for a in td.find_all("a", href=True)
]

In [70]:
# Drop duplicate links while keeping first-seen (page) order. A plain set
# round-trip yields a different order on each run, which made the scrape
# order and the row order of the combined table non-reproducible.
links = list(dict.fromkeys(links))

In [85]:
# Start a separate Selenium-controlled Chrome session for visiting each race page.
# The chromedriver location can be overridden with the CHROMEDRIVER_PATH
# environment variable; the original hard-coded location remains the default.
path = os.environ.get("CHROMEDRIVER_PATH", "/Users/stiles/github/chromedriver")
s = Service(path)
page_driver = webdriver.Chrome(service=s)

In [126]:
from io import StringIO

# Visit every race page and keep the first HTML table found on it, tagged
# with the page URL in a `race` column so rows can be traced to their race.
dfs = []

try:
    for page_link in links:
        page_driver.get(page_link)
        # Take one snapshot of the rendered page and parse that same snapshot
        # (the source was previously fetched twice, and an unused
        # BeautifulSoup tree was built on every iteration).
        page_html = page_driver.page_source
        # Wrap the markup in StringIO: passing a literal HTML string to
        # read_html is deprecated in recent pandas releases.
        tables = pd.read_html(StringIO(page_html))
        dfs.append(tables[0].assign(race=page_link))
finally:
    # The browser is not needed once scraping ends (or fails); close it so no
    # Chrome process is left running.
    page_driver.quit()

In [127]:
# Stack the per-page tables row-wise into a single frame.
df = pd.concat(objs=dfs, axis="index")

In [134]:
# Reduce each page URL to its race slug by stripping the common URL prefix
# and the ".html" suffix (both treated as literal text, not regex).
race_url_prefix = 'https://www.realclearpolitics.com/epolls/2022/senate/'
race_without_prefix = df['race'].str.replace(race_url_prefix, '', regex=False)
df['race'] = race_without_prefix.str.replace('.html', '', regex=False)

# Sample size and margin of error are not used further.
df = df.drop(columns=['Sample', 'MoE'])

In [142]:
# Keep only the rows whose `Poll` label contains "RCP".
# `na=False` treats a missing `Poll` value as "no match"; without it a single
# NaN makes the boolean mask invalid and the indexing raises. `regex=False`
# makes the literal match explicit.
src = df[df['Poll'].str.contains('RCP', regex=False, na=False)]

In [260]:
# Columns describing the poll row itself; carried through the reshape as-is.
poll_id_columns = ['Date', 'Poll', 'Spread', 'race']

# Candidate columns to unpivot. NOTE(review): this looks like a snapshot of
# the scraped column names — it needs updating if the scraped tables change.
candidate_columns = [
    'Lee (R) *', 'McMullin (I)', 'Wyden (D) *', 'Perkins (R)',
    'Blumenthal (D) *', 'Levy (R)', 'Bolduc', 'Morse', 'Smith',
    'Mansharamani', 'Fenton', 'Boozman (R) *', 'James (D)',
    'Murray (D) *', 'Smiley (R)', 'Duckworth (D) *', 'Salvi (R)',
    'Warnock (D) *', 'Walker (R)', 'Van Hollen (D) *', 'Chaffee (R)',
    'Schmitt (R)', 'Valentine (D)', 'Johnson (R) *', 'Barnes (D)',
    'Vance (R)', 'Ryan (D)', 'Budd (R)', 'Beasley (D)', 'Laxalt (R)',
    'Cortez Masto (D) *', 'Welch (D)', 'Malloy (R)', 'Fetterman (D)',
    'Oz (R)', 'Hassan (D) *', 'Bolduc (R)', 'Rubio (R) *', 'Demings (D)',
    'Moran (R) *', 'Holland (D)', 'Kelly (D) *', 'Masters (R)',
    'Bennet (D) *', "O'Dea (R)", 'Lankford (R) *', 'Horn (D)',
    'Tshibaka (R)', 'Murkowski (R) *', 'Chesbro (D)', 'Schumer (D) *',
    'Pinion (R)', 'Mullin (R)',
]

# Go from one column per candidate to one row per (poll row, candidate),
# then discard the combinations that have no value.
src_melted = src.melt(id_vars=poll_id_columns, value_vars=candidate_columns)
src_melted = src_melted.dropna(subset=['value'])

In [261]:
# The first path segment of the race slug, upper-cased, is stored as the state code.
race_segments = src_melted['race'].str.split('/', expand=True)
src_melted['state'] = race_segments[0].str.upper()

In [262]:
# Split labels such as "Warnock (D) *" at " (" into the candidate name and the
# remainder (party letter plus any trailing marker).
# The pattern is a raw string: "\(" in a normal string literal is an invalid
# escape sequence, which newer Python versions warn about at compile time.
src_melted[['candidate', 'party']] = src_melted['variable'].str.split(r' \(', expand=True)

In [263]:
# Split the remainder at ")" — the text before it is kept as the party, and
# whatever follows (the "*" marker on some labels) is stored as `incumbent`.
party_and_marker = src_melted['party'].str.split(")", expand=True)
src_melted[['party', 'incumbent']] = party_and_marker

In [264]:
# These columns have served their purpose once state/candidate/party are derived.
src_melted = src_melted.drop(columns=['race', 'variable', 'Spread', 'Poll'])

In [265]:
# Preview the first rows of the reshaped table (display only; nothing is assigned).
src_melted.head()

Unnamed: 0,Date,value,state,candidate,party,incumbent
56,9/7 - 9/21,53.0,CT,Blumenthal,D,*
70,9/7 - 9/21,38.7,CT,Levy,R,
183,9/12 - 10/1,50.5,WA,Murray,D,*
197,9/12 - 10/1,41.8,WA,Smiley,R,
240,9/12 - 10/4,48.0,GA,Warnock,D,*


In [280]:
# Spread the parties back out into columns: one row per (state, Date), one
# column per party, using pivot_table's default aggregation for the values.
src_wide = pd.pivot_table(
    src_melted,
    values='value',
    index=['state', 'Date'],
    columns='party',
).reset_index()

In [281]:
# Lower-case every column label.
src_wide = src_wide.rename(columns=str.lower)

In [282]:
# Tag every row with the election year (stored as a string).
src_wide = src_wide.assign(year='2022')

In [284]:
# Translate state codes to full state names via the `us` package's
# abbreviation-to-name mapping; codes missing from the mapping become NaN.
postal_to_name = us.states.mapping("abbr", "name")
state_names = src_wide["state"].map(postal_to_name)
src_wide["state"] = state_names

In [285]:
# Work on an independent copy without the poll date range column.
df = src_wide.drop(columns="date").copy()

In [286]:
# Give the party columns descriptive names.
polling_column_names = {"d": "dem_polling", "r": "gop_polling"}
df = df.rename(columns=polling_column_names)

In [291]:
# Margin between the two parties from each side's perspective, rounded to
# two decimals. Both differences are computed directly rather than negating one.
dem_share = df["dem_polling"]
gop_share = df["gop_polling"]
df["dem_polling_margin"] = dem_share.sub(gop_share).round(2)
df["gop_polling_margin"] = gop_share.sub(dem_share).round(2)

In [292]:
# Label every row with the source of the figures.
df = df.assign(description="RCP polling average")

In [293]:
# Display the final table (display only; nothing is assigned).
df

party,state,dem_polling,gop_polling,year,dem_polling_margin,gop_polling_margin,description
0,Arizona,48.7,44.8,2022,3.9,-3.9,RCP polling average
1,Colorado,47.0,38.0,2022,9.0,-9.0,RCP polling average
2,Connecticut,53.0,38.7,2022,14.3,-14.3,RCP polling average
3,Florida,43.0,47.0,2022,-4.0,4.0,RCP polling average
4,Georgia,48.0,44.2,2022,3.8,-3.8,RCP polling average
5,Missouri,38.0,49.0,2022,-11.0,11.0,RCP polling average
6,North Carolina,43.5,45.0,2022,-1.5,1.5,RCP polling average
7,New Hampshire,49.2,42.6,2022,6.6,-6.6,RCP polling average
8,Nevada,43.2,45.4,2022,-2.2,2.2,RCP polling average
9,New York,54.0,33.5,2022,20.5,-20.5,RCP polling average


In [294]:
# Write the final table to CSV without the index column.
output_path = "data/processed/2022_polling_average_states_RCP.csv"
# Create the target folder first; to_csv fails if the directory does not exist.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)