# RealClearPolitics: 2018 Senate polls

#### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import altair as alt
import altair_stiles as altstiles
import numpy as np
import us
import urllib.request, json
import glob
import os
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import os
import time

In [3]:
# Register the custom chart theme shipped by altair_stiles under a short name,
# then switch Altair over to it for the rest of the notebook.
theme_name = "stiles"
alt.themes.register(theme_name, altstiles.theme)
alt.themes.enable(theme_name)

ThemeRegistry.enable('grid')

In [4]:
# Let pandas print up to 1000 columns/rows before it starts truncating output.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
# Lift Altair's limit on the number of rows a chart dataset may contain.
alt.data_transformers.disable_max_rows()
# Never shorten long cell values (e.g. URLs) when displaying a frame.
pd.set_option("display.max_colwidth", None)

In [5]:
# Current date as an ISO-style "YYYY-MM-DD" string.
today = f"{pd.to_datetime('today'):%Y-%m-%d}"

---

## Harvest data 

#### First, get links to all the senate races

In [6]:
# RCP's 2018 Senate map page, used as the starting point for finding the
# individual race pages.
senate_url = (
    "https://www.realclearpolitics.com"
    "/epolls/2018/senate/2018_elections_senate_map.html"
)

#### Invoke Chromedriver session

In [7]:
# Start a Chrome session driven by a chromedriver binary on local disk.
# NOTE(review): this is an absolute path on one machine; change it when
# running the notebook elsewhere.
path = "/Users/stiles/github/chromedriver"
s = Service(executable_path=path)
driver = webdriver.Chrome(service=s)

#### Get the Senate content from the page

In [8]:
# Load the Senate map page in the browser and capture the rendered markup.
# The browser is not needed again after this cell, so close it as soon as
# the HTML is in hand (even if the page load fails) rather than leaving a
# stray Chrome/chromedriver process running.
try:
    driver.get(senate_url)
    html = driver.page_source
finally:
    driver.quit()

# Parse the captured markup so the links can be pulled out below.
soup = BeautifulSoup(html, "html.parser")

#### Find all the links on the page and clean up the urls

In [10]:
# Every anchor tag that actually carries an href attribute.
# `find_all` is the current Beautiful Soup spelling; `findAll` is a legacy alias.
links = soup.find_all("a", href=True)

In [11]:
# Reduce every href to a site-relative path by removing the RCP domain, so
# absolute and relative links to the same page compare equal.
link_list = [
    anchor["href"].replace("https://www.realclearpolitics.com", "")
    for anchor in links
]

#### There are tons of links on the page. We want only those with this `/epolls/2018/senate/` construction

In [12]:
# Keep only the paths that point into the 2018 Senate polling section.
filtered_list = [href for href in link_list if "/epolls/2018/senate/" in href]

#### A few links we don't need remain. They contain these strings. 

In [13]:
# Substrings identifying Senate-section links that should not be scraped.
exclude_from_list = ["create_your_own_senate_map", "2018_elections_senate_map"]

#### Remove them from our url list and define a new list

In [78]:
# Links containing any of the excluded substrings.
remove = [
    href
    for href in filtered_list
    if any(token in href for token in exclude_from_list)
]

In [79]:
# Everything left once the excluded links are taken out.
nu_links = [href for href in filtered_list if href not in remove]

#### Remove any dupes

In [17]:
# De-duplicate while keeping first-seen order. A plain set() would also drop
# duplicates, but its iteration order for strings changes between Python
# sessions, which would make the scrape order (and the row order of the
# combined table) differ from run to run.
links = list(dict.fromkeys(nu_links))

#### Should be 35 in the list

In [18]:
# Sanity check: how many unique race links were collected.
len(links)

35

---

## Get page contents

#### Launch another service

In [None]:
# Open a separate Chrome session for walking the individual race pages.
# NOTE(review): absolute chromedriver path on one machine; change it when
# running the notebook elsewhere.
page_path = "/Users/stiles/github/chromedriver"
page_service = Service(executable_path=page_path)
page_driver = webdriver.Chrome(service=page_service)

#### Loop over the links, grab the first table on each page and put them all into a list of dataframes (and assign the url to a column for use later)

In [26]:
# Visit every race page and keep the first HTML table found on each one,
# tagging its rows with the originating link so the race can still be
# identified after all the tables are combined.
from io import StringIO

dfs = []

try:
    for link in links:
        page_driver.get("https://www.realclearpolitics.com" + link)
        # Read the rendered markup once. Wrapping it in StringIO avoids the
        # pandas deprecation of passing a literal HTML string to read_html.
        html = page_driver.page_source
        dfs.append(pd.read_html(StringIO(html))[0].assign(race=link))
finally:
    # Always release the browser, even if a page fails to load or parse.
    page_driver.quit()

#### One big dataframe

In [27]:
# Stack the per-race tables row-wise; each table keeps its own row labels and
# columns missing from a given table are filled with NaN.
df = pd.concat(dfs, axis=0, ignore_index=False)

#### Strip out all the URL bits from the race column

In [28]:
# Tidy the race identifier.
# NOTE(review): the links stored in `race` already had the site domain removed
# when they were collected, so this full-URL prefix is not expected to match
# and the "/epolls/2018/senate/" prefix stays in place. The later state
# extraction splits on that prefix, so do not "fix" one without the other.
race_without_prefix = df["race"].str.replace(
    "https://www.realclearpolitics.com/epolls/2018/senate/", "", regex=False
)
df["race"] = race_without_prefix.str.replace(".html", "", regex=False)

# Sample size and margin of error are not used downstream.
df = df.drop(columns=["Sample", "MoE"])

#### Just get the averages from the tables (which contain the attribute "RCP Average", not the polling outfit's name)

In [29]:
# Keep only the rows whose Poll label mentions "RCP" (the polling-average
# rows). `na=False` treats a missing Poll value as "no match"; without it a
# single NaN in the column makes the boolean mask invalid and the selection
# raises.
src = df[df["Poll"].str.contains("RCP", na=False)]

In [30]:
# Inspect the column labels: the shared poll fields plus one column per candidate.
src.columns

Index(['Poll', 'Date', 'McSally (R)', 'Sinema (D)', 'Spread', 'race',
       'Cramer (R)', 'Heitkamp (D)', 'Brown (D)', 'Renacci (R)', 'Cardin (D)',
       'Campbell (R)', 'Simon (I)', 'Tester (D)', 'Rosendale (R)',
       'Hawley (R)', 'McCaskill (D)', 'Donnelly (D)', 'Braun (R)',
       'Brenton (L)', 'Smith (D)', 'Housley (R)', 'Murphy (D)', 'Corey (R)',
       'Menendez (D)', 'Hugin (R)', 'Rosen (D)', 'Heller (R)', 'Casey (D)',
       'Barletta (R)', 'Cruz (R)', 'O'Rourke (D)', 'Barrasso (R)',
       'Trauner (D)', 'Fischer (R)', 'Raybould (D)', 'Stabenow (D)',
       'James (R)', 'Cantwell (D)', 'Hutchison (R)', 'Whitehouse (D)',
       'Flanders (R)', 'Sanders (I)', 'Zupan (R)', 'Baldwin (D)', 'Vukmir (R)',
       'Hyde-Smith (R)', 'Espy (D)', 'Gillibrand (D)', 'Farley (R)',
       'Wicker (R)', 'Baria (D)', 'King (I)', 'Brakey (R)', 'Feinstein (D)',
       'Leon (D)', 'Klobuchar (D)', 'Newberger (R)', 'Heinrich (D)',
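       'Rich (R)', 'Johnson (L)', 'Nelson (D)', 'Scott (R)', 'Carper (D)',
       'Arlett (R)', 'Hirono (D)', 'Curtis (R)', 'Warren (D)', 'Diehl (R)',
       'Blackburn (R)', 'Bredesen (D)', 'Manchin (D)', 'Morrisey (R)',
       'Kaine (D)', 'Stewart (R)', 'Romney (R)', 'Wilson (D)'],
      dtype='object')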

#### Melt the dataframe so it's useful

In [84]:
# Reshape from one column per candidate to one row per (race, candidate).
#
# The candidate columns are discovered from the frame itself instead of being
# pasted in by hand: every column that is not one of the shared poll fields
# and whose label looks like "Name (P)" is treated as a candidate. This keeps
# the cell working if the set of scraped races ever changes.
id_columns = ["Date", "Poll", "Spread", "race"]
candidate_columns = [
    column
    for column in src.columns
    if column not in id_columns and " (" in str(column) and str(column).endswith(")")
]

# Each source row only has values for the candidates in its own race, so
# drop the NaN rows the melt produces for every other candidate.
src_melted = src.melt(
    id_vars=id_columns,
    value_vars=candidate_columns,
).dropna(subset=["value"])

In [81]:
# Preview the first rows of the long-format table.
src_melted.head()

Unnamed: 0,Date,value,state,candidate,party,incumbent
0,10/24 - 11/5,47.5,AZ,McSally,R,
16,10/24 - 11/5,46.5,AZ,Sinema,D,
33,10/23 - 10/30,52.5,ND,Cramer,R,
49,10/23 - 10/30,43.5,ND,Heitkamp,D,
146,10/24 - 11/5,49.0,MT,Tester,D,


#### Get the states by splitting the race URL again

In [63]:
# The text after the section prefix starts with the two-letter state code;
# take those two characters and upper-case them.
race_tail = src_melted["race"].str.split("/epolls/2018/senate/", expand=True)[1]
src_melted["state"] = race_tail.str[:2].str.upper()

#### And candidate/party from the variable column of names and parties

In [64]:
# Split labels such as "McSally (R)" at the " (" into the candidate name and
# the remainder ("R)"). The pattern is a regular expression, so it is written
# as a raw string: in a normal string literal "\(" is an invalid escape
# sequence, which newer Python versions warn about.
src_melted[["candidate", "party"]] = src_melted["variable"].str.split(
    r" \(", expand=True
)

#### Get the incumbent flag too

In [65]:
# Split the remainder at the closing parenthesis: the part before it is the
# party code, and whatever follows is stored as `incumbent`
# (presumably an incumbency marker; it is empty in the rows previewed below).
party_pieces = src_melted["party"].str.split(")", expand=True)
src_melted[["party", "incumbent"]] = party_pieces

#### Just the columns we need

In [66]:
# Discard the helper columns now that state, candidate and party are extracted.
src_melted = src_melted.drop(columns=["race", "variable", "Spread", "Poll"])

In [67]:
# Preview the trimmed table: date range, average, state, candidate, party, incumbent.
src_melted.head()

Unnamed: 0,Date,value,state,candidate,party,incumbent
0,10/24 - 11/5,47.5,AZ,McSally,R,
16,10/24 - 11/5,46.5,AZ,Sinema,D,
33,10/23 - 10/30,52.5,ND,Cramer,R,
49,10/23 - 10/30,43.5,ND,Heitkamp,D,
146,10/24 - 11/5,49.0,MT,Tester,D,


#### Now we can stretch out the dataframe for easier analysis

In [68]:
# One row per (state, date range) with one column per party code.
# NOTE(review): pivot_table aggregates with the mean by default, so two
# candidates sharing a party in the same state/date cell (e.g. a same-party
# general election, or two races in one state with identical date ranges)
# are averaged into a single number. Confirm that is acceptable.
src_wide = pd.pivot_table(
    src_melted,
    values="value",
    index=["state", "Date"],
    columns="party",
).reset_index()

In [69]:
# Lower-case every column label ("Date" -> "date", "D" -> "d", ...).
src_wide = src_wide.rename(columns=str.lower)

In [70]:
# Tag every row with the election year (stored as text).
src_wide = src_wide.assign(year="2018")

#### State names

In [71]:
# Lookup built by the `us` package from state abbreviation to state name,
# used to replace the two-letter codes in the `state` column.
postal_to_name = us.states.mapping("abbr", "name")
src_wide = src_wide.assign(state=src_wide["state"].map(postal_to_name))

#### Remove/rename columns

In [72]:
# Work on an independent copy without the "l" party column and the date range.
df = src_wide.drop(columns=["l", "date"]).copy()

In [73]:
# Give the two major-party columns descriptive names.
df = df.rename(columns={"d": "dem_polling", "r": "gop_polling"})

#### Calculate margins

In [74]:
# Polling margin from each party's point of view, rounded to two decimals;
# the two columns are the same gap with opposite signs.
df["dem_polling_margin"] = df["dem_polling"].sub(df["gop_polling"]).round(2)
df["gop_polling_margin"] = df["gop_polling"].sub(df["dem_polling"]).round(2)

#### Add a description

In [75]:
# Label the source of the numbers on every row.
df = df.assign(description="RCP polling average")

In [76]:
# Display the final table before exporting it.
df

party,state,dem_polling,gop_polling,year,dem_polling_margin,gop_polling_margin,description
0,Arizona,46.5,47.5,2018,-1.0,1.0,RCP polling average
1,Florida,48.8,46.4,2018,2.4,-2.4,RCP polling average
2,Indiana,44.0,43.3,2018,0.7,-0.7,RCP polling average
3,Massachusetts,55.8,31.0,2018,24.8,-24.8,RCP polling average
4,Michigan,52.0,43.7,2018,8.3,-8.3,RCP polling average
5,Missouri,46.2,46.8,2018,-0.6,0.6,RCP polling average
6,Montana,49.0,45.7,2018,3.3,-3.3,RCP polling average
7,North Dakota,43.5,52.5,2018,-9.0,9.0,RCP polling average
8,New Jersey,51.0,40.3,2018,10.7,-10.7,RCP polling average
9,New Mexico,50.3,33.3,2018,17.0,-17.0,RCP polling average


---

## Export

In [77]:
# Write the final table to disk without the row index. The target folder is
# created first so the export does not fail on a fresh checkout where
# data/processed/ does not exist yet.
output_path = "data/processed/2018_polling_average_states_RCP.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)