# FiveThirtyEight: 2016 state polls

#### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import altair as alt
import altair_stiles as altstiles
import numpy as np
import us
import urllib.request, json
import glob
import os
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import os
import time

In [3]:
# Register the altair_stiles theme under the name "stiles", then activate the
# theme named "grid".
# NOTE(review): the theme enabled here ("grid") is not the one registered on
# the line above ("stiles") — presumably "grid" is registered elsewhere;
# confirm this is intentional.
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("grid")

ThemeRegistry.enable('grid')

In [4]:
# Notebook display settings: allow up to 1,000 columns and rows in DataFrame
# output and put no limit on column text width.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

# Turn off Altair's maximum-rows check for chart data.
alt.data_transformers.disable_max_rows()

In [5]:
# Current local date formatted as "YYYY-MM-DD".
today = pd.Timestamp.today().strftime("%Y-%m-%d")

---

## Harvest data 

#### First, build a list of state names formatted as URL slugs

In [6]:
# Plain attributes copied verbatim from each `us` state object, in output order.
_STATE_ATTRIBUTES = (
    "fips",
    "name",
    "abbr",
    "is_continental",
    "statehood_year",
    "capital",
    "capital_tz",
    "ap_abbr",
)

# One dict per state in us.states.STATES: the attributes above plus the
# result of calling shapefile_urls().
all_features = [
    {
        **{attr: getattr(state_obj, attr) for attr in _STATE_ATTRIBUTES},
        "shapefile_urls": state_obj.shapefile_urls(),
    }
    for state_obj in us.states.STATES
]

In [7]:
# Tabulate the per-state records, ordered by FIPS code with a fresh 0..n-1 index.
states = pd.DataFrame(all_features).sort_values(by="fips", ignore_index=True)

In [8]:
# Replace the DataFrame with a plain list of lower-cased state names, using
# hyphens instead of spaces (e.g. "new-york") so they can be dropped into URLs.
states = states["name"].str.lower().str.replace(" ", "-").tolist()

---

#### Loop through the list of states and scrape the polling table from each state's forecast page

In [9]:
# Start a Chrome session through Selenium using a local chromedriver binary.
# NOTE(review): the chromedriver location is a hard-coded absolute path on one
# machine — anyone else running this notebook needs to change it.
path = "/Users/stiles/github/chromedriver"
s = Service(path)
driver = webdriver.Chrome(service=s)

In [None]:
# Scrape the "t-calc" table from each state's FiveThirtyEight 2016 forecast page.
# Every table-body row becomes one dict in `data`: header text -> cell text,
# plus a "state_name" key holding the state's URL slug.
data = []
dfs = []  # not used by the code visible in this notebook; kept as-is

try:
    for state in states:
        state_url = f"https://projects.fivethirtyeight.com/2016-election-forecast/{state}/"
        driver.get(state_url)
        # Brief pause after loading (presumably to let the page finish rendering).
        time.sleep(1)
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find("table", {"class": "t-calc"})
        if table is None:
            # Fail with the offending URL instead of an opaque AttributeError.
            raise RuntimeError(f"No 't-calc' table found at {state_url}")

        # Header labels are identical for every row of a page, so collect them
        # once per page. They are taken from the whole document (not only from
        # `table`) because the positional column renaming done later relies on
        # exactly this set of keys.
        header_labels = [x.text for x in soup.select("thead th")]

        for row in table.select("tbody tr"):
            cell_values = [x.text.strip() for x in row.select("th,td")]
            data.append(dict(zip(header_labels, cell_values), state_name=state))
finally:
    # Always close the browser session so the Chrome process is not left running.
    driver.quit()

In [None]:
# One DataFrame row per scraped table row; the columns are the scraped header
# labels plus "state_name".
df = pd.DataFrame(data)

In [None]:
# Replace the scraped header labels with fixed names, assigned by position.
# NOTE(review): this assumes the frame has exactly eight columns in this order
# (pandas raises on a length mismatch) — confirm against the scraped output.
# Two columns share the name "drop"; both are removed in the next step,
# together with "extra".
df.columns = [
    "drop",
    "description",
    "dem_polling",
    "gop_polling",
    "other_polling",
    "extra",
    "state",
    "drop",
]

In [None]:
# Keep only the "Polling average" rows and discard the placeholder columns.
df = df.loc[df["description"].eq("Polling average")].drop(columns=["drop", "extra"])

In [None]:
# Remove the literal "%-" text from each polling figure and convert it to float.
for share_column in ("dem_polling", "gop_polling", "other_polling"):
    df[share_column] = (
        df[share_column].str.replace("%-", "", regex=False).astype(float)
    )

In [None]:
# Remove exact duplicate rows and renumber the index from zero.
df = df.drop_duplicates(ignore_index=True)

In [None]:
# Turn URL slugs back into display names, e.g. "new-york" -> "New York".
# Hyphens are replaced with spaces (not deleted) so multi-word names keep
# their word breaks and each word is capitalised by title().
df["state"] = df["state"].str.replace("-", " ", regex=False).str.title()

In [None]:
# Tag every row with the election year (stored as a string, not an integer).
df["year"] = "2016"

In [None]:
# Head-to-head polling gaps, rounded to two decimals: each party's share
# minus the other party's share.
df["gop_polling_margin"] = df["gop_polling"].sub(df["dem_polling"]).round(2)
df["dem_polling_margin"] = df["dem_polling"].sub(df["gop_polling"]).round(2)

In [None]:
# Final column order for the output; any column not listed here is dropped.
output_columns = [
    "state",
    "description",
    "gop_polling",
    "dem_polling",
    "other_polling",
    "gop_polling_margin",
    "dem_polling_margin",
    "year",
]
df = df[output_columns]

In [None]:
# Overwrite the description on every row with a single source label.
df["description"] = "538 polling average"

In [None]:
# Write the cleaned table to CSV without the index column. The output folder
# is created first so the export does not fail when data/processed/ is missing.
os.makedirs("data/processed", exist_ok=True)
df.to_csv("data/processed/2016_polling_average_states_538.csv", index=False)