# How did polling averages and results differ in 2020?

#### Import Python tools

In [1]:
%load_ext lab_black

In [73]:
import pandas as pd
import us
import tabula
import altair as alt
import altair_stiles as altstiles
import numpy as np

In [3]:
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("grid")

ThemeRegistry.enable('grid')

In [4]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000

---

## Read data

#### Make a list of the most 'competetive' states from 2020, [according to Inside Elections](http://www.insideelections.com/ratings/president/2020-presidential-ratings-october-28-2020)

In [306]:
competetive_src = pd.read_csv(
    "https://docs.google.com/spreadsheets/d/e/2PACX-1vStoirfqFdugAT8mfQFlmbVzgm8IKA2GS1_nfsysTMp2oXx7SpR6Sz5MiNoaRYPPcB5Fz7ZiN3Hx35U/pub?gid=0&single=true&output=csv",
    dtype={"year": str},
)

In [6]:
competetive = list(competetive_src[competetive_src["year"] == "2020"]["state"])

In [7]:
competetive

['Iowa',
 'Ohio',
 'Texas',
 'Arizona',
 'Florida',
 'Georgia',
 'North Carolina',
 'Michigan',
 'Pennsylvania',
 'Wisconsin',
 'Minnesota',
 'New Hampshire',
 'Nevada',
 'Alaska',
 'Kansas',
 'Missouri',
 'Montana',
 'South Carolina',
 'Utah']

#### Polls

In [8]:
polls = pd.read_csv("data/processed/2020_polling_average_states_538.csv")

In [307]:
polls

Unnamed: 0,state,gop_polling,dem_polling,gop_polling_margin,dem_polling_margin,year,description
0,Alabama,57.36,37.83,19.53,-19.53,2020,538 polling average
1,Alaska,51.23,43.57,7.66,-7.66,2020,538 polling average
2,Arizona,46.1,48.71,-2.6,2.6,2020,538 polling average
3,Arkansas,58.95,36.18,22.77,-22.77,2020,538 polling average
4,California,32.44,61.62,-29.19,29.19,2020,538 polling average
5,Colorado,41.15,53.6,-12.45,12.45,2020,538 polling average
6,Connecticut,32.37,58.62,-26.26,26.26,2020,538 polling average
7,Delaware,34.6,58.86,-24.25,24.25,2020,538 polling average
8,Florida,46.62,49.08,-2.46,2.46,2020,538 polling average
9,Georgia,47.37,48.54,-1.17,1.17,2020,538 polling average


#### Results

In [119]:
results = pd.read_csv("data/processed/2020_election_results_states_fec.csv")

In [120]:
results.head()

Unnamed: 0,state,gop_vote_pct,dem_vote_pct,other_vote_pct,year
0,Alabama,62.03,36.57,1.4,2020
1,Alaska,52.83,42.77,4.39,2020
2,Arizona,49.06,49.36,1.58,2020
3,Arkansas,62.4,34.78,2.83,2020
4,California,34.32,63.48,2.2,2020


In [121]:
df = pd.merge(polls, results, on=["state", "year"])

In [122]:
df = df[["year", "state", "gop_polling", "gop_vote_pct", "dem_polling", "dem_vote_pct"]]

In [123]:
df["gop_poll_vote_diff"] = df["gop_polling"] - df["gop_vote_pct"]
df["dem_poll_vote_diff"] = df["dem_polling"] - df["dem_vote_pct"]

In [124]:
df["winner"] = (
    df[["gop_vote_pct", "dem_vote_pct"]]
    .idxmax(axis=1)
    .str.replace("_vote_pct", "")
    .str.upper()
)

#### How off were the polls for each party?

In [125]:
df.gop_poll_vote_diff.mean().round(2)

-4.11

In [126]:
df.dem_poll_vote_diff.mean().round(2)

1.17

---

#### Just the competetive states

In [127]:
competetive_df = df[df["state"].isin(competetive)].copy()

In [278]:
competetive_df.state.unique()

array(['Alaska', 'Arizona', 'Florida', 'Georgia', 'Iowa', 'Kansas',
       'Michigan', 'Minnesota', 'Missouri', 'Montana', 'Nevada',
       'New Hampshire', 'North Carolina', 'Ohio', 'Pennsylvania',
       'South Carolina', 'Texas', 'Utah', 'Wisconsin'], dtype=object)

#### How off were the polls for each party in competetive states?

In [129]:
competetive_df.gop_poll_vote_diff.mean().round(2)

-3.92

In [130]:
competetive_df.dem_poll_vote_diff.mean().round(2)

1.06

#### More margins

In [131]:
competetive_df["gop_polling_margin"] = (
    competetive_df["gop_polling"] - competetive_df["dem_polling"]
)
competetive_df["dem_polling_margin"] = (
    competetive_df["dem_polling"] - competetive_df["gop_polling"]
)
competetive_df["dem_vote_margin"] = (
    competetive_df["gop_vote_pct"] - competetive_df["dem_vote_pct"]
)
competetive_df["dem_vote_margin"] = (
    competetive_df["dem_vote_pct"] - competetive_df["gop_vote_pct"]
)

---

#### Melt for Leah's bars idea

In [229]:
df_long_test = pd.melt(
    competetive_df,
    id_vars=["state", "winner"],
    value_vars=[
        "dem_vote_margin",
        "gop_vote_margin",
        "dem_polling_margin",
        "gop_polling_margin",
    ],
    var_name="measure",
    value_name="value",
)

In [231]:
df_long_test["type"] = np.where(
    df_long_test["measure"].str.contains("polling"), "Polling margin", "Vote margin"
)

In [232]:
df_long_test["measure"] = (
    df_long_test["measure"]
    .str.replace("_polling_margin", "")
    .str.replace("_vote_margin", "")
    .str.upper()
)

In [233]:
df_long_test["abs"] = df_long_test["value"].abs().round(1)

In [297]:
# states = [
#     "North Carolina",
#     "Ohio",
#     "Wisconsin",
#     "Nevada",
#     "Georgia",
#     "Pennsylvania",
#     "Arizona",
#     "Colorado",
#     "New Hampshire",
#     "Michigan",
#     "Florida",
#     "Minnesota",
#     "South Carolina",
#     "Utah",
#     "Kansas",
#     "West Virgina",
#     "Louisiana",
#     "South Dakota",
#     "Texas",
#     "Alabama",
# ]

In [298]:
# df_long_slim = df_long_test[df_long_test["state"].isin(states)]

In [302]:
df_long_slim = df_long_test.copy()

In [304]:
bars = (
    alt.Chart()
    .mark_bar()
    .encode(
        x=alt.X("value:Q", title=" ", axis=alt.Axis(tickCount=3)),
        y=alt.Y("type", title=""),
        color=alt.condition(
            alt.datum.value > 0,
            alt.value("#1851ac"),
            alt.value("#c62222"),
        ),
    )
)

text = (
    alt.Chart()
    .mark_text(dx=-20, dy=0, color="black")
    .encode(
        x=alt.X("value", sort="-x"),
        y=alt.Y("type"),
        text=alt.Text("value"),
    )
)

alt.layer(bars, text, data=df_long_slim[df_long_slim["measure"] == "DEM"]).properties(
    height=50, width=200
).facet(
    facet=alt.Facet(
        "state", title=" ", header=alt.Header(labelFontSize=14, labelFont="Summit Sans")
    ),
    columns=4,
)

In [305]:
df_long_slim[
    (df_long_slim["measure"] == "DEM") & (df_long_slim["type"] == "Polling margin")
]["value"].mean()

0.07578947368421116

In [296]:
df_long_slim[
    (df_long_slim["measure"] == "GOP") & (df_long_slim["type"] == "Polling margin")
]["value"].mean()

-1.5206666666666673

---

#### Melt the table for competetive races for charting

In [22]:
df_long = pd.melt(
    competetive_df,
    id_vars=["state"],
    value_vars=["dem_diff", "gop_diff"],
    var_name="party",
    value_name="diff_value",
)

In [23]:
df_long["party"] = (
    df_long["party"]
    .str.replace("dem_diff", "Democrats")
    .str.replace("gop_diff", "Republicans")
)

---

## Charts

In [27]:
alt.Chart(df_long).mark_bar().encode(
    x=alt.X("diff_value", title=" ", axis=alt.Axis()),
    y=alt.Y("state", title=" "),
    color=alt.condition(
        alt.datum.diff_value > 0,
        alt.value("#00d4d8"),  # The positive color
        alt.value("#d95f1a"),  # The negative color
    ),
    facet=alt.Facet(
        "party",
        columns=2,
        title=" ",
        header=alt.Header(labelFontSize=15, labelFont="Summit Sans"),
    ),
).properties(
    height=400,
    width=300,
    title="2020 presidential in competetive states: PPT difference between polling and result",
)

---

#### Export

In [25]:
competetive_df.to_csv("data/processed/competetive_races_2020.csv", index=False)