# How did polling averages and results differ in 2020?

#### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import us
import tabula
import altair as alt
import altair_stiles as altstiles
import numpy as np

In [3]:
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('grid')

In [4]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000

---

## Read data

#### Make a list of the most 'competetive' states from 2020, [according to Inside Elections](http://www.insideelections.com/ratings/president/2020-presidential-ratings-october-28-2020)

In [5]:
competetive_src = pd.read_csv(
    "https://docs.google.com/spreadsheets/d/e/2PACX-1vStoirfqFdugAT8mfQFlmbVzgm8IKA2GS1_nfsysTMp2oXx7SpR6Sz5MiNoaRYPPcB5Fz7ZiN3Hx35U/pub?gid=0&single=true&output=csv",
    dtype={"year": str},
)

In [6]:
competetive = list(competetive_src[competetive_src["year"] == "2020"]["state"])

#### Polls

In [7]:
polls = pd.read_csv("data/processed/2020_polling_average_states_538.csv")

In [8]:
polls.head()

Unnamed: 0,state,gop_polling,dem_polling,gop_polling_margin,dem_polling_margin,year,description
0,Alabama,57.36,37.83,19.53,-19.53,2020,538 polling average
1,Alaska,51.23,43.57,7.66,-7.66,2020,538 polling average
2,Arizona,46.1,48.71,-2.6,2.6,2020,538 polling average
3,Arkansas,58.95,36.18,22.77,-22.77,2020,538 polling average
4,California,32.44,61.62,-29.19,29.19,2020,538 polling average


#### Results

In [9]:
results = pd.read_csv("data/processed/2020_election_results_states_fec.csv")

In [10]:
results.head()

Unnamed: 0,state,gop_vote_pct,dem_vote_pct,other_vote_pct,year
0,Alabama,62.03,36.57,1.4,2020
1,Alaska,52.83,42.77,4.39,2020
2,Arizona,49.06,49.36,1.58,2020
3,Arkansas,62.4,34.78,2.83,2020
4,California,34.32,63.48,2.2,2020


#### Merge 'em

In [11]:
df = pd.merge(polls, results, on=["state", "year"])

#### Just the columns we need

In [12]:
df = df[["year", "state", "gop_polling", "gop_vote_pct", "dem_polling", "dem_vote_pct"]]

#### Difference between polls and vote, by party

In [13]:
df["gop_poll_vote_diff"] = df["gop_polling"] - df["gop_vote_pct"]
df["dem_poll_vote_diff"] = df["dem_polling"] - df["dem_vote_pct"]

#### Who's the winner

In [14]:
df["winner"] = (
    df[["gop_vote_pct", "dem_vote_pct"]]
    .idxmax(axis=1)
    .str.replace("_vote_pct", "")
    .str.upper()
)

#### How off were the polls for each party?

In [15]:
df.gop_poll_vote_diff.mean().round(2)

-4.11

In [16]:
df.dem_poll_vote_diff.mean().round(2)

1.17

---

#### Just the competetive states

In [17]:
competetive_df = df[df["state"].isin(competetive)].copy()

#### How off were the polls for each party in competetive states?

In [18]:
competetive_df.gop_poll_vote_diff.mean().round(2)

-3.92

In [19]:
competetive_df.dem_poll_vote_diff.mean().round(2)

1.06

#### More margins

In [20]:
competetive_df["gop_polling_margin"] = (
    competetive_df["gop_polling"] - competetive_df["dem_polling"]
)
competetive_df["dem_polling_margin"] = (
    competetive_df["dem_polling"] - competetive_df["gop_polling"]
)
competetive_df["other_polling_pct"] = 100 - (
    competetive_df["dem_polling"] + competetive_df["gop_polling"]
)
competetive_df["dem_vote_margin"] = (
    competetive_df["gop_vote_pct"] - competetive_df["dem_vote_pct"]
)
competetive_df["dem_vote_margin"] = (
    competetive_df["dem_vote_pct"] - competetive_df["gop_vote_pct"]
)
competetive_df["other_vote_pct"] = 100 - (
    competetive_df["dem_vote_pct"] + competetive_df["gop_vote_pct"]
)

In [21]:
competetive_df[competetive_df["state"] == "Alaska"]

Unnamed: 0,year,state,gop_polling,gop_vote_pct,dem_polling,dem_vote_pct,gop_poll_vote_diff,dem_poll_vote_diff,winner,gop_polling_margin,dem_polling_margin,other_polling_pct,dem_vote_margin,other_vote_pct
1,2020,Alaska,51.23,52.83,43.57,42.77,-1.6,0.8,GOP,7.66,-7.66,5.2,-10.06,4.4


---

#### Share data for stacked horizontal bars, grouped by state

In [22]:
competetive_df.columns

Index(['year', 'state', 'gop_polling', 'gop_vote_pct', 'dem_polling',
       'dem_vote_pct', 'gop_poll_vote_diff', 'dem_poll_vote_diff', 'winner',
       'gop_polling_margin', 'dem_polling_margin', 'other_polling_pct',
       'dem_vote_margin', 'other_vote_pct'],
      dtype='object')

In [23]:
competetive_df = competetive_df[
    [
        "state",
        "winner",
        "gop_vote_pct",
        "dem_vote_pct",
        "other_vote_pct",
        "gop_polling",
        "dem_polling",
        "other_polling_pct",
    ]
].copy()

In [24]:
df_long = pd.melt(
    competetive_df,
    id_vars=["state", "winner"],
    value_vars=[
        "gop_vote_pct",
        "dem_vote_pct",
        "other_vote_pct",
        "gop_polling",
        "dem_polling",
        "other_polling_pct",
    ],
    var_name="measure",
    value_name="value",
)

In [25]:
df_long[df_long["state"] == "Alaska"]

Unnamed: 0,state,winner,measure,value
0,Alaska,GOP,gop_vote_pct,52.83
19,Alaska,GOP,dem_vote_pct,42.77
38,Alaska,GOP,other_vote_pct,4.4
57,Alaska,GOP,gop_polling,51.23
76,Alaska,GOP,dem_polling,43.57
95,Alaska,GOP,other_polling_pct,5.2


In [26]:
df_long["type"] = np.where(
    df_long["measure"].str.contains("polling"), "Polling", "Vote"
)

In [27]:
df_long["measure"] = (
    df_long["measure"]
    .str.replace("_vote_pct", "")
    .str.replace("_polling_pct", "")
    .str.replace("_polling", "")
    .str.upper()
)

In [28]:
df_long_slim = df_long.copy()

In [29]:
def set_order(df):
    if df["measure"] == "GOP":
        return 1
    elif df["measure"] == "DEM":
        return 3
    else:
        return 2

In [30]:
df_long_slim["order"] = df_long_slim.apply(set_order, axis=1)

In [31]:
ticks = dict(
    zip(
        df_long_slim[
            (df_long_slim["type"] == "Polling") & (df_long_slim["measure"] == "GOP")
        ]["state"],
        (
            df_long_slim[
                (df_long_slim["type"] == "Polling") & (df_long_slim["measure"] == "GOP")
            ]["value"]
            / 100
        ).round(4),
    )
)

In [32]:
ticks

{'Alaska': 0.5123,
 'Arizona': 0.461,
 'Florida': 0.4662,
 'Georgia': 0.4737,
 'Iowa': 0.476,
 'Kansas': 0.5386,
 'Michigan': 0.4323,
 'Minnesota': 0.4267,
 'Missouri': 0.5161,
 'Montana': 0.4979,
 'Nevada': 0.4436,
 'New Hampshire': 0.4283,
 'North Carolina': 0.4712,
 'Ohio': 0.4753,
 'Pennsylvania': 0.4555,
 'South Carolina': 0.5159,
 'Texas': 0.4859,
 'Utah': 0.5192,
 'Wisconsin': 0.4371}

In [33]:
df_long_slim["tick"] = df_long_slim["state"].map(ticks)

In [34]:
df_long_slim[df_long_slim["state"] == "Alaska"]

Unnamed: 0,state,winner,measure,value,type,order,tick
0,Alaska,GOP,GOP,52.83,Vote,1,0.5123
19,Alaska,GOP,DEM,42.77,Vote,3,0.5123
38,Alaska,GOP,OTHER,4.4,Vote,2,0.5123
57,Alaska,GOP,GOP,51.23,Polling,1,0.5123
76,Alaska,GOP,DEM,43.57,Polling,3,0.5123
95,Alaska,GOP,OTHER,5.2,Polling,2,0.5123


In [35]:
line = alt.Chart(pd.DataFrame({"y": [1]})).mark_rule().encode(y="y")

bars = (
    alt.Chart(df_long_slim)
    .mark_bar()
    .encode(
        x=alt.X(
            "value:Q",
            title="",
            stack="normalize",
            axis=alt.Axis(format="%", tickCount=3),
        ),
        y=alt.Y("type:N", title=""),
        color=alt.Color(
            "measure",
            legend=None,
            sort=["GOP", "OTHER", "DEM"],
            scale=alt.Scale(
                domain=["GOP", "OTHER", "DEM"], range=["#e56a55", "#ffffcc", "#5d94d6"]
            ),
        ),
        # facet=alt.Facet("state", columns=4),
        order="order",
    )
)

rule = (
    alt.Chart()
    .mark_rule(color="#1a1a1a", strokeDash=[2, 3])
    .encode(x="tick", order="order")
)

alt.layer(bars, rule, data=df_long_slim).properties(height=50, width=180).facet(
    facet=alt.Facet(
        "state", title=" ", header=alt.Header(labelFontSize=14, labelFont="Summit Sans")
    ),
    columns=4,
)

In [49]:
bars = (
    alt.Chart(df_long_slim)
    .mark_bar()
    .encode(
        x=alt.X(
            "value:Q",
            title="",
            stack="normalize",
            axis=alt.Axis(format="%", tickCount=3, values=[0.50]),
        ),
        y=alt.Y("type:N", title=""),
        color=alt.Color(
            "measure",
            legend=None,
            sort=["GOP", "OTHER", "DEM"],
            scale=alt.Scale(
                domain=["GOP", "OTHER", "DEM"], range=["#e56a55", "#ffffcc", "#5d94d6"]
            ),
        ),
        # facet=alt.Facet("state", columns=4),
        order="order",
    )
)

rule = (
    alt.Chart()
    .mark_rule(color="#1a1a1a", strokeDash=[2, 3])
    .encode(x="tick", order="order")
)

alt.layer(bars, rule, data=df_long_slim).properties(height=40, width=135).facet(
    facet=alt.Facet(
        "state", title=" ", header=alt.Header(labelFontSize=14, labelFont="Summit Sans")
    ),
    columns=4,
)

#### Export

In [37]:
competetive_df.to_csv("data/processed/competetive_races_2020.csv", index=False)
df_dw[["state", "type", "GOP", "OTHER", "DEM"]].to_csv(
    "data/processed/competetive_races_2020_DW_test.csv", index=False
)

NameError: name 'df_dw' is not defined