In this analysis, I want to take a look at two data sets from the 2020 election:
1. The Presidential votes by county
2. Election turnout by state

For the visuals, I'll be using Plotly Express.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime as dt

import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px

In [None]:
# Load the 2020 presidential results. Later cells read the state, county,
# candidate, party, total_votes and won columns from this frame.
president_counties_df = pd.read_csv(
    '../input/us-election-2020/president_county_candidate.csv'
)

In [None]:
# Vote total per candidate summed over all rows, ordered from most to fewest.
total_votes_df = (
    president_counties_df
    .groupby('candidate')['total_votes']
    .sum()
    .reset_index()
    .sort_values(by='total_votes', ascending=False)
)

Let's first take a look at the popular vote for all the candidates.

In [None]:
# Re-sort ascending by vote total before plotting.
df = total_votes_df.sort_values(by='total_votes')

# Horizontal bar chart: one bar per candidate, bar length = total votes.
fig1 = px.bar(
    data_frame=df,
    x='total_votes',
    y='candidate',
    orientation='h',
)

# Styling is applied after construction so the traces built above are untouched.
fig1.update_layout(height=800, template='simple_white')
fig1.show()


Because the data is so skewed, we can change the visualization by making the x axis a log scale.

In [None]:
df = total_votes_df.sort_values(by='total_votes')

# Point label: candidate name followed by the vote count with thousands separators.
df['text'] = [
    name + ' ' + format(votes, ',')
    for name, votes in zip(df['candidate'], df['total_votes'])
]

# Same data as a scatter plot with a logarithmic x axis.
fig1 = px.scatter(
    data_frame=df,
    x='total_votes',
    y='candidate',
    text='text',
    log_x=True,
)

# Draw each label to the right of its point.
fig1.update_traces(textposition='middle right')

# The candidate name is already part of the label, so the y tick labels are hidden.
fig1.update_layout(
    height=800,
    template='simple_white',
    yaxis={'showticklabels': False},
)
fig1.show()


In [None]:
# Boolean mask selecting the rows flagged as the winning candidate.
# (To keep only the two major candidates, additionally require
# candidate.isin(['Joe Biden', 'Donald Trump']).)
filt = president_counties_df['won'].eq(True)

counties_won_df = president_counties_df[filt]

In [None]:
# Number of winning rows (non-null county values) per candidate.
counties_won_df.groupby('candidate')['county'].count().reset_index(drop=False)

It's really interesting to see how Trump won two-thirds of counties yet did not win the popular vote. The other interesting thing is that the data contains far more counties than actually exist: last I checked, there were 3,142 counties in the US. However, counties may be defined differently for election purposes.

In [None]:
# Number of distinct (state, county) pairs present in the results.
president_counties_df.groupby(['state', 'county']).count().shape[0]

Let's take a look at which counties were won by the biggest margin.

In [None]:
# One row per (state, county) with a vote-total column for each of the two
# major candidates; every other candidate is filtered out first.
president_counties_pivot = pd.pivot_table(
    president_counties_df.loc[
        president_counties_df['candidate'].isin(['Joe Biden', 'Donald Trump'])
    ],
    index=['state', 'county'],
    columns='candidate',
    values='total_votes',
    aggfunc='sum',
)

In [None]:
# Each candidate's share of the two-party vote (Trump + Biden) in the county.
president_counties_pivot['Donald Trump %'] = president_counties_pivot['Donald Trump'].div(
    president_counties_pivot['Donald Trump'] + president_counties_pivot['Joe Biden']
)
president_counties_pivot['Joe Biden %'] = president_counties_pivot['Joe Biden'].div(
    president_counties_pivot['Donald Trump'] + president_counties_pivot['Joe Biden']
)

In [None]:
# Margin = Biden share minus Trump share: positive where Biden leads,
# negative where Trump leads.
president_counties_pivot['margin'] = president_counties_pivot['Joe Biden %'].sub(
    president_counties_pivot['Donald Trump %']
)

In [None]:
# Ascending margin puts the most negative values (largest Trump lead) first.
donald_trump_top_counties = president_counties_pivot.sort_values(
    'margin', ascending=True
)
donald_trump_top_counties.head(n=10)

In [None]:
# Descending margin puts the most positive values (largest Biden lead) first.
joe_biden_top_counties = president_counties_pivot.sort_values(
    'margin', ascending=False
)
joe_biden_top_counties.head(n=10)

# Charting Counties Using Plotly

In [None]:
# The election data has no FIPS code, so bring it in from a second data set.
# 'fips' is read as a string column rather than being parsed as a number.
counties_fips = pd.read_csv(
    '../input/covid19-us-county-jhu-data-demographics/us_county.csv',
    dtype={'fips': 'str'},
)

In [None]:
# Attach the FIPS data to each result row by matching on state and county name.
# The inner join drops rows whose (state, county) pair has no match.
president_counties_df_fips = president_counties_df.merge(
    counties_fips,
    how='inner',
    left_on=['state', 'county'],
    right_on=['state', 'county'],
)

In [None]:
# 'blue' marks rows where a DEM candidate won; every other row is labelled 'red'.
president_counties_df_fips['color'] = (
    (president_counties_df_fips['party'] == 'DEM')
    & (president_counties_df_fips['won'] == True)
).map({True: 'blue', False: 'red'})

In [None]:
# Keep only the winning candidate's row for each county.
counties_fips_color = president_counties_df_fips.loc[
    president_counties_df_fips['won'].eq(True)
]

In [None]:
# Save the winners-with-FIPS table to the working directory (index included).
counties_fips_color.to_csv(path_or_buf='president_counties.csv')

In [None]:
# FIPS county codes are five characters wide; codes that lost their leading
# zeros are left-padded back to width 5.
# `assign` builds a new frame instead of writing a column into the filtered
# slice, which avoids pandas' SettingWithCopyWarning, and `.str.zfill`
# propagates a missing code as NaN instead of raising on it.
counties_fips_color = counties_fips_color.assign(
    fips_2=counties_fips_color['fips'].str.zfill(5)
)

In [None]:
# Preview the first five rows.
counties_fips_color.head(n=5)

In [None]:
from urllib.request import urlopen
import json

# County GeoJSON from the plotly datasets repository.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

df = counties_fips_color

# The 'color' column holds the labels 'blue' / 'red'. Without an explicit map,
# Plotly Express treats them as arbitrary categories and hands out palette
# colours in order of first appearance, so each label is pinned to the CSS
# colour of the same name.
fig = px.choropleth(
    df,
    geojson=counties,
    locations='fips_2',
    color='color',
    color_discrete_map={'blue': 'blue', 'red': 'red'},
    scope="usa",
    hover_data=["state", "county", "candidate"],
)
# Remove the outer margins and hide the legend.
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0}, showlegend=False)
fig.show()


Some of the states are grey, which means there must be an issue with some of the counties in a few states.

In [None]:
# County names used for Alaska in the election results.
president_counties_df.loc[
    president_counties_df['state'] == 'Alaska', 'county'
].unique()

In [None]:
# County names used for Alaska in the FIPS data set, for comparison.
counties_fips.loc[counties_fips['state'] == 'Alaska', 'county'].unique()

As I expected, the county names are different in the data sets... Oh the joy of data science.

# Voter Turnout

Let's take a look at how many people turned out to vote in each state.

In [None]:
# Load the state-level turnout table for the November 2020 general election.
voter_turnout_df = pd.read_csv(
    '../input/2020-us-general-election-turnout-rates/2020 November General Election - Turnout Rates.csv'
)

In [None]:
# Preview the first five rows.
voter_turnout_df.head(n=5)

In [None]:
# Convert the turnout rate from percent text to a fraction: strip the
# trailing '%', parse as float, divide by 100.
# NOTE: this relies on the column still holding strings, so the cell
# cannot be run twice on the same frame.
voter_turnout_df['VEP Turnout Rate'] = (
    voter_turnout_df['VEP Turnout Rate']
    .str.rstrip('%')
    .astype('float')
    .div(100.0)
)

In [None]:
# Alias, not a copy: the 'text' column added below lands on voter_turnout_df too.
df = voter_turnout_df

# Hover label: one field per line, joined with HTML line breaks.
df['text'] = df.apply(
    lambda row: '<br>'.join([
        row['State'],
        'Turnout Rate: ' + "{:.1%}".format(row['VEP Turnout Rate']),
        'Total Ballots Counted: ' + row['Total Ballots Counted (Estimate)'],
        'Voting-Eligible Population: ' + row['Voting-Eligible Population (VEP)'],
    ]),
    axis=1,
)

# State-level choropleth shaded by turnout rate.
fig2 = go.Figure(
    go.Choropleth(
        locations=df['State Abv'],
        locationmode='USA-states',
        z=df['VEP Turnout Rate'],
        colorscale='Greens',
        autocolorscale=False,
        text=df['text'],  # shown on hover
        hoverinfo="text",  # show only the custom label
        marker={'line': {'color': 'white'}},  # white borders between states
        colorbar_title="Voter Turnout Rates",
    )
)

fig2.update_layout(
    title_text='',
    geo={
        'scope': 'usa',
        'projection': {'type': 'albers usa'},
        'showlakes': True,  # draw lakes...
        'lakecolor': 'rgb(255, 255, 255)',  # ...in white
    },
)


fig2.show()

In [None]:
# The ten rows with the highest turnout rate, highest first.
top_10_voter_turnout = (
    voter_turnout_df[['State', 'VEP Turnout Rate']]
    .sort_values(by='VEP Turnout Rate', ascending=False)
    .head(10)
)
top_10_voter_turnout

In [None]:
# The last ten rows of the same descending sort, so the final row is the
# lowest turnout rate.
bottom_10_voter_turnout = (
    voter_turnout_df[['State', 'VEP Turnout Rate']]
    .sort_values(by='VEP Turnout Rate', ascending=False)
    .tail(10)
)
bottom_10_voter_turnout