### 2016 - 2020 Presidential Election Comparison

In [2]:
# Imports
import os
import pandas as pd

In [9]:
def print_df_overview(df, title, abridged_rows=16):
    """Print a plain-text overview of a DataFrame to help plan its cleaning.

    Prints the head of the frame, its dtypes and then, for every column, the
    value counts of that column. Columns with many distinct values are
    abridged to the first and last ``abridged_rows`` entries of the counts.

    Args:
        df: DataFrame to summarise.
        title: Label prefixed to every section header.
        abridged_rows: Number of entries shown from each end of a long
            value-counts listing. A column with fewer than
            ``2 * abridged_rows`` distinct values is printed in full.

    Raises:
        ValueError: If ``abridged_rows`` is negative.
    """
    if abridged_rows < 0:
        raise ValueError("abridged_rows must be non-negative")

    print(f"{title} DF Head:")
    print(df.head())
    print("----------------------------------")
    print(f"{title} DF dtypes:")
    print(df.dtypes)

    for col in df.columns:
        print("----------------------------------")
        print(f"{title} DF column value counts:")
        # Some columns have a very large number of distinct values, but it is
        # still worth seeing at least the beginning and end of the counts to
        # spot things like unexpected duplicates or one-off values.
        counts = df[col].value_counts()
        if counts.size < 2 * abridged_rows:
            # Short enough to print the counts all together
            print(col, "counts:")
            print(counts)
        else:
            # head/tail are always positional, even when the counts are
            # indexed by integers (e.g. a numeric column), and they behave
            # sensibly when abridged_rows is 0.
            print(col, "top counts:")
            print(counts.head(abridged_rows))
            print(col, "bottom counts:")
            print(counts.tail(abridged_rows))

In [3]:
# Load the raw precinct-level results for both elections -- takes several seconds
resources_dir = os.path.join('.', 'resources')
path_2016 = os.path.join(resources_dir, '2016', '2016-precinct-president.csv')
path_2020 = os.path.join(resources_dir, '2020', 'PRESIDENT_precinct_general.csv')

# The encoding of the files wasn't documented, but ISO-8859-1 seems to work fine.
# For 2016 a handful of column types are pinned only to suppress read_csv
# warnings; for 2020 the types are actually provided with the data.
dtypes_2016 = dict.fromkeys(
    ["precinct", "district", "party", "candidate_fec", "candidate_fec_name"],
    str,
)
raw_2016_df = pd.read_csv(path_2016, dtype=dtypes_2016, encoding="ISO-8859-1")

official_2020_dtypes = {
    'precinct': str,
    'office': str,
    'party_detailed': str,
    'party_simplified': str,
    'mode': str,
    'votes': int,
    'county_name': str,
    'county_fips': str,
    'jurisdiction_name': str,
    'jurisdiction_fips': str,
    'candidate': str,
    'district': str,
    'dataverse': str,
    'year': int,
    'stage': str,
    'state': str,
    'special': str,
    'writein': str,
    'state_po': str,
    'state_fips': str,
    'state_cen': str,
    'state_ic': str,
    'date': str,
    'readme_check': str,
    'magnitude': int,
}
raw_2020_df = pd.read_csv(path_2020, dtype=official_2020_dtypes, encoding="ISO-8859-1")

raw_2016_df.shape, raw_2020_df.shape

((1989234, 37), (1982581, 25))

In [11]:
# Optional diagnostics: these calls don't affect later cells; uncomment them to print an overview of each raw DF to inform cleaning
#print_df_overview(raw_2016_df, "2016 Raw")
#print_df_overview(raw_2020_df, "2020 Raw")

2020 DF Head:
                    precinct        office            party_detailed  \
0  061110097162_003024019037  US PRESIDENT                  ALLIANCE   
1  061110097162_003024019037  US PRESIDENT                  DEMOCRAT   
2  061110097162_003024019037  US PRESIDENT                     GREEN   
3  061110097162_003024019037  US PRESIDENT               LIBERTARIAN   
4  061110097162_003024019037  US PRESIDENT  SOCIALISM AND LIBERATION   

  party_simplified      mode  votes county_name county_fips jurisdiction_name  \
0            OTHER  ABSENTEE      0     VENTURA        6111           VENTURA   
1         DEMOCRAT  ABSENTEE      1     VENTURA        6111           VENTURA   
2            OTHER  ABSENTEE      0     VENTURA        6111           VENTURA   
3      LIBERTARIAN  ABSENTEE      0     VENTURA        6111           VENTURA   
4            OTHER  ABSENTEE      0     VENTURA        6111           VENTURA   

  jurisdiction_fips  ...       state special writein  state_po sta

In [12]:
# Clean copies of the raw frames, leaving the originals untouched for comparison
c_2016_df = raw_2016_df.copy()
c_2020_df = raw_2020_df.copy()
print("Initial:                 ", c_2016_df.shape, c_2020_df.shape)

# The results include "statistical adjustments" rows, which are ignored for the moment.
# Note they are labelled differently (and in a different column) in each year.
c_2016_df = c_2016_df[~(c_2016_df["precinct"] == "Statistical Adjustments")]
c_2020_df = c_2020_df[~(c_2020_df["jurisdiction_name"] == "{STATISTICAL ADJUSTMENTS}")]
print("Drop adjustments:        ", c_2016_df.shape, c_2020_df.shape)

# Negative vote counts are removed as well -- it is unclear what they mean
c_2016_df = c_2016_df[~(c_2016_df["votes"] < 0)]
c_2020_df = c_2020_df[~(c_2020_df["votes"] < 0)]
print("Drop negative votes:     ", c_2016_df.shape, c_2020_df.shape)

# Keep only the columns of interest (the jurisdiction column is named differently per year)
c_2016_df = c_2016_df[["state", "county_name", "jurisdiction", "candidate", "votes"]]
c_2020_df = c_2020_df[["state", "county_name", "jurisdiction_name", "candidate", "votes"]]
print("Drop extraneous columns: ", c_2016_df.shape, c_2020_df.shape)

# Report how many rows and votes survived the cleaning, year by year
for label, before, after in (
    ("2016", raw_2016_df, c_2016_df),
    ("2020", raw_2020_df, c_2020_df),
):
    print(f"{label}:")
    print(f"Original: {before.shape[0]:7} rows, {before['votes'].sum():10} votes")
    print(f"Cleaned:  {after.shape[0]:7} rows, {after['votes'].sum():10} votes")


Initial:                  (1989234, 37) (1982581, 25)
Drop adjustments:         (1988506, 37) (1982207, 25)
Drop negative votes:      (1988502, 37) (1974972, 25)
Drop extraneous columns:  (1988502, 5) (1974972, 5)
2016:
Original: 1989234 rows,  140070880 votes
Cleaned:  1988502 rows,  140064901 votes
2020:
Original: 1982581 rows,  157743486 votes
Cleaned:  1974972 rows,  157753369 votes


In [14]:
# Diagnostics only -- nothing later depends on this; it prints info about the cleaned DFs to inform cleaning
for overview_df, overview_title in ((c_2016_df, "2016 Cleaned"), (c_2020_df, "2020 Cleaned")):
    print_df_overview(overview_df, overview_title)

2016 Cleaned DF Head:
     state     county_name jurisdiction        candidate  votes
0  Alabama  Autauga County      Autauga  Hillary Clinton    135
1  Alabama  Autauga County      Autauga     Gary Johnson      0
2  Alabama  Autauga County      Autauga       Jill Stein      1
3  Alabama  Autauga County      Autauga     Donald Trump    218
4  Alabama  Autauga County      Autauga       [Write-in]      4
----------------------------------
2016 Cleaned DF dtypes:
state           object
county_name     object
jurisdiction    object
candidate       object
votes            int64
dtype: object
----------------------------------
2016 Cleaned DF column value counts:
state top counts:
state
California        301194
New York          208096
Maryland          119860
Arkansas           89170
Colorado           84284
New Jersey         70930
Kansas             69935
Michigan           65273
Wisconsin          61812
North Carolina     60707
Louisiana          52416
Washington         49921
Illinois  