# US presidential election results by county: 2000-2020
> This notebook reads and processes county-level results collected by [MIT's election lab](https://electionlab.mit.edu/data) and merges them with geography files.

---

#### Import Python tools and Jupyter config

In [202]:
import us
import pandas as pd
import jupyter_black
import geopandas as gpd

In [203]:
# Turn on Black formatting for notebook cells
jupyter_black.load()

# Let pandas render up to 100 rows/columns and never truncate cell text
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", None)

---

## Fetch

#### Read local data

In [204]:
# Source (Harvard/MIT): https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/VOQCHQ
# Codebook: 'raw/mit/County Presidential Returns 2000-2020.md'
county_returns_raw = pd.read_csv(
    "data/raw/mit/countypres_2000-2020.csv",
    # read identifier-like fields as text rather than numbers
    dtype={"county_fips": str, "year": str, "version": str},
)

# Keep the TOTAL-mode rows for the two major parties where votes were recorded
major_party_totals = county_returns_raw.query(
    'mode == "TOTAL" and party.str.contains("DEMOCRAT|REPUBLICAN") and totalvotes>0'
)

# Only county-level geographies (not federal precincts, overseas votes, etc.):
# rows without a county FIPS code are dropped
counties_src = major_party_totals.dropna(subset="county_fips")

---

## Process

#### Zero-pad county FIPS codes to five digits

In [205]:
# Left-pad every FIPS code with zeros to a width of five characters
counties_src = counties_src.assign(
    county_fips=lambda frame: frame["county_fips"].str.zfill(5)
)

#### Wide format so parties are columns

In [206]:
# One row per county and election year; one column per (measure, party) pair
pivot_keys = ["county_fips", "county_name", "state_po", "year"]
pivot_measures = ["candidatevotes", "totalvotes"]

counties_pivot = counties_src.pivot(
    index=pivot_keys,
    columns=["party"],
    values=pivot_measures,
)
# Move the key fields out of the index and back into ordinary columns
counties_pivot = counties_pivot.reset_index()

#### That creates a multiindex. Flatten it. 

In [207]:
def _flatten_column(levels):
    """Collapse one multi-level column tuple into a single short, lowercase label."""
    # Join the non-empty levels, e.g. ("candidatevotes", "DEMOCRAT") -> "candidatevotes_democrat"
    label = "_".join(part for part in levels if part).strip().lower()
    # Replacements are applied in this order; later ones rely on the earlier results
    for old, new in (
        ("candidate", ""),
        ("totalvotes_democrat", "votes_all"),
        ("ocrat", ""),
        ("ublican", ""),
    ):
        label = label.replace(old, new)
    return label


counties_pivot.columns = [_flatten_column(col) for col in counties_pivot.columns]

#### Clean dataframe

In [254]:
# Work on an independent copy without the "totalvotes_rep" column
counties_df = counties_pivot.drop(columns=["totalvotes_rep"]).copy()

#### Share for each party

In [255]:
# Each party's votes as a fraction of all votes cast, rounded to two decimals
counties_df["dem_pct"] = counties_df["votes_dem"].div(counties_df["votes_all"]).round(2)
counties_df["rep_pct"] = counties_df["votes_rep"].div(counties_df["votes_all"]).round(2)

#### One place: Dallas County, Texas

In [256]:
# Spot-check one county across the election years
dallas_filter = 'county_name == "DALLAS" and state_po == "TX"'
counties_df.query(dallas_filter)

Unnamed: 0,county_fips,county_name,state_po,year,votes_dem,votes_rep,votes_all,dem_pct,rep_pct
14825,48113,DALLAS,TX,2000,275308,322345,613039,0.45,0.53
14826,48113,DALLAS,TX,2004,336641,346246,687709,0.49,0.5
14827,48113,DALLAS,TX,2008,422989,310000,738463,0.57,0.42
14828,48113,DALLAS,TX,2012,405571,295813,710117,0.57,0.42
14829,48113,DALLAS,TX,2016,461080,262945,758973,0.61,0.35
14830,48113,DALLAS,TX,2020,598576,307076,919504,0.65,0.33


---

## Aggregate

#### Calculate the winner for each row

In [257]:
# Function to name winner
def calculate_winner(row):
    """Name the party that leads in a single county-year row.

    Raw vote counts (``votes_dem`` / ``votes_rep``) are compared when the row
    carries them. The ``dem_pct`` / ``rep_pct`` shares in this notebook are
    rounded to two decimals, so comparing shares alone can report a "tie" for
    a race that was merely close. Rows without the vote columns fall back to
    comparing the shares.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide either ``votes_dem`` and ``votes_rep``, or ``dem_pct``
        and ``rep_pct``.

    Returns
    -------
    str
        ``"dem"``, ``"rep"`` or ``"tie"``. Missing (NaN) values compare as
        neither greater nor smaller, so such rows are reported as ``"tie"``.
    """
    if "votes_dem" in row and "votes_rep" in row:
        dem, rep = row["votes_dem"], row["votes_rep"]
    else:
        dem, rep = row["dem_pct"], row["rep_pct"]

    if dem > rep:
        return "dem"
    if dem < rep:
        return "rep"
    return "tie"


# Label every county-year row with its winner (the function receives one row at a time)
counties_df["winner"] = counties_df.apply(calculate_winner, axis="columns")

#### Change from 2000 to 2020 by place

In [258]:
# Cast 'year' to integers and 'county_fips' to strings in a single step
counties_df = counties_df.astype({"year": int, "county_fips": str})


# Function to calculate the percentage point difference between two elections for one county
# NOTE(review): a later cell in this notebook defines `calculate_ppt_diff` again with a
# six-value return; on a top-to-bottom run that later definition replaces this one.
def calculate_ppt_diff(df, start_year=2000, end_year=2020):
    """Return the change in each party's vote share between two elections.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single county with ``year``, ``dem_pct`` and ``rep_pct``
        columns. ``year`` may hold integers or numeric strings.
    start_year, end_year : int, optional
        Elections to compare. Default to 2000 and 2020.

    Returns
    -------
    tuple
        ``(dem_diff, rep_diff)``: the ``end_year`` share minus the
        ``start_year`` share for each party, or ``(None, None)`` when
        either year has no row. If a year has several rows the first is used.
    """
    # Compare years numerically so string years are matched too;
    # unparseable values become NaN and never match.
    years = pd.to_numeric(df["year"], errors="coerce")
    start_rows = df[years == start_year]
    end_rows = df[years == end_year]

    # Both elections are needed to compute a change
    if start_rows.empty or end_rows.empty:
        return None, None

    dem_diff = end_rows["dem_pct"].iloc[0] - start_rows["dem_pct"].iloc[0]
    rep_diff = end_rows["rep_pct"].iloc[0] - start_rows["rep_pct"].iloc[0]
    return dem_diff, rep_diff


# Build one summary record per county, identified by FIPS code, name and state
group_keys = ["county_fips", "county_name", "state_po"]
results = []
for key_values, county_rows in counties_df.groupby(group_keys):
    record = dict(zip(group_keys, key_values))
    record["dem_pct_diff"], record["rep_pct_diff"] = calculate_ppt_diff(county_rows)
    results.append(record)

# Assemble the records into the county-level change table
counties_change_df = pd.DataFrame(results)

In [261]:
# Cast 'year' to integers and 'county_fips' to strings in a single step
counties_df = counties_df.astype({"year": int, "county_fips": str})


# Function to calculate the percentage point difference between two elections for one county
def calculate_ppt_diff(df, start_year=2000, end_year=2020):
    """Return both elections' vote shares and the change between them.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single county with ``year``, ``dem_pct`` and ``rep_pct``
        columns. ``year`` may hold integers or numeric strings.
    start_year, end_year : int, optional
        Elections to compare. Default to 2000 and 2020.

    Returns
    -------
    tuple
        ``(dem_start, rep_start, dem_end, rep_end, dem_diff, rep_diff)``,
        where each diff is the ``end_year`` share minus the ``start_year``
        share. All six values are ``None`` when either year has no row.
        If a year has several rows the first is used.
    """
    # Compare years numerically so string years are matched too;
    # unparseable values become NaN and never match.
    years = pd.to_numeric(df["year"], errors="coerce")
    start_rows = df[years == start_year]
    end_rows = df[years == end_year]

    # Both elections are needed to compute a change
    if start_rows.empty or end_rows.empty:
        return None, None, None, None, None, None

    dem_start = start_rows["dem_pct"].iloc[0]
    rep_start = start_rows["rep_pct"].iloc[0]
    dem_end = end_rows["dem_pct"].iloc[0]
    rep_end = end_rows["rep_pct"].iloc[0]

    dem_diff = dem_end - dem_start
    rep_diff = rep_end - rep_start

    return dem_start, rep_start, dem_end, rep_end, dem_diff, rep_diff


# Build one summary record per county, identified by FIPS code, name and state
group_keys = ["county_fips", "county_name", "state_po"]
# Output fields, in the same order as the values returned by calculate_ppt_diff
value_fields = [
    "dem_pct_2000",
    "rep_pct_2000",
    "dem_pct_2020",
    "rep_pct_2020",
    "dem_pct_diff",
    "rep_pct_diff",
]

results = []
for key_values, county_rows in counties_df.groupby(group_keys):
    record = dict(zip(group_keys, key_values))
    record.update(zip(value_fields, calculate_ppt_diff(county_rows)))
    results.append(record)

# Assemble the records into the county-level change table
counties_change_df = pd.DataFrame(results)

In [262]:
# Show the resulting county-level change table
counties_change_df

Unnamed: 0,county_fips,county_name,state_po,dem_pct_2000,rep_pct_2000,dem_pct_2020,rep_pct_2020,dem_pct_diff,rep_pct_diff
0,01001,AUTAUGA,AL,0.29,0.70,0.27,0.71,-0.02,0.01
1,01003,BALDWIN,AL,0.25,0.72,0.22,0.76,-0.03,0.04
2,01005,BARBOUR,AL,0.50,0.49,0.46,0.53,-0.04,0.04
3,01007,BIBB,AL,0.38,0.60,0.21,0.78,-0.17,0.18
4,01009,BLOUNT,AL,0.28,0.70,0.10,0.90,-0.18,0.20
...,...,...,...,...,...,...,...,...,...
3151,56037,SWEETWATER,WY,0.35,0.60,0.23,0.73,-0.12,0.13
3152,56039,TETON,WY,0.39,0.52,0.67,0.29,0.28,-0.23
3153,56041,UINTA,WY,0.22,0.74,0.17,0.79,-0.05,0.05
3154,56043,WASHAKIE,WY,0.20,0.77,0.16,0.80,-0.04,0.03


---

## Metadata

#### Codebook for 2000-2020 County Presidential Data

The data file `countypres_2000-2020` contains county-level returns for presidential elections from 2000 to 2020. The data source is official state election data records. Note: County results in Alaska for 2004 are based on official Alaska data, but it is clear the district returns significantly overstate the number of votes cast. In Alaska, the county_fips field stores a combination of state FIPS code and district.

#### Variables
The variables are listed as they appear in the data file. 

`year` : Election year

`state` : State name

`state_po` : U.S. postal code state abbreviation

`county_name` : County name

`county_fips` : County FIPS code

`office` : Office (e.g., President)

`candidate` : Name of the candidate

`party` : Party of the candidate (e.g., DEMOCRAT, REPUBLICAN, GREEN, LIBERTARIAN, OTHER)

`candidatevotes` : Votes received by this candidate for this particular party

`totalvotes` : Total number of votes cast in this county-year

`mode` : Mode of ballots cast (default is TOTAL, with different modes specified for 2020)

`version` : Date when the dataset was finalized

---

## Exports

#### XyXy subset in CSV format to `processed`

#### JSON, GeoJSON, etc., to `processed`