# Context
This notebook scrapes election results from https://uselectionatlas.org/. Comprehensive county-level data for 20th-century elections was difficult to find anywhere else.

We begin with setup and imports.

In [None]:
#Imports
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
import pyarrow.parquet as pq
import pyarrow as pa

In [None]:
# Lookup tables used while scraping to translate between two-letter postal
# abbreviations and full state names. Insertion order is kept stable because
# later cells iterate over these mappings.
state_abbreviation_to_name = {
    "AL": "Alabama",
    "AK": "Alaska",
    "AZ": "Arizona",
    "AR": "Arkansas",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland",
    "MA": "Massachusetts",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MS": "Mississippi",
    "MO": "Missouri",
    "MT": "Montana",
    "NE": "Nebraska",
    "NV": "Nevada",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NY": "New York",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VT": "Vermont",
    "VA": "Virginia",
    "WA": "Washington",
    "WV": "West Virginia",
    "WI": "Wisconsin",
    "WY": "Wyoming",
}

# Inverse mapping: full state name -> postal abbreviation
state_name_to_abbreviation = dict(
    zip(state_abbreviation_to_name.values(), state_abbreviation_to_name.keys())
)

# Getting FIPS Code
A FIPS code is a unique numeric identifier that the website uses to refer to each state.

In [None]:
# The website identifies every state by a FIPS code, so we probe each possible
# code (0-56) and record which state name, if any, the returned page reports.

fips_dict = {}
for i in range(0, 57):
    url = f"https://uselectionatlas.org/RESULTS/datagraph.php?year=1972&fips={i}&f=1&off=0&elect=0"
    # A timeout keeps a single stalled connection from hanging the whole loop
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    # soup.body is None when the response has no <body>; guard it so that case
    # is reported below instead of raising AttributeError on .find
    body_content = soup.body.find(["div", "b"]) if soup.body is not None else None

    # Check if the content is available and contains the state information
    if body_content:
        # Break the element's text into individual lines
        lines = body_content.get_text().splitlines()

        # The state name sits between the last "-" and the words "by County"
        # (the heading presumably reads "... Data Graphs - <State> by County")
        for line in lines:
            if "by County" in line:
                state_end_index = line.index("by County")
                state_name = line[:state_end_index].split("-")[-1].strip()
                fips_dict[i] = state_name
                break
    else:
        print("State name not found.")

# Reverse lookup as well: state name -> FIPS code
fips_dict_rev = {v: k for k, v in fips_dict.items()}

In [None]:
# List every state that has no entry in the FIPS lookup. These states are left
# out of the political ground-truth results because no county-level election
# results were found for them.
for name in state_name_to_abbreviation:
    if name in fips_dict_rev:
        continue
    print(name)

Alaska
Louisiana


# Getting County-level election results

In [None]:
# Gathering county election results: for every (year, state) page we keep only
# the candidate with the highest vote percentage in each county.


def _county_winner_record(state, year, county, candidates):
    """Build the result row for a single county.

    Args:
        state: Full state name (a key of ``state_name_to_abbreviation``).
        year: Election year of the scraped page.
        county: County name as it appears on the page.
        candidates: Non-empty list of ``[candidate name, percentage string]``
            pairs, where the percentage string looks like ``"52.2%"``.

    Returns:
        A dict describing the candidate with the highest percentage.

    Raises:
        ValueError: If a percentage string cannot be parsed as a float.
    """
    # Percentages are scraped as text, so drop the '%' sign before comparing
    top_name, top_share = max(candidates, key=lambda entry: float(entry[1].strip('%')))
    return {
        "State": state,
        "State Abbreviation": state_name_to_abbreviation[state],
        "Year": year,
        "County": county,
        "Top Candidate": top_name,
        "Vote Percentage": top_share
    }


county_results = []

# Iterating over the year range that overlaps with the newswire dataset
for year in tqdm(range(1876, 1980, 4), desc="Years"):
    # Iterating over US states
    for state in tqdm(state_name_to_abbreviation.keys(), desc="States", leave=False):
        # Only states with a known FIPS code have a page we can request
        if state not in fips_dict_rev:
            continue

        url = f"https://uselectionatlas.org/RESULTS/datagraph.php?year={year}&fips={fips_dict_rev[state]}&f=1&off=0&elect=0"

        # A timeout keeps one stalled connection from hanging the whole scrape;
        # network errors are reported and skipped like a bad status code.
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException as error:
            print(f"Failed to retrieve the page. Error: {error}")
            continue

        # Skip failed pages entirely so that no county state left over from the
        # previously parsed page can be recorded under this state/year.
        if response.status_code != 200:
            print(f"Failed to retrieve the page. Status code: {response.status_code}")
            continue

        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Per-page parsing state, reset for every page
        current_county = None
        county_candidates = []

        # Now we iterate over the table rows for the given state
        for row in soup.find_all('tr'):
            cell_values = [cell.get_text(strip=True) for cell in row.find_all('td')]

            if len(cell_values) == 4:
                # A four-cell row starts a new county: record the previous
                # county's winner before switching over.
                if current_county and county_candidates:
                    county_results.append(
                        _county_winner_record(state, year, current_county, county_candidates)
                    )

                # Update the current county and reset the candidates list
                current_county = cell_values[0]
                county_candidates = [[cell_values[1], cell_values[2]]]
            elif len(cell_values) == 3:
                # A three-cell row adds another candidate to the current county
                county_candidates.append([cell_values[0], cell_values[1]])

        # The last county on the page has no following row to trigger its flush
        if current_county and county_candidates:
            county_results.append(
                _county_winner_record(state, year, current_county, county_candidates)
            )


In [None]:
# One row per (state, year, county) with that county's top candidate
df = pd.DataFrame(data=county_results)
df.head(n=5)

Unnamed: 0,State,State Abbreviation,Year,County,Top Candidate,Vote Percentage
0,Alabama,AL,1960,Autauga,Kennedy,52.2%
1,Alabama,AL,1960,Baldwin,Kennedy,53.1%
2,Alabama,AL,1960,Barbour,Kennedy,64.5%
3,Alabama,AL,1960,Bibb,Kennedy,61.7%
4,Alabama,AL,1960,Blount,Kennedy,57.0%


In [None]:
# County-level results were only available from 1960 onwards, which is why
# the analysis focuses mostly on the 1960s.
year_counts = df.Year.value_counts()
year_counts

Unnamed: 0_level_0,count
Year,Unnamed: 1_level_1
1976,3048
1968,3046
1972,3045
1964,3042
1960,3041


In [None]:
# Persist the county-level election results to disk
df.to_parquet(path="county_election_results.parquet")

# Getting US-level Presidential Election Results
We have to query this separately because the county-level results just state the name of the candidate and not the political party.

In [None]:
# Now, we get the information for each presidential candidate and link it to a political party.
# NOTE: this cell rebinds `df`, which earlier in the notebook held the county-level results.

# Initialize a list to store candidate data
candidates_data = []

# One national results page per election year
for year in range(1876, 1980, 4):
    url = f"https://uselectionatlas.org/RESULTS/national.php?f=1&off=0&year={year}"

    # Send a GET request to the URL; the timeout avoids hanging on a stalled connection
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all rows in the table
    data_rows = soup.find_all('tr')

    # Skip the first three rows, then read candidate rows until the
    # aggregated "Other (+)" row is reached
    for row in data_rows[3:]:
        cells = [cell.get_text(strip=True) for cell in row.find_all('td')]

        # Rows with fewer than two <td> cells cannot be candidate rows;
        # skip them instead of failing on cells[1]
        if len(cells) < 2:
            continue

        if cells[1] == 'Other (+)':
            break

        # Extract the relevant data, ensuring there are enough cells
        if len(cells) >= 6:
            candidates_data.append({
                "Candidate": cells[1],
                "Running Mate": cells[2],
                "Party": cells[3],
                "Votes": cells[4],
                "Vote Percentage": cells[5],
                "Year": year
            })

# Create a DataFrame from the candidate data
df = pd.DataFrame(candidates_data)

# Display the DataFrame
print(df)


           Candidate      Running Mate        Party       Votes  \
0   Rutherford Hayes   William Wheeler   Republican   4,034,142   
1      Samuel Tilden  Thomas Hendricks   Democratic   4,286,808   
2       Peter Cooper       Samuel Cary    Greenback      83,726   
3     James Garfield    Chester Arthur   Republican   4,453,337   
4   Winfield Hancock   William English   Democratic   4,444,267   
..               ...               ...          ...         ...   
93      John Hospers   Theodora Nathan  Libertarian       3,674   
94      John Schmitz   Thomas Anderson     American   1,100,896   
95      James Carter    Walter Mondale   Democratic  40,831,881   
96       Gerald Ford       Robert Dole   Republican  39,148,634   
97   Eugene McCarthy                 -  Independent     744,763   

   Vote Percentage  Year  
0           47.92%  1876  
1           50.92%  1876  
2            0.99%  1876  
3           48.31%  1880  
4           48.22%  1880  
..             ...   ...  
93    

In [None]:
# Convert the candidate DataFrame to an Arrow table and write it out as Parquet
table = pa.Table.from_pandas(df)
pq.write_table(table, where='pres_candidate_election_results.parquet')