# Web Scraping Data for Raw Votes by County

Votes for elections by county are now easy to come by in a comprehensive format. This notebook scrapes that data from the site townhall.com for the past four presidential elections and puts it into separate DataFrames, one per election.
Votes for Alaska aren't included because Alaska does not report votes by county and its voting districts are not labeled well enough to compare against economic estimators.

In [69]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import time

In [70]:
# allow up to 4000 rows to be shown when a DataFrame is displayed
pd.options.display.max_rows = 4000

In [71]:
# URL template of the county results pages; the two '{}' slots are filled
# with str.format() (election year first, then the state code)
url = "https://townhall.com/election/{}/president/{}/county"

In [72]:
# two-letter codes of the states whose county pages are scraped, in scrape
# order; Alaska ('ak') is left out, and the District of Columbia is not in
# the list either
states = (
    'al az ar ca co ct de fl ga hi id il in ia ks '
    'ky la me md ma mi mn ms mo mt ne nv nh nj nm ny '
    'nc nd oh ok or pa ri sc sd tn tx ut vt va wa wv '
    'wi wy'
).split()

# URL template of the site to scrape from
url = 'https://townhall.com/election/{}/president/{}/county'

# candidate names for each election year; the list matching the year is
# handed to the scraping function together with that year
candidates_2004 = [
    'George Bush',
    'John Kerry',
    'Ralph Nader',
]
candidates_2008 = [
    'Barack Obama',
    'John McCain',
]
candidates_2012 = [
    'Barack Obama',
    'Mitt Romney',
    'Gary Johnson',
    'Other',
]
candidates_2016 = [
    'Hillary Clinton',
    'Donald Trump',
    'Gary Johnson',
    'Jill Stein',
]


def _party_votes(table, party_classes, candidates):
    '''Collect the vote strings of one party from a results table.

    Looks at every <td> whose class matches ``party_classes`` (a single class
    name or a list of names) and keeps the cell text, with thousands
    separators removed, unless the text contains a '.' or equals one of the
    names in ``candidates``.'''
    votes = []
    for cell in table.find_all('td', {'class': party_classes}):
        text = cell.text
        # cells carrying the party class also hold non-count values: anything
        # with a '.' (presumably the percentage column - confirm against the
        # page) and the candidate's name
        if '.' not in text and text not in candidates:
            votes.append(text.replace(',', ''))
    return votes


def data(year, candidates):
    '''Function that scrapes the site townhall.com for election data by county.

    Requests the county page of every state in the module-level ``states``
    list (using the module-level ``url`` template) and collects the county
    names and the Democratic / Republican vote counts found there.

    Parameters:
        year: election year, inserted into the URL template.
        candidates: collection of candidate names for that year; table cells
            whose text equals one of these names are not treated as votes.

    Returns:
        A tuple ``(counties, dem_vote, rep_vote)``:
        counties is a list of one-element lists, each holding a string of the
            form 'County, ST' (callers flatten it afterwards);
        dem_vote and rep_vote are lists of vote counts kept as strings, with
            commas removed.

    Raises:
        requests.exceptions.RequestException: the request timed out, failed,
            or returned an HTTP error status.
        ValueError: a page did not contain exactly two <tbody> elements, so
            the county table could not be identified.'''

    # initialize empty lists
    counties = []
    dem_vote = []
    rep_vote = []

    # scrapes each state's page using the module-level list of states
    for state in states:
        # a timeout keeps one unresponsive page from hanging the whole scrape
        res = requests.get(url.format(year, state), timeout=30)
        res.raise_for_status()
        soup = BeautifulSoup(res.content, 'lxml')

        # the county rows live in the second of the page's two <tbody>
        # elements; with any other layout the code below cannot work, so fail
        # with a message that names the offending page
        tbodies = soup.find_all('tbody')
        if len(tbodies) != 2:
            raise ValueError(
                'expected 2 <tbody> elements for {} {}, found {}'.format(
                    state.upper(), year, len(tbodies)))
        table = tbodies[1]

        # loops over the table's children to pull out county names
        for row in table:
            name_div = row.find('div')
            # text nodes between the rows answer find() with an int (str.find)
            # and rows without a <div> answer with None; neither names a county
            if name_div is None or isinstance(name_div, int):
                continue
            # each name stays wrapped in its own list to keep the return shape
            counties.append(['{}, {}'.format(name_div.text, state.upper())])

        # Republican results - the class is REP or GOP depending on the year
        rep_vote.extend(_party_votes(table, ['REP', 'GOP'], candidates))

        # Democratic results
        dem_vote.extend(_party_votes(table, 'DEM', candidates))

    return counties, dem_vote, rep_vote

## 2004 Election

In [97]:
# scrape the 2004 pages
count, dem_vote, rep_vote = data(2004, candidates_2004)

# the three lengths printed here should be equal
print(len(count), len(dem_vote), len(rep_vote))

# flatten the per-county sublists into a single list of strings
counties = [name for names in count for name in names]

# build the DataFrame in one step: one column per list, plus a constant year
results_2004 = pd.DataFrame({
    'county': counties,
    'dem_vote': dem_vote,
    'rep_vote': rep_vote,
    'year': 2004,
})

results_2004

3110 3110 3110


Unnamed: 0,county,dem_vote,rep_vote,year
0,"Autauga, AL",4774,15212,2004
1,"Baldwin, AL",15579,52910,2004
2,"Barbour, AL",4826,5893,2004
3,"Bibb, AL",2089,5471,2004
4,"Blount, AL",3932,17364,2004
5,"Bullock, AL",3210,1494,2004
6,"Butler, AL",3409,4978,2004
7,"Calhoun, AL",15076,29806,2004
8,"Chambers, AL",5346,7618,2004
9,"Cherokee, AL",3036,5919,2004


## 2008 Election

In [98]:
# scrape the 2008 pages
count, dem_vote, rep_vote = data(2008, candidates_2008)

# the three lengths printed here should be equal
print(len(count), len(dem_vote), len(rep_vote))

# flatten the per-county sublists into a single list of strings
counties = [name for names in count for name in names]

# build the DataFrame in one step: one column per list, plus a constant year
results_2008 = pd.DataFrame({
    'county': counties,
    'dem_vote': dem_vote,
    'rep_vote': rep_vote,
    'year': 2008,
})

results_2008

3112 3112 3112


Unnamed: 0,county,dem_vote,rep_vote,year
0,"Autauga, AL",6091,17398,2008
1,"Baldwin, AL",19362,61192,2008
2,"Barbour, AL",5685,5862,2008
3,"Bibb, AL",2289,6247,2008
4,"Blount, AL",3518,20362,2008
5,"Bullock, AL",4001,1389,2008
6,"Butler, AL",4174,5472,2008
7,"Calhoun, AL",16325,32326,2008
8,"Chambers, AL",6782,8060,2008
9,"Cherokee, AL",2299,7285,2008


## 2012 Election

In [99]:
# scrape the 2012 pages
count, dem_vote, rep_vote = data(2012, candidates_2012)

# lengths before the clean-up below (they are not all equal for 2012)
print(len(count), len(dem_vote), len(rep_vote))

# flatten the per-county sublists into a single list of strings
counties = [name for names in count for name in names]

# townhall.com's Hawaii data is faulty for this year: the Kalawao county
# entry has to be dropped, and so do the Kauai values that were entered
# as '-' and shift the vote lists out of line with the county list.
# NOTE: 518 is a hard-coded list position and only holds for this scrape.
counties.pop(518)
rep_vote.pop(518)
dem_vote.pop(518)

# drop the first '-' placeholder from each vote list
rep_vote.remove('-')
dem_vote.remove('-')

# build the DataFrame in one step: one column per list, plus a constant year
results_2012 = pd.DataFrame({
    'county': counties,
    'dem_vote': dem_vote,
    'rep_vote': rep_vote,
    'year': 2012,
})

results_2012

3113 3114 3114


Unnamed: 0,county,dem_vote,rep_vote,year
0,"Autauga, AL",6354,17366,2012
1,"Baldwin, AL",18329,65772,2012
2,"Barbour, AL",5873,5539,2012
3,"Bibb, AL",2200,6131,2012
4,"Blount, AL",2961,20741,2012
5,"Bullock, AL",4058,1250,2012
6,"Butler, AL",4367,5081,2012
7,"Calhoun, AL",15500,30272,2012
8,"Chambers, AL",6853,7596,2012
9,"Cherokee, AL",2126,7494,2012


## 2016 Election

In [100]:
# scrape the 2016 pages
count, dem_vote, rep_vote = data(2016, candidates_2016)

# the three lengths printed here should be equal
print(len(count), len(dem_vote), len(rep_vote))

# flatten the per-county sublists into a single list of strings
counties = [name for names in count for name in names]

# build the DataFrame in one step: one column per list, plus a constant year
results_2016 = pd.DataFrame({
    'county': counties,
    'dem_vote': dem_vote,
    'rep_vote': rep_vote,
    'year': 2016,
})

results_2016

3111 3111 3111


Unnamed: 0,county,dem_vote,rep_vote,year
0,"Autauga, AL",5908,18110,2016
1,"Baldwin, AL",18409,72780,2016
2,"Barbour, AL",4848,5431,2016
3,"Bibb, AL",1874,6733,2016
4,"Blount, AL",2150,22808,2016
5,"Bullock, AL",3530,1139,2016
6,"Butler, AL",3716,4891,2016
7,"Calhoun, AL",13197,32803,2016
8,"Chambers, AL",5763,7803,2016
9,"Cherokee, AL",1524,8809,2016


In [101]:
# (rows, columns) of each election's DataFrame, oldest election first
tuple(frame.shape for frame in (results_2004, results_2008, results_2012, results_2016))

((3110, 4), (3112, 4), (3112, 4), (3111, 4))

In [102]:
# display the 2016 DataFrame
results_2016

Unnamed: 0,county,dem_vote,rep_vote,year
0,"Autauga, AL",5908,18110,2016
1,"Baldwin, AL",18409,72780,2016
2,"Barbour, AL",4848,5431,2016
3,"Bibb, AL",1874,6733,2016
4,"Blount, AL",2150,22808,2016
5,"Bullock, AL",3530,1139,2016
6,"Butler, AL",3716,4891,2016
7,"Calhoun, AL",13197,32803,2016
8,"Chambers, AL",5763,7803,2016
9,"Cherokee, AL",1524,8809,2016


In [103]:
# list every 2004 county label that contains the abbreviation 'Co.'
for label in results_2004['county']:
    if 'Co.' not in label:
        continue
    print(label)

St. Louis Co., MO
Bedford Co., VA
Fairfax Co., VA
Franklin Co., VA
Richmond Co., VA
Roanoke Co., VA


In [111]:
# show the 2004 row(s) currently labeled 'Richmond Co., VA'
results_2004.loc[results_2004['county'] == 'Richmond Co., VA']

Unnamed: 0,county,dem_vote,rep_vote,year
2893,"Richmond Co., VA",1243,2082,2004


In [116]:
# Relabel pairs of neighbouring rows that share a base name: in every frame
# listed, the row at the given position gets the 'Co.' label and the row
# right after it gets the bare label.
# NOTE(review): presumably these are a county and a city of the same name
# whose labels arrive the wrong way round in the scraped data - confirm.
# The row positions are hard-coded and only valid for the frames as built by
# the cells above.
#
# Fix: the 2008 Roanoke pair used to write both labels to row 2896, so the
# 'Co.' label was overwritten straight away and row 2897 was never touched;
# it now follows the same (row, row + 1) pattern as every other pair.
relabel_rows = [
    # ('Co.' label, bare label, [(frame, row that gets the 'Co.' label), ...])
    ('Richmond Co., VA', 'Richmond, VA',
     [(results_2004, 2892), (results_2008, 2894),
      (results_2012, 2894), (results_2016, 2893)]),
    ('St. Louis Co., MO', 'St. Louis, MO',
     [(results_2004, 1553), (results_2008, 1553),
      (results_2012, 1553), (results_2016, 1553)]),
    # 2016 is not listed here: only a single Bedford row is renamed (below)
    ('Bedford Co., VA', 'Bedford, VA',
     [(results_2004, 2797), (results_2008, 2799),
      (results_2012, 2799)]),
    ('Fairfax Co., VA', 'Fairfax, VA',
     [(results_2004, 2825), (results_2008, 2827),
      (results_2012, 2827), (results_2016, 2826)]),
    ('Roanoke Co., VA', 'Roanoke, VA',
     [(results_2004, 2894), (results_2008, 2896),
      (results_2012, 2896), (results_2016, 2895)]),
    ('Franklin Co., VA', 'Franklin, VA',
     [(results_2004, 2831), (results_2008, 2833),
      (results_2012, 2833), (results_2016, 2832)]),
]

for co_label, bare_label, targets in relabel_rows:
    for frame, first_row in targets:
        frame.loc[first_row, 'county'] = co_label
        frame.loc[first_row + 1, 'county'] = bare_label

# 2016: one Bedford row only, and it takes the bare label
results_2016.loc[2799, 'county'] = 'Bedford, VA'

In [117]:
# number of distinct county labels next to the frame's shape; the two row
# counts match when no label is duplicated
len(pd.unique(results_2004['county'])), results_2004.shape

(3110, (3110, 4))

In [118]:
# number of distinct county labels next to the frame's shape; the two row
# counts match when no label is duplicated
len(pd.unique(results_2008['county'])), results_2008.shape

(3112, (3112, 4))

In [119]:
# number of distinct county labels next to the frame's shape; the two row
# counts match when no label is duplicated
len(pd.unique(results_2012['county'])), results_2012.shape

(3112, (3112, 4))

In [120]:
# number of distinct county labels next to the frame's shape; the two row
# counts match when no label is duplicated
len(pd.unique(results_2016['county'])), results_2016.shape

(3111, (3111, 4))

In [121]:
# writes each DataFrame to a file in the csv_files folder next to the
# notebook, without the index column
import os

# to_csv does not create missing folders, so make sure the target exists
os.makedirs('./csv_files', exist_ok=True)

# file names are kept exactly as before (results_<year>, no extension) so
# that anything reading them keeps working
for year, frame in ((2004, results_2004), (2008, results_2008),
                    (2012, results_2012), (2016, results_2016)):
    frame.to_csv('./csv_files/results_{}'.format(year), index=False)

Note: inspiration for this web scrape came from existing code in tonmcg's repository County_Level_Election_Results_12-16. That code only worked for the 2012 and 2016 elections, and it returned a dataframe that wasn't useful to me, so I wrote my own code, as shown above.
https://github.com/tonmcg/County_Level_Election_Results_12-16