In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [2]:
# Allow up to 4000 rows to be rendered when a DataFrame is displayed.
pd.options.display.max_rows = 4000

In [3]:
# URL template: the first slot takes the election year, the second the state code.
url = 'https://townhall.com/election/{}/president/{}/county'

# Fetch a single sample page (Alabama, 2008) to inspect its structure.
# The timeout keeps a stalled connection from blocking the kernel forever, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
res = requests.get(url.format(2008, 'al'), timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')

In [4]:
# Keep only the second <tbody> of the sample page (index 1 of all <tbody> matches).
# NOTE(review): presumably this is the county-level results table — confirm against the page.
rows = soup.find_all('tbody')[1]

In [5]:
# Empty placeholder frame. No later cell visible in this notebook reads `results`;
# the per-year cells build their own results_<year> frames.
results = pd.DataFrame()

In [6]:
# Lowercase postal codes of the states whose county pages are scraped, in
# scraping order. Alaska ('ak') is not included.
states = (
    'al az ar ca co ct de fl ga hi id il in ia ks '
    'ky la me md ma mi mn ms mo mt ne nv nh nj nm ny '
    'nc nd oh ok or pa ri sc sd tn tx ut vt va wa wv '
    'wi wy'
).split()

# URL template of the site to scrape; filled with (election year, state code)
url = 'https://townhall.com/election/{}/president/{}/county'

# Candidate names per election year; pass the list that matches the year
# handed to the scraping function
candidates_2004 = [
    'George Bush',
    'John Kerry',
    'Ralph Nader',
]
candidates_2008 = [
    'Barack Obama',
    'John McCain',
]
candidates_2012 = [
    'Barack Obama',
    'Mitt Romney',
    'Gary Johnson',
    'Other',
]
candidates_2016 = [
    'Hillary Clinton',
    'Donald Trump',
    'Gary Johnson',
    'Jill Stein',
]


def _party_votes(table, css_classes, candidates):
    '''Return the raw vote counts found in one results table for one party.

    Selects every <td> in `table` whose class matches `css_classes`, skips the
    cells whose text contains a '.' or equals one of the names in `candidates`,
    and strips commas from what is left. Values are returned as strings, in
    document order.
    '''
    votes = []
    for cell in table.find_all('td', {'class': css_classes}):
        text = cell.text
        if '.' not in text and text not in candidates:
            votes.append(text.replace(',', ''))
    return votes


def data(year, candidates, timeout=30):
    '''Scrape townhall.com county-level presidential results for one election year.

    Requests the county page of every state in the module-level `states` list,
    using the module-level `url` template, and accumulates three lists.

    Parameters
    ----------
    year : election year substituted into the URL, e.g. 2008.
    candidates : list of candidate names for that year; table cells whose text
        is one of these names are skipped when collecting votes.
    timeout : seconds to wait on each HTTP request before giving up.

    Returns
    -------
    counties : list of one-element lists, each holding a 'County, ST' string.
    dem_vote : list of str taken from cells with class 'DEM'.
    rep_vote : list of str taken from cells with class 'REP' or 'GOP'
        (the class name differs between election years).

    Raises
    ------
    requests.HTTPError : a state page answered with an error status.
    ValueError : a state page contained no <tbody> element at all.
    '''
    rep_vote = []
    dem_vote = []
    counties = []

    for state in states:
        # Without a timeout a single stalled connection would hang the whole
        # scrape; without the status check an error page would be parsed as if
        # it were results.
        res = requests.get(url.format(year, state), timeout=timeout)
        res.raise_for_status()
        soup = BeautifulSoup(res.content, 'lxml')

        tbodies = soup.find_all('tbody')
        if not tbodies:
            raise ValueError(
                'no <tbody> found on the {} page for state {!r}'.format(year, state)
            )

        if len(tbodies) == 2:
            # Pages with exactly two tables: only the second one is read.
            tables = [tbodies[1]]
        else:
            # Any other layout: read every table. This path used to fail with
            # an AttributeError because find_all() was called on the list of
            # tables rather than on a single table.
            # NOTE(review): confirm that every table on such pages holds
            # county rows.
            tables = list(tbodies)

        for table in tables:
            # County names: iterate the table's direct children. Text nodes
            # are plain strings, so .find('div') on them is str.find and
            # yields -1; rows without a <div> yield None. Both are skipped.
            for child in table:
                found = child.find('div')
                if found is None or isinstance(found, int):
                    continue
                # One-element list per county keeps the return shape callers
                # flatten afterwards.
                counties.append([found.text + ', ' + state.upper()])

            rep_vote.extend(_party_votes(table, ['REP', 'GOP'], candidates))
            dem_vote.extend(_party_votes(table, 'DEM', candidates))

    return counties, dem_vote, rep_vote

## 2004 Election

In [7]:
# Scrape every listed state's county page for the 2004 election
count, dem_vote, rep_vote = data(2004, candidates_2004)

# The three lists are expected to line up one-to-one; print their sizes to check
print(len(count), len(dem_vote), len(rep_vote))

# data() hands counties back as a list of lists; flatten it into a list of strings
counties = [name for group in count for name in group]

# Start from an empty DataFrame and attach the three lists as columns
results_2004 = pd.DataFrame().assign(
    counties=counties,
    dem_vote=dem_vote,
    rep_vote=rep_vote,
)

results_2004

3110 3110 3110


Unnamed: 0,counties,dem_vote,rep_vote
0,"Autauga, AL",4774,15212
1,"Baldwin, AL",15579,52910
2,"Barbour, AL",4826,5893
3,"Bibb, AL",2089,5471
4,"Blount, AL",3932,17364
5,"Bullock, AL",3210,1494
6,"Butler, AL",3409,4978
7,"Calhoun, AL",15076,29806
8,"Chambers, AL",5346,7618
9,"Cherokee, AL",3036,5919


## 2008 Election

In [8]:
# Scrape every listed state's county page for the 2008 election
count, dem_vote, rep_vote = data(2008, candidates_2008)

# The three lists are expected to line up one-to-one; print their sizes to check
print(len(count), len(dem_vote), len(rep_vote))

# data() hands counties back as a list of lists; flatten it into a list of strings
counties = [name for group in count for name in group]

# Start from an empty DataFrame and attach the three lists as columns
results_2008 = pd.DataFrame().assign(
    counties=counties,
    dem_vote=dem_vote,
    rep_vote=rep_vote,
)

results_2008

3112 3112 3112


Unnamed: 0,counties,dem_vote,rep_vote
0,"Autauga, AL",6091,17398
1,"Baldwin, AL",19362,61192
2,"Barbour, AL",5685,5862
3,"Bibb, AL",2289,6247
4,"Blount, AL",3518,20362
5,"Bullock, AL",4001,1389
6,"Butler, AL",4174,5472
7,"Calhoun, AL",16325,32326
8,"Chambers, AL",6782,8060
9,"Cherokee, AL",2299,7285


## 2012 Election

In [9]:
# Scrape every listed state's county page for the 2012 election
count, dem_vote, rep_vote = data(2012, candidates_2012)

# Print the raw list sizes. This runs before the clean-up below, so for 2012
# the numbers are not expected to match yet.
print(len(count), len(dem_vote), len(rep_vote))

# data() hands counties back as a list of lists; flatten it into a list of strings
counties = [name for group in count for name in group]

# Clean-up for faulty 2012 Hawaii data on townhall.com: position 518 is dropped
# from all three lists (the Kalawao county entry), and the first '-' placeholder
# is dropped from each vote list (entries tied to Kauai) because it shifts the
# votes against the county names.
# NOTE(review): 518 is a hard-coded position and depends on the order of `states`.
counties.pop(518)
rep_vote.pop(518)
dem_vote.pop(518)

del rep_vote[rep_vote.index('-')]
del dem_vote[dem_vote.index('-')]

# Start from an empty DataFrame and attach the three lists as columns
results_2012 = pd.DataFrame().assign(
    counties=counties,
    dem_vote=dem_vote,
    rep_vote=rep_vote,
)

results_2012

3113 3114 3114


Unnamed: 0,counties,dem_vote,rep_vote
0,"Autauga, AL",6354,17366
1,"Baldwin, AL",18329,65772
2,"Barbour, AL",5873,5539
3,"Bibb, AL",2200,6131
4,"Blount, AL",2961,20741
5,"Bullock, AL",4058,1250
6,"Butler, AL",4367,5081
7,"Calhoun, AL",15500,30272
8,"Chambers, AL",6853,7596
9,"Cherokee, AL",2126,7494


## 2016 Election

In [10]:
# Scrape every listed state's county page for the 2016 election
count, dem_vote, rep_vote = data(2016, candidates_2016)

# The three lists are expected to line up one-to-one; print their sizes to check
print(len(count), len(dem_vote), len(rep_vote))

# data() hands counties back as a list of lists; flatten it into a list of strings
counties = [name for group in count for name in group]

# Start from an empty DataFrame and attach the three lists as columns
results_2016 = pd.DataFrame().assign(
    counties=counties,
    dem_vote=dem_vote,
    rep_vote=rep_vote,
)

results_2016

3111 3111 3111


Unnamed: 0,counties,dem_vote,rep_vote
0,"Autauga, AL",5908,18110
1,"Baldwin, AL",18409,72780
2,"Barbour, AL",4848,5431
3,"Bibb, AL",1874,6733
4,"Blount, AL",2150,22808
5,"Bullock, AL",3530,1139
6,"Butler, AL",3716,4891
7,"Calhoun, AL",13197,32803
8,"Chambers, AL",5763,7803
9,"Cherokee, AL",1524,8809


In [15]:
# Write the 2004 table as CSV to ./results_2004 (the path carries no file extension).
results_2004.to_csv('./results_2004')

In [16]:
# Write the 2008 table as CSV to ./results_2008. The frame saved must be
# results_2008; saving results_2016 here would store 2016 data under the 2008 name.
results_2008.to_csv('./results_2008')

In [17]:
# Write the 2012 table as CSV to ./results_2012. The frame saved must be
# results_2012; saving results_2016 here would store 2016 data under the 2012 name.
results_2012.to_csv('./results_2012')

In [18]:
# Write the 2016 table as CSV to ./results_2016 (the path carries no file extension).
results_2016.to_csv('./results_2016')