In [29]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import urllib
import requests
from IPython.display import display, display_pretty, Javascript, HTML

In [30]:
# each page has a summary table that rolls up results at the state level
# get rid of it
def cond(x):
    if x:
        return x.startswith("table ec-table") and not "table ec-table ec-table-summary" in x
    else:
        return False

In [1]:
# list of state abbreviations
states = ['AL','AK','AZ','AR','CA','CO','CT','DC','DE','FL','GA','HI','ID','IL','IN','IA','KS','KY','LA','ME','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','OH','OK','OR','PA','RI','SC','SD','TN','TX','UT','VT','VA','WA','WV','WI','WY']

# headers for csv export
data = [['state_abbr', 'county_name', 'party', 'votes_total']]

AttributeError: 'list' object has no attribute 'lower'

In [32]:
# loop through each state's web page http://townhall.com/election/2016/president/%s/county, where %s is the state abbr
for state in states:
    for year in years:
        url = 'https://townhall.com/election/{}/president/{}/county'
        res = requests.get(url .format(year, states))
        soup = BeautifulSoup(res.content, 'lxml')

        # loop through each <table> tag with .ec-table class
        tables = soup.findAll('table', attrs={'class':cond})

        for table in tables:
            if table.findParent("table") is None:
                table_body = table.find('tbody')

                rows = table_body.find_all('tr')
                for row in rows:
                    cols = row.find_all('td')
                    # first tbody tr has four td
                    if len(cols) == 4:
                        # strip text from each td
                        divs = cols[0].find_all('div')
                        county = divs[0].text.strip()
                        party = cols[1]['class'][0]
                        total_votes = int(cols[2].text.strip().replace(',','').replace('-','0'))
                    # all other tbody tr have three td
                    else:
                        party = cols[1]['class'][0]
                        total_votes = int(cols[1].text.strip().replace(',','').replace('-','0'))

                    #combine each row's results
                    rowData = [state,county,party,total_votes]
                    data.append(rowData)

In [33]:
townhall = pd.DataFrame(data) # throw results in dataframe
new_header = townhall.iloc[0] #grab the first row for the header
townhall = townhall[1:] #take the data less the header row
townhall.columns = new_header #set the header row as the df header
townhall['votes_total'] = townhall['votes_total'].astype('float64')
print(townhall.shape[0])
townhall.head()

10251


Unnamed: 0,state_abbr,county_name,party,votes_total
1,AL,Autauga,REP,15212.0
2,AL,Autauga,DEM,4774.0
3,AL,Autauga,GRE,74.0
4,AL,Baldwin,REP,52910.0
5,AL,Baldwin,DEM,15579.0


In [34]:
# view by state
townhall[(townhall['state_abbr'] == 'AK')]

Unnamed: 0,state_abbr,county_name,party,votes_total
202,AK,Autauga,REP,15212.0
203,AK,Autauga,DEM,4774.0
204,AK,Autauga,GRE,74.0
205,AK,Baldwin,REP,52910.0
206,AK,Baldwin,DEM,15579.0
207,AK,Baldwin,GRE,371.0
208,AK,Barbour,REP,5893.0
209,AK,Barbour,DEM,4826.0
210,AK,Barbour,GRE,26.0
211,AK,Bibb,REP,5471.0


In [35]:
# view special cases
print(townhall[(townhall['state_abbr'] == 'NV') & (townhall['county_name'] == 'Carson City')])

Empty DataFrame
Columns: [state_abbr, county_name, party, votes_total]
Index: []


In [36]:
# fix townhall county name for Washington DC, Sainte Genevieve, MO, Oglala, SD
townhall.loc[townhall['state_abbr'] =='DC', 'county_name'] = 'District of Columbia'
townhall.loc[townhall['county_name'] == 'Sainte Genevieve', 'county_name'] = 'Ste. Genevieve County'
townhall.loc[townhall['county_name'] == 'Oglala Lakota', 'county_name'] = 'Oglala'
print(townhall[(townhall['county_name'] == 'District of Columbia') | (townhall['county_name'] == 'Ste. Genevieve County') | (townhall['county_name'] == 'Oglala')])

0    state_abbr           county_name party  votes_total
1408         DC  District of Columbia   REP      15212.0
1409         DC  District of Columbia   DEM       4774.0
1410         DC  District of Columbia   GRE         74.0
1411         DC  District of Columbia   REP      52910.0
1412         DC  District of Columbia   DEM      15579.0
1413         DC  District of Columbia   GRE        371.0
1414         DC  District of Columbia   REP       5893.0
1415         DC  District of Columbia   DEM       4826.0
1416         DC  District of Columbia   GRE         26.0
1417         DC  District of Columbia   REP       5471.0
1418         DC  District of Columbia   DEM       2089.0
1419         DC  District of Columbia   GRE         12.0
1420         DC  District of Columbia   REP      17364.0
1421         DC  District of Columbia   DEM       3932.0
1422         DC  District of Columbia   GRE         92.0
1423         DC  District of Columbia   DEM       3210.0
1424         DC  District of Co

In [37]:
# change 'Co.' to 'County' in county_name to match census county name
townhall['county_name'] = townhall['county_name'].apply(lambda x: x.replace('Co.','County').strip())
print(townhall[(townhall['state_abbr'] == 'NV') & (townhall['county_name'] == 'Carson City')])

Empty DataFrame
Columns: [state_abbr, county_name, party, votes_total]
Index: []


In [38]:
# combine state and county names
townhall['combined'] = townhall['state_abbr'] + townhall['county_name'].apply(lambda x: x.replace(' ','').lower())
print(townhall[(townhall['state_abbr'] == 'NV') & (townhall['county_name'] == 'Carson City')])

Empty DataFrame
Columns: [state_abbr, county_name, party, votes_total, combined]
Index: []


In [59]:
townhall.to_csv('./townhall.csv')