In [1]:
import pandas as pd
import numpy as np
import datetime
import glob
import re 
import os
import time
from csv import reader

In [2]:
# Show floats with thousands separators and two decimal places.
pd.set_option('display.float_format', '{:,.2f}'.format)


# Show ints with thousands separators.
class _IntArrayFormatter(pd.io.formats.format.GenericArrayFormatter):
    """Array formatter that renders each value with comma separators."""

    def _format_strings(self):
        # Prefer a formatter supplied on the instance; otherwise fall back to
        # one leading space followed by the comma-grouped number.
        if self.formatter:
            render = self.formatter
        else:
            render = lambda value: ' {:,}'.format(value)
        return [render(value) for value in self.values]


# Replace the formatter pandas looks up under this name with the class above.
# NOTE(review): this depends on pandas-internal names
# (GenericArrayFormatter / IntArrayFormatter); confirm they exist in the
# installed pandas version.
pd.io.formats.format.IntArrayFormatter = _IntArrayFormatter

# No limit on the number of columns shown when a DataFrame is displayed.
pd.options.display.max_columns = None

## Scraping the Registered Voter Population
I'm scraping all of the Voter Registration Statistics table rows and putting the date, title, and link into the vr_tbl.csv file. I'm going to use the file to loop through the links to get to the files.

In [4]:
from requests_html import HTML, HTMLSession
import csv
import urllib

In [5]:
# Fetch the Voter Registration Statistics index page:
# https://elections.wi.gov/index.php/publications/statistics/registration
session = HTMLSession()
r = session.get('https://elections.wi.gov/index.php/publications/statistics/registration')
# Stop here on an HTTP error status instead of parsing an error page in later cells.
r.raise_for_status()

In [6]:
# Grab the body of the Registered Voter Population Statistics table and its rows.
table = r.html.find('tbody', first=True)
rows = table.find('tr')

# The top row is the most recent entry: keep its date text and its link target.
newest_row = rows[0]
most_recent_date = newest_row.find('time', first=True).text
most_recent = newest_row.find('a', first=True).attrs['href']

### Get County and Wards Files

In [8]:
# Site root; the scraped link is appended to it to reach the page listing the files.
starter_url = 'https://elections.wi.gov'
url = '{}{}'.format(starter_url, most_recent)

# Rewrites MM/DD/YYYY as YYYYMMDD, e.g. 03/01/2021 -> 20210301.
date_pattern = re.compile(r'(\d{2})/(\d{2})/(\d{4})')
date = re.sub(date_pattern, r'\3\1\2', most_recent_date)

In [9]:
# Fetch the page for the most recent entry and collect the rows of its file table.
month = session.get(url)
# Stop here on an HTTP error status instead of parsing an error page.
month.raise_for_status()
table = month.html.find('tbody', first=True)
file_rows = table.find('tr')

In [10]:
# Download the county and ward registration workbooks linked from the page.
# open() below fails if the output folder is missing, so create it first.
os.makedirs('scrapped_files', exist_ok=True)

for file_row in file_rows:
    link = file_row.find('a', first=True)
    if link is None:
        continue  # a row without a link has nothing to download

    title = link.text
    file = link.attrs['href']

    # Pick the geography label used in the saved file name. The character
    # classes no longer contain a stray '|' (the original [c|C] also matched it).
    if re.search(r'[cC]ounty', title):
        geo = 'county'
    elif re.search(r'[wW]ard', title):
        geo = 'muni'
    else:
        geo = None

    if geo is not None:
        f = session.get(file)
        f.raise_for_status()  # never save an HTTP error page as a workbook
        with open('scrapped_files/vr_{}_{}.xlsx'.format(geo, date), 'wb') as outfile:
            outfile.write(f.content)

    # Pause between rows to avoid hammering the server.
    time.sleep(1)

## Scraping the Absentee Ballot Stats

In [11]:
# Go to the page with all of the AVEV entries
avev_url = 'https://elections.wi.gov/publications/statistics/absentee'
avev = session.get(avev_url)
# Stop here on an HTTP error status instead of parsing an error page.
avev.raise_for_status()

# Captures the table with all of the AVEV entries
avev_tbl = avev.html.find('tbody', first=True)
rows = avev_tbl.find('tr')

In [12]:
# Download the county and municipality AVEV files for every entry of the
# February 16, 2021 Spring Primary.
# open() below fails if the output folder is missing, so create it first.
os.makedirs('scrapped_files', exist_ok=True)

for row in rows:  # loop through all of the AVEV entries
    row_anchor = row.find('a', first=True)
    row_time = row.find('time', first=True)
    if row_anchor is None or row_time is None:
        continue  # not a data row (no link or no date)

    row_title = row_anchor.text
    row_link = row_anchor.attrs['href']
    row_date = row_time.text

    # Skip entries for other elections. The original used `break` here, which
    # stopped at the first non-matching row and so downloaded nothing whenever
    # a different election was listed first. Skipped rows cost no request.
    if not re.search(r'February 16, 2021 Spring Primary', row_title):
        continue

    avev_link = starter_url + row_link
    date = date_pattern.sub(r'\3\1\2', row_date)

    # Captures the table with the files
    day = session.get(avev_link)
    day.raise_for_status()  # stop on an HTTP error status
    day_tbl = day.html.find('tbody', first=True)
    file_rows = day_tbl.find('tr')

    for file_row in file_rows:  # loops through the files of this entry
        link = file_row.find('a', first=True)
        if link is None:
            continue  # a row without a link has nothing to download

        title = link.text
        file = link.attrs['href']

        # Pick the geography label used in the saved file name. The character
        # classes no longer contain a stray '|' (the original [c|C] also matched it).
        if re.search(r'[cC]ounty', title):
            geo = 'county'
        elif re.search(r'[mM]uni', title):
            geo = 'muni'
        else:
            geo = None

        if geo is not None:
            f = session.get(file)
            f.raise_for_status()  # never save an HTTP error page as data
            with open('scrapped_files/avev_{}_{}.csv'.format(geo, date), 'wb') as outfile:
                outfile.write(f.content)

        # Pause between files to avoid hammering the server.
        time.sleep(1)

    # Pause between entries as well.
    time.sleep(1)

# Munging

In [3]:
# List every scraped file. glob returns paths in arbitrary order, so sort them
# to make the read/concatenate order reproducible between runs.
files = sorted(glob.glob('scrapped_files/*'))

## Dataframe the Files

In [4]:
# Read every scraped file into a DataFrame.
#   cty / muni            -> voter-registration frames (one each; the last file read wins)
#   avev_ctys / avev_munis -> lists of AVEV frames, one per file
avev_ctys = []
avev_munis = []

# Sorted so the read order (and therefore the later concat order, and which
# vr file wins when several exist) does not depend on glob's arbitrary ordering.
for file in sorted(files):
    # Expected names: "<layer>_<geo>_<date>.<ext>", e.g. "vr_county_20210301.xlsx".
    # basename/splitext work with any path separator.
    file_name = os.path.splitext(os.path.basename(file))[0]
    features = file_name.split('_')
    if len(features) < 3:
        continue  # not one of the scraped files (e.g. a stray hidden file)

    layer, geo, date = features[:3]

    if layer == 'vr':
        if geo == 'county':
            # Drop columns and rows that are entirely empty; row labels are kept.
            cty = (
                pd.read_excel(file, dtype={'CountyCode': str})
                .dropna(axis='columns', how='all')
                .dropna(axis='rows', how='all')
            )
            cty['vr_date'] = date
        elif geo == 'muni':
            muni = pd.read_excel(file, header=1, dtype={'Hindi': str})
            # Sum the sheet's last column per 'Hindi' code.
            muni = muni.groupby(['Hindi']).aggregate({muni.columns[-1]: 'sum'})
            muni['vr_date'] = date
    elif layer == 'avev':
        if geo == 'county':
            avev_cty = pd.read_csv(file, dtype={'HINDI': str})
            avev_cty['avev_date'] = date

            avev_ctys.append(avev_cty)
        elif geo == 'muni':
            avev_muni = pd.read_csv(file, dtype={'HINDI': str})
            avev_muni['avev_date'] = date

            avev_munis.append(avev_muni)

In [5]:
# Stack the per-file county AVEV frames into one frame; each frame keeps its
# own row labels, so labels repeat across files.
avev_ctys_df = pd.concat(avev_ctys)

In [6]:
# Stack the per-file municipality AVEV frames into one frame; each frame keeps
# its own row labels, so labels repeat across files.
avev_munis_df = pd.concat(avev_munis)

## Join the Dataframes

### Counties

In [7]:
# Rename the columns by position so the key matches the AVEV frames' 'HINDI'.
# NOTE(review): assumes cty has exactly these four columns, in this order,
# after the empty columns were dropped; confirm against the workbook.
cty.columns = ['HINDI', 'County', 'Registered Voters', 'vr_date']

In [8]:
# Give the row labelled 73 the key '99999'.
# NOTE(review): 73 is a hard-coded row label, presumably a non-county/total row
# in the workbook; confirm it still points at the intended row when the source file changes.
cty.loc[73, 'HINDI'] = '99999'

In [9]:
# Keep only the join key, the registration count and the registration date.
cty = cty[['HINDI', 'Registered Voters', 'vr_date']]

In [10]:
# Display the combined county AVEV frame.
avev_ctys_df

Unnamed: 0,Election,HINDI,Jurisdiction,AbsenteeApplications,BallotsSent,BallotsReturned,InPersonAbsentee,avev_date
0,2021 Spring Primary,01,ADAMS COUNTY,583,387,1,0.00,20210127
1,2021 Spring Primary,02,ASHLAND COUNTY,657,527,0,0.00,20210127
2,2021 Spring Primary,03,BARRON COUNTY,1218,1124,3,0.00,20210127
3,2021 Spring Primary,04,BAYFIELD COUNTY,638,299,0,0.00,20210127
4,2021 Spring Primary,05,BROWN COUNTY,13804,13081,1,0.00,20210127
...,...,...,...,...,...,...,...,...
68,2021 Spring Primary,69,WAUPACA COUNTY,1949,1940,851,41.00,20210218
69,2021 Spring Primary,70,WAUSHARA COUNTY,758,750,295,7.00,20210218
70,2021 Spring Primary,71,WINNEBAGO COUNTY,8577,8565,4190,180.00,20210218
71,2021 Spring Primary,72,WOOD COUNTY,3333,3318,1817,63.00,20210218


In [11]:
# Left join on 'HINDI': every AVEV row is kept; rows with no matching key in
# cty get NaN in the registration columns.
county = pd.merge(avev_ctys_df, cty, how='left', on='HINDI')

In [12]:
# Replace every missing value in the frame (in any column) with 0.
county = county.fillna(0)

In [13]:
# Cast the count columns to integers.
county_count_cols = [
    'AbsenteeApplications',
    'BallotsSent',
    'BallotsReturned',
    'InPersonAbsentee',
    'Registered Voters',
]
county[county_count_cols] = county[county_count_cols].astype(int)

### Municipalities

In [14]:
# Move the index back into an ordinary column.
muni = muni.reset_index()

In [15]:
# Rename the columns by position so the key matches the AVEV frames' 'HINDI'.
# NOTE(review): assumes muni has exactly these three columns, in this order.
muni.columns = ['HINDI', 'Registered Voters', 'vr_date']

In [16]:
# Add the county-table row labelled 73 to the municipality table and renumber
# the rows. DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with a one-row frame (note the double brackets) is the supported
# equivalent.
# NOTE(review): presumably this gives the municipal join a match for that
# row's HINDI key; confirm.
muni = pd.concat([muni, cty.loc[[73]]], ignore_index=True)

In [17]:
# Left join on 'HINDI': every municipality AVEV row is kept; rows with no
# matching key in muni get NaN in the registration columns.
munis = pd.merge(avev_munis_df, muni, how='left', on='HINDI')

In [18]:
# Replace every missing value in the frame (in any column) with 0.
munis = munis.fillna(0)

In [19]:
# Cast the count columns to integers.
muni_count_cols = [
    'AbsenteeApplications',
    'BallotsSent',
    'BallotsReturned',
    'InPersonAbsentee',
    'Registered Voters',
]
munis[muni_count_cols] = munis[muni_count_cols].astype(int)

In [20]:
# First distinct avev_date, in order of appearance in the frame.
munis.avev_date.unique()[0]

'20210127'

In [21]:
# Display the merged municipality frame.
munis

Unnamed: 0,Election,HINDI,Jurisdiction,AbsenteeApplications,BallotsSent,BallotsReturned,InPersonAbsentee,avev_date,Registered Voters,vr_date
0,2021 Spring Primary,01002,TOWN OF ADAMS - ADAMS COUNTY,21,21,0,0,20210127,850,20210301
1,2021 Spring Primary,01004,TOWN OF BIG FLATS - ADAMS COUNTY,16,0,0,0,20210127,645,20210301
2,2021 Spring Primary,01006,TOWN OF COLBURN - ADAMS COUNTY,8,0,0,0,20210127,160,20210301
3,2021 Spring Primary,01008,TOWN OF DELL PRAIRIE - ADAMS COUNTY,39,37,0,0,20210127,1138,20210301
4,2021 Spring Primary,01010,TOWN OF EASTON - ADAMS COUNTY,40,0,0,0,20210127,664,20210301
...,...,...,...,...,...,...,...,...,...,...
16300,2021 Spring Primary,72251,CITY OF MARSHFIELD - MULTIPLE COUNTIES,996,994,517,21,20210218,11535,20210301
16301,2021 Spring Primary,72261,CITY OF NEKOOSA - WOOD COUNTY,100,99,34,0,20210218,1433,20210301
16302,2021 Spring Primary,72271,CITY OF PITTSVILLE - WOOD COUNTY,20,20,12,3,20210218,502,20210301
16303,2021 Spring Primary,72291,CITY OF WISCONSIN RAPIDS - WOOD COUNTY,1050,1049,469,30,20210218,10535,20210301


In [26]:
# Show every row whose Jurisdiction is exactly 'BURNETT COUNTY'.
county.loc[county['Jurisdiction'] == 'BURNETT COUNTY']

Unnamed: 0,Election,HINDI,Jurisdiction,AbsenteeApplications,BallotsSent,BallotsReturned,InPersonAbsentee,avev_date,Registered Voters,vr_date
6,2021 Spring Primary,7,BURNETT COUNTY,528,285,0,0,20210127,11497,20210301
79,2021 Spring Primary,7,BURNETT COUNTY,534,305,1,0,20210128,11497,20210301
152,2021 Spring Primary,7,BURNETT COUNTY,540,308,1,0,20210129,11497,20210301
225,2021 Spring Primary,7,BURNETT COUNTY,557,425,12,0,20210201,11497,20210301
298,2021 Spring Primary,7,BURNETT COUNTY,568,456,18,0,20210203,11497,20210301
371,2021 Spring Primary,7,BURNETT COUNTY,579,465,48,0,20210208,11497,20210301
444,2021 Spring Primary,7,BURNETT COUNTY,587,472,56,0,20210209,11497,20210301
517,2021 Spring Primary,7,BURNETT COUNTY,607,495,153,5,20210215,11497,20210301
590,2021 Spring Primary,7,BURNETT COUNTY,607,495,178,5,20210216,11497,20210301
663,2021 Spring Primary,7,BURNETT COUNTY,607,512,216,5,20210218,11497,20210301


In [33]:
# Display the merged county frame without its final row.
county[:-1]

Unnamed: 0,Election,HINDI,Jurisdiction,AbsenteeApplications,BallotsSent,BallotsReturned,InPersonAbsentee,avev_date,Registered Voters,vr_date
0,2021 Spring Primary,01,ADAMS COUNTY,583,387,1,0,20210127,13499,20210301
1,2021 Spring Primary,02,ASHLAND COUNTY,657,527,0,0,20210127,10187,20210301
2,2021 Spring Primary,03,BARRON COUNTY,1218,1124,3,0,20210127,29105,20210301
3,2021 Spring Primary,04,BAYFIELD COUNTY,638,299,0,0,20210127,12190,20210301
4,2021 Spring Primary,05,BROWN COUNTY,13804,13081,1,0,20210127,165731,20210301
...,...,...,...,...,...,...,...,...,...,...
724,2021 Spring Primary,68,WAUKESHA COUNTY,23649,23305,10757,845,20210218,295634,20210301
725,2021 Spring Primary,69,WAUPACA COUNTY,1949,1940,851,41,20210218,32840,20210301
726,2021 Spring Primary,70,WAUSHARA COUNTY,758,750,295,7,20210218,15264,20210301
727,2021 Spring Primary,71,WINNEBAGO COUNTY,8577,8565,4190,180,20210218,107576,20210301
