In [1]:
import os
from bs4 import BeautifulSoup
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from time import sleep

# AEC Localities

The Australian Electoral Commission website has a page where you can search for localities by postcode or electorate.

We will scrape these pages for every electorate found in the representatives data we exported from theyvoteforyou.org.au.

In [2]:
# Install chromedriver via webdriver_manager (it reuses a cached driver when
# one is available) and open a visible, non-headless Chrome window.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Current google-chrome version is 84.0.4147
[WDM] - Get LATEST driver version for 84.0.4147
[WDM] - Driver [C:\Users\ryanc\.wdm\drivers\chromedriver\win32\84.0.4147.30\chromedriver.exe] found in cache


 


# Read in Representatives

Then get the unique electorates

In [3]:
# Load the representatives CSV stored under 02_transform_they_vote_for_you.
representatives_df = pd.read_csv(
    filepath_or_buffer="02_transform_they_vote_for_you/output.csv",
)

In [4]:
# Distinct electorate names, kept in order of first appearance.
electorates = representatives_df["electorate"].unique().tolist()

Defining a function for getting a url for a specific electorate.

In [5]:
def get_electorate_url(electorate):
    """Build the AEC locality-search URL for one electorate.

    Parameters
    ----------
    electorate : str
        Electorate name as it appears in the representatives data,
        e.g. ``"Blair"`` or ``"New England"``.

    Returns
    -------
    str
        URL of the locality search results filtered by that electorate.
    """
    from urllib.parse import quote

    # Percent-encode the name so spaces, apostrophes or '&' in an electorate
    # name cannot corrupt the query string.
    encoded_name = quote(str(electorate), safe="")
    return f'https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter={encoded_name}&filterby=Electorate'

# Navigating to Page and Dealing with Pagination

It is important to sleep between requests so we don't get deny-listed.

In [6]:
def get_max_pages(df):
    """Summarise the pager table scraped from a results page.

    Every cell of ``df`` is treated as one pager label: either a page number
    or a non-numeric placeholder such as ``"..."``.

    Parameters
    ----------
    df : pandas.DataFrame
        The pager table as returned by ``pd.read_html``. It is not modified.

    Returns
    -------
    max_pages : int or float
        Highest page number shown in the pager, as a plain ``int``. If the
        pager holds no numeric label at all, NaN is returned, so numeric
        comparisons made by callers evaluate to ``False`` instead of raising.
    more_pages : bool
        ``True`` when at least one label is not a number (e.g. ``"..."``),
        meaning further page groups exist. A leading "back" ellipsis counts
        too, so callers must cope with navigating backwards.
    """
    # Flatten the table into one series of labels; non-numeric labels
    # (the "..." links) become NaN.
    labels = pd.Series(df.to_numpy().ravel())
    pages = pd.to_numeric(labels, errors="coerce")

    more_pages = bool(pages.isnull().any())

    if pages.notnull().any():
        # Cast to int so the result is the same type whether or not a "..."
        # cell forced the column to float.
        max_pages = int(pages.max())
    else:
        max_pages = float("nan")

    return max_pages, more_pages

In [7]:
def get_page_data(browser, for_page, do_pagination = False):
    """Navigate to one results page and return its data and pager state.

    Parameters
    ----------
    browser : splinter browser
        Browser already showing a locality search results page.
    for_page : int
        Page number we expect to be on after navigating.
    do_pagination : bool, default False
        When True, click the last "..." link (to reach the next group of
        pages) instead of the link labelled with ``for_page``.

    Returns
    -------
    tuple
        ``(page_df, max_pages, more_pages)``: the first table on the page,
        followed by the two values from ``get_max_pages`` for the second
        table. ``(None, None, None)`` is returned when the link to click
        does not exist, or when the pager shows a maximum below ``for_page``
        (the "..." link took us backwards).
    """
    from io import StringIO

    if do_pagination:
        print("\tpaging to get additional results")

    print(f"\tgetting page {for_page}")

    # Page 1 is already displayed after the initial visit; any other page
    # needs a click.
    if do_pagination or for_page > 1:
        link_text = "..." if do_pagination else str(for_page)
        try:
            # "..." can match two links (backwards and forwards); assume the
            # last one is the one we want.
            found_links = browser.links.find_by_text(link_text)
            found_links[-1].click()

            # give the page time to refresh before reading it
            sleep(0.5)

        except ElementDoesNotExist:
            return None, None, None

    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas.
    tables = pd.read_html(StringIO(browser.html))

    max_pages, more_pages = get_max_pages(tables[1])

    # NOTE: the original assigned False to a misspelled, never-read variable
    # when max_pages == for_page. more_pages is deliberately left untouched:
    # the caller needs it True on the last page of a group to follow "...".

    # this occurs when we've attempted to paginate by clicking on "..."
    # but it takes us backwards, so we've likely got all of the pages
    if max_pages < for_page:
        print(f"\tWe have navigated backwards: max_pages: {max_pages} last page: {for_page}")
        return None, None, None

    return tables[0], max_pages, more_pages

This bit of code could do with some refactoring, as there is a lot of repeated logic. Part of the challenge is that we have to step through multiple sub-pages and work out whether there are more pages behind the "..." link.

In [8]:
def save_electorate_data(electorate):
    """Scrape every results page for one electorate and save them as a CSV.

    Uses the module-level ``browser``. The output goes to
    ``01_extract_aec_electorates/<electorate lower-cased>.csv``; if that file
    already exists the electorate is skipped, so an interrupted run can be
    resumed.

    Parameters
    ----------
    electorate : str
        Electorate name to search for.

    Returns
    -------
    None
    """
    destination_file = f"01_extract_aec_electorates/{electorate.lower()}.csv"

    if os.path.exists(destination_file):
        print(f"{electorate} - skipping file already exists")
        return

    # Create the output folder up front so a long scrape cannot fail at the
    # final write just because the directory is missing.
    os.makedirs(os.path.dirname(destination_file), exist_ok=True)

    url = get_electorate_url(electorate)

    print(f"{electorate} - {url}")

    # list for storing the localities
    locality_dfs = list()

    # first navigate to page
    browser.visit(url)
    sleep(1)

    # counter for the current page
    current_page = 1

    # get the page data, the maximum number of pages, and
    # if there are more pages
    page_df, max_pages, more_pages = get_page_data(browser, current_page)

    print(f"\tmax pages: {max_pages} - more pages: {more_pages}")

    # get the first locality data
    locality_dfs.append(page_df)

    while current_page != max_pages:
        current_page += 1

        page_df, max_pages, more_pages = get_page_data(browser, current_page)
        if page_df is None:
            print(f"\tCouldn't find a link to click {current_page}")
            break

        locality_dfs.append(page_df)

        if current_page == max_pages and more_pages:
            # because pagination takes us to the next page
            # increment our current_page
            current_page += 1
            page_df, max_pages, more_pages = get_page_data(browser, current_page, True)

            # in this case we have probably clicked the "..." link to go backwards
            if page_df is None:
                break

            print(f"\tmax pages: {max_pages} - more pages: {more_pages}")
            locality_dfs.append(page_df)

    electorate_df = pd.concat(locality_dfs)
    electorate_df.to_csv(destination_file, index = False)
    

In [9]:
# Scrape each electorate in turn; ones whose CSV already exists are skipped
# inside save_electorate_data.
for electorate in electorates:
    save_electorate_data(electorate=electorate)

Grayndler - skipping file already exists
Menzies - skipping file already exists
Cunningham - skipping file already exists
Watson - skipping file already exists
Holt - skipping file already exists
Blaxland - skipping file already exists
Franklin - skipping file already exists
Isaacs - skipping file already exists
Dickson - skipping file already exists
Richmond - skipping file already exists
Hunter - skipping file already exists
Mitchell - skipping file already exists
Flinders - skipping file already exists
Swan - skipping file already exists
Ballarat - skipping file already exists
Bowman - skipping file already exists
Farrer - skipping file already exists
Forrest - skipping file already exists
Corio - skipping file already exists
Cook - skipping file already exists
Blair - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=Blair&filterby=Electorate
	getting page 1
	max pages: 5.0 - more pages: True
	getting page 2
	getting page 3
	getting page 4
	getting page 5
	paging to g

	paging to get additional results
	getting page 11
	max pages: 15.0 - more pages: True
	getting page 12
	getting page 13
	getting page 14
	getting page 15
	paging to get additional results
	getting page 16
	max pages: 20.0 - more pages: True
	getting page 17
	getting page 18
	getting page 19
	getting page 20
	paging to get additional results
	getting page 21
	max pages: 21.0 - more pages: True
Forde - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=Forde&filterby=Electorate
	getting page 1
	max pages: 3 - more pages: False
	getting page 2
	getting page 3
Hasluck - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=Hasluck&filterby=Electorate
	getting page 1
	max pages: 3 - more pages: False
	getting page 2
	getting page 3
Leichhardt - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=Leichhardt&filterby=Electorate
	getting page 1
	max pages: 5.0 - more pages: True
	getting page 2
	getting page 3
	getting page 4
	getting page 5
	paging to get a

	max pages: 28.0 - more pages: True
	getting page 27
	getting page 28
	paging to get additional results
	getting page 29
	We have navigated backwards: max_pages: 25.0 last page: 29
Gellibrand - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=Gellibrand&filterby=Electorate
	getting page 1
	max pages: 2 - more pages: False
	getting page 2
Hinkler - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=Hinkler&filterby=Electorate
	getting page 1
	max pages: 5 - more pages: False
	getting page 2
	getting page 3
	getting page 4
	getting page 5
Hotham - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=Hotham&filterby=Electorate
	getting page 1
	max pages: 2 - more pages: False
	getting page 2
Hume - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=Hume&filterby=Electorate
	getting page 1
	max pages: 5.0 - more pages: True
	getting page 2
	getting page 3
	getting page 4
	getting page 5
	paging to get additional results
	getting page 6
	m

	getting page 1
	max pages: 3 - more pages: False
	getting page 2
	getting page 3
Lyons - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=Lyons&filterby=Electorate
	getting page 1
	max pages: 5.0 - more pages: True
	getting page 2
	getting page 3
	getting page 4
	getting page 5
	paging to get additional results
	getting page 6
	max pages: 10.0 - more pages: True
	getting page 7
	getting page 8
	getting page 9
	getting page 10
	paging to get additional results
	getting page 11
	max pages: 15.0 - more pages: True
	getting page 12
	getting page 13
	getting page 14
	getting page 15
	paging to get additional results
	getting page 16
	We have navigated backwards: max_pages: 10.0 last page: 16
Macarthur - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=Macarthur&filterby=Electorate
	getting page 1
	max pages: 2 - more pages: False
	getting page 2
Macquarie - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=Macquarie&filterby=Electorate
	getting p

	max pages: 19.0 - more pages: True
	getting page 17
	getting page 18
	getting page 19
	paging to get additional results
	getting page 20
	We have navigated backwards: max_pages: 15.0 last page: 20
New England - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=New England&filterby=Electorate
	getting page 1
	max pages: 5.0 - more pages: True
	getting page 2
	getting page 3
	getting page 4
	getting page 5
	paging to get additional results
	getting page 6
	max pages: 10.0 - more pages: True
	getting page 7
	getting page 8
	getting page 9
	getting page 10
	paging to get additional results
	getting page 11
	max pages: 15.0 - more pages: True
	getting page 12
	getting page 13
	getting page 14
	getting page 15
	paging to get additional results
	getting page 16
	max pages: 18.0 - more pages: True
	getting page 17
	getting page 18
	paging to get additional results
	getting page 19
	We have navigated backwards: max_pages: 15.0 last page: 19
Bennelong - https://electorate.aec.gov.

	getting page 1
	max pages: 2 - more pages: False
	getting page 2
Reid - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=Reid&filterby=Electorate
	getting page 1
	max pages: 2 - more pages: False
	getting page 2
Ryan - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=Ryan&filterby=Electorate
	getting page 1
	max pages: 2 - more pages: False
	getting page 2
Stirling - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=Stirling&filterby=Electorate
	getting page 1
	max pages: 2 - more pages: False
	getting page 2
Sturt - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=Sturt&filterby=Electorate
	getting page 1
	max pages: 4 - more pages: False
	getting page 2
	getting page 3
	getting page 4
Warringah - https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter=Warringah&filterby=Electorate
	getting page 1
	max pages: 2 - more pages: False
	getting page 2
Wentworth - https://electorate.aec.gov.au/LocalitySearchResults.aspx?fil