In [1]:
import os
from bs4 import BeautifulSoup
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from time import sleep

In [2]:
! mkdir 03_extract_aec_electorates

A subdirectory or file 03_extract_aec_electorates already exists.


In [3]:
OUTPUT_DIR = "03_extract_aec_electorates"

# AEC Localitites

The Australian Electoral Commission website has a page where you can search for localities by postcode or electorate  - https://electorate.aec.gov.au/LocalitySearchResults.aspx

Given there isn't an API we will scrape these pages for every electorate, that is present in the in the representives data as exported from the [theyvoteforyou.org.au API](https://theyvoteforyou.org.au/help/data#people)

In [4]:
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [C:\Users\ryanc\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache


 


# Read in Representatives

Then get the unique electorates from Representitives we downloaded from theyvoteforyou.org.au

In [5]:
representatives_df = pd.read_csv("02_transform_they_vote_for_you/output.csv")

In [6]:
electorates = list(representatives_df["electorate"].unique())

Defining a function for getting a url for a specific electorate.

In [7]:
def get_electorate_url(electorate):
    return f'https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter={electorate}&filterby=Electorate'

# Navigating to Page and Dealing with Pagnation

The result for a locality/suburb search may be split over several pages. Furthermore not all of the pages are listed in the results. In order to get all of the results we will need to click through each of the page numbers and then on the `...` link to get additional pages.
<img src="resources/AEC_Localities.png">

Given the above example, when we scrape the web page we'll have a DataFrame with page links available to us.

In [8]:
# example pagination dataframe to demonstrate `get_max_pages` function

example_pagination_df = pd.DataFrame([
    {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: "..."}
])

example_pagination_df


Unnamed: 0,0,1,2,3,4,5
0,1,2,3,4,5,...


What we need to determine, is not only how many pages there. But we need to also determine if their are more page numbers that aren't currently being displayed - denoted by the `...` text being present.

In [9]:
def get_max_pages(df):
    # get the maximum number of pages    
    pages_df = df.T
    pages_df.columns = ["pages"]
    
    # by converting to number and coercing non-numeric values
    # and if there is a non-numeric value we will have a 
    # `null` value which we can detect as confimration of
    # additional pages
    pages_df["pages"] = pd.to_numeric(pages_df["pages"], errors ="coerce")
    
    more_pages = any(pages_df["pages"].isnull())
    
    try:
        max_pages = pages_df["pages"].max()
    except:
        max_pages = None
    
    # return the known max number of pages
    # and if there are more pages to be fetched
    return max_pages, more_pages

As per our pagination example we will get back 5 known pages, and confirmation that there are more page numbers to be fetched.

In [10]:
get_max_pages(example_pagination_df)

(5.0, True)

# Extracting Page Data

Given the data we're interested in is contained in `<table>` element, we can use the functionality in [pandas.read_html](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_html.html)
to parse the webpage return any tables as DataFrames.
<img src="resources/AEC_Localities_html.png">



In [11]:
def get_page_data(browser, for_page, do_pagination = False):
    
    if do_pagination:
        print("\tpaging to get additional results")
    
    print(f"\tgetting page {for_page}")

    if do_pagination or for_page > 1:
        try:
            # first look at how many links we get back because in the case of 
            # "..." we might get two let's assume the the last item 
            # we get is the one we want
            
            link_text = str(for_page)
            if do_pagination:
                link_text = "..."
            
            found_links = browser.links.find_by_text(link_text)
            found_links[-1].click()
                        
            sleep(0.5)
            
        except ElementDoesNotExist:
            return None, None, None

    # extract pages from browser html
    # given that it is in a table element we
    # can read it into a pandas dataframe 
    # using pd.read_html
    tables = pd.read_html(browser.html)

    max_pages, more_pages = get_max_pages(tables[1])
    
    if max_pages == for_page:
        more_pages = False    
    
    # this occurs when we've attempted to paginate by clicking on "..."
    # but it takes us backwards, so we've likely got all of the pages
    if max_pages < for_page:
        print(f"\tWe have navigated backwards: max_pages: {max_pages} last page: {for_page}")
        return None, None, None        
    
    return tables[0], max_pages, more_pages

Now this bit of code could do with some refatoring as there's a lot of repeated logic. Part of the challenge is we're having to step through multiple sub-pages and finding out if there are more pages due to the "..." link

In [12]:
def save_electorate_data(electorate):
    global OUTPUT_DIR
    
    destination_file = f"{OUTPUT_DIR}/{electorate.lower()}.csv"
    
    if os.path.exists(destination_file):
        print(f"{electorate} - skipping file already exists")
        return
    
    url = get_electorate_url(electorate)
    
    print(f"{electorate} - {url}")

    # list for storing the localities
    locality_dfs = list()

    # first navigate to page
    browser.visit(url)
    sleep(1)
    
    # counter for the current page
    current_page = 1
    
    # the result of this is a list of dataframes
    # the first being the locality data (electorate / postcode)
    # the second being the number of pages
    tables = pd.read_html(browser.html)

    # get the page data, the maximum number of pages, and
    # if there are more pages
    page_df, max_pages, more_pages = get_page_data(browser, current_page)
    
    print(f"\tmax pages: {max_pages} - more pages: {more_pages}")

    # get the first locality data
    locality_dfs.append(page_df)
    
    while current_page != max_pages:
        current_page += 1
        
        page_df, max_pages, more_pages = get_page_data(browser, current_page)
        if page_df is None:
            print(f"\tCouldn't find a link to click {current_page}")
            break
        
        locality_dfs.append(page_df)
        
        if current_page == max_pages and more_pages:
            # because pagination takes us to the next page
            # increment our current_page
            current_page += 1
            page_df, max_pages, more_pages = get_page_data(browser, current_page, True)
            
            # in this case we have probably click the "..." link to go backwards
            if page_df is None:
                break
            
            print(f"\tmax pages: {max_pages} - more pages: {more_pages}")            
            locality_dfs.append(page_df)
            

    electorate_df = pd.concat(locality_dfs)
    electorate_df.to_csv(destination_file, index = False)
    

# Begin Scraping

This will initiate the scraping, it will take some time so go make yourself a cup of something to drink `:)`

In [13]:
for electorate in sorted(electorates):
    save_electorate_data(electorate)
    

Adelaide - skipping file already exists
Aston - skipping file already exists
Ballarat - skipping file already exists
Banks - skipping file already exists
Barker - skipping file already exists
Barton - skipping file already exists
Bass - skipping file already exists
Bean - skipping file already exists
Bendigo - skipping file already exists
Bennelong - skipping file already exists
Berowra - skipping file already exists
Blair - skipping file already exists
Blaxland - skipping file already exists
Bonner - skipping file already exists
Boothby - skipping file already exists
Bowman - skipping file already exists
Braddon - skipping file already exists
Bradfield - skipping file already exists
Brand - skipping file already exists
Brisbane - skipping file already exists
Bruce - skipping file already exists
Burt - skipping file already exists
Calare - skipping file already exists
Calwell - skipping file already exists
Canberra - skipping file already exists
Canning - skipping file already exists
C

In [14]:
browser.quit()