In [1]:
from bs4 import BeautifulSoup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from time import sleep

# AEC Localitites

The Australian Electoral Commission websites has a page where you can search for localities by postcode or electorate.

We will scrape these pages for every electorate in the representives we exported from theyvoteforyou.org.au

In [2]:
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 84.0.4147
[WDM] - Get LATEST driver version for 84.0.4147
[WDM] - Driver [C:\Users\ryanc\.wdm\drivers\chromedriver\win32\84.0.4147.30\chromedriver.exe] found in cache


 


# Read in Representatives

Then get the unique electorates

In [3]:
representatives_df = pd.read_csv("02_transform_they_vote_for_you/output.csv")

In [4]:
electorates = list(representatives_df["electorate"].unique())

Defining a function for getting a url for a specific electorate.

In [5]:
def get_electorate_url(electorate):
    return f'https://electorate.aec.gov.au/LocalitySearchResults.aspx?filter={electorate}&filterby=Electorate'

# Navigating to Page and Dealing with Pagnation

Import to have a sleep between requests so we don't deny-listed.

In [6]:
def get_max_pages(df):
    # get the maximum number of pages
    pages_df = df.T
    pages_df.columns = ["pages"]
    pages_df["pages"] = pd.to_numeric(pages_df["pages"], errors ="coerce")
    
    more_pages = any(pages_df["pages"].isnull())
    
    try:
        max_pages = pages_df["pages"].max()
    except:
        max_pages = None
        
    return max_pages, more_pages

In [7]:
def get_next_page_data(browser, for_page, electorate):   
    print(f"\tgetting page {for_page} for {electorate}")

    browser.links.find_by_text(str(page)).click()
    sleep(0.5)

    # extract pages from browser html
    tables = pd.read_html(browser.html)

    max_pages, more_pages = get_max_pages(tables[1])
    
    return tables[0], max_pages, more_pages

Now this bit of code could do with some refatoring as there's a lot of repeated logic. Part of the challenge is we're having to step through multiple sub-pages and finding out if there are more pages due to the "..." link

In [8]:
for electorate in electorates:

    url = get_electorate_url(electorate)

    # list for storing the localities
    locality_dfs = list()

    # first navigate to page
    browser.visit(url)
    sleep(1)
    
    # counter for the current page
    current_page = 1

    # the result of this is a list of dataframes
    # the first being the locality data (electorate / postcode)
    # the second being the number of pages
    tables = pd.read_html(browser.html)

    # determine the number of max pages
    max_pages, more_pages = get_max_pages(tables[1])
    
    print(f"{electorate} - number of pages {max_pages}")

    # get the first locality data
    locality_dfs.append(tables[0])

    # interate over each of the pages
    # skipping the first page as we already the data
    if isinstance(max_pages, int):
        for page in range(current_page + 1, max_pages + 1):
            
            data_df, next_max_pages, more_pages = get_next_page_data(
                browser, page, electorate
            )
            
            locality_dfs.append(data_df)
        
        current_page = page
        max_pages = next_max_pages
        
        # handle the "..." more pages link
        while more_pages:
            for page in range(current_page + 1, max_pages + 1):

                data_df, max_pages, more_pages = get_next_page_data(
                    browser, page, electorate
                )

                locality_dfs.append(data_df)            
            
    # if more_pages:
    electorate_df = pd.concat(locality_dfs)
    electorate_df.to_csv(f"01_extract_aec_electorates/{electorate}.csv", index = False)


Grayndler - number of pages 2
	getting page 2 for Grayndler
Menzies - number of pages 2
	getting page 2 for Menzies
Cunningham - number of pages 3
	getting page 2 for Cunningham
	getting page 3 for Cunningham
Watson - number of pages 2
	getting page 2 for Watson
Holt - number of pages nan
Blaxland - number of pages 2
	getting page 2 for Blaxland
Franklin - number of pages 5.0
Isaacs - number of pages 2
	getting page 2 for Isaacs
Dickson - number of pages 3
	getting page 2 for Dickson
	getting page 3 for Dickson
Richmond - number of pages 5.0
Hunter - number of pages 5.0
Mitchell - number of pages nan
Flinders - number of pages 3
	getting page 2 for Flinders
	getting page 3 for Flinders
Swan - number of pages 2
	getting page 2 for Swan
Ballarat - number of pages 5.0
Bowman - number of pages 2
	getting page 2 for Bowman
Farrer - number of pages 5.0
Forrest - number of pages 5.0
Corio - number of pages 2
	getting page 2 for Corio
Cook - number of pages 2
	getting page 2 for Cook
Blair - n

	getting page 3 for Spence
	getting page 4 for Spence
Clark - number of pages 2
	getting page 2 for Clark
Nicholls - number of pages 5.0
Cooper - number of pages 2
	getting page 2 for Cooper
Bean - number of pages 4
	getting page 2 for Bean
	getting page 3 for Bean
	getting page 4 for Bean
