In [82]:
# Analysis of Scraped council data 


In [83]:
from helpers.file_helper import FileHelper
import pandas as pd

# read scraped council data
output_file_path= "./data/cu_export_all_scraped.jsonl"
file_helper = FileHelper()
output_records = file_helper.read_jsonlines_all(output_file_path)
output_records_df = pd.DataFrame(output_records)

In [84]:
# Number of data available
total_records = output_records_df.shape[0]
print("Total records ", total_records)

Total records  7067


In [85]:
# columns in the data
output_records_df.columns

Index(['org_id', 'address', 'council', 'electorate_state',
       'electorate_federal', 'error_message', 'has_error', 'council_scraped',
       'electorate_state_scraped', 'is_council_correct',
       'is_electorate_state_correct', 'scraped_text'],
      dtype='object')

In [86]:
# 1. Errors
# 1.1 Address is empty in the sacommunity data
# Solution: Add address. Search the website or contact the person for address
empty_address_df = output_records_df[output_records_df["address"] == ""]
empty_address_df.shape
empty_address_count = empty_address_df.shape[0]
print("Empty address count ", empty_address_count)

Empty address count  50


In [87]:
# Errors. Exception
# other erros than empty address
# These erros could be scraping is forbidden in the url, too many requests, internal server error, etc
# Solution: check for error message, and retry scraping
errors_df = output_records_df[(output_records_df['has_error'] == True) & (output_records_df["address"] != "")]
errors_df.shape
errors_count = errors_df.shape[0]
print("Exceptions count ", errors_count)

Exceptions count  125


In [88]:
errors_df

Unnamed: 0,org_id,address,council,electorate_state,electorate_federal,error_message,has_error,council_scraped,electorate_state_scraped,is_council_correct,is_electorate_state_correct,scraped_text
2067,197500,40 Derlanger Avenue Nailsworth,City of Prospect,Enfield,Adelaide,Message: unknown error: net::ERR_ADDRESS_UNREA...,True,,,False,False,
2069,197504,"St Philip's Anglican Church, Broadview",City of Prospect,Enfield,Adelaide,Message: unknown error: net::ERR_INTERNET_DISC...,True,,,False,False,
2070,196951,"Women's Memorial Playing Field, St Marys",City of Mitcham,Elder,Boothby,Message: unknown error: net::ERR_INTERNET_DISC...,True,,,False,False,
2071,196952,"Colonel Light Gardens RSL, Colonel Light Gardens",City of Mitcham,Elder,Boothby,Message: unknown error: net::ERR_INTERNET_DISC...,True,,,False,False,
2072,197505,49 Prospect Rd Prospect,City of Prospect,Adelaide,Adelaide,Message: unknown error: net::ERR_INTERNET_DISC...,True,,,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...
3475,200345,"Gawler Sport & Recreation Centre, Gawler",Town of Gawler,Light,Spence,Message: unknown error: net::ERR_INTERNET_DISC...,True,,,False,False,
3476,200346,"1 Light Square, Gawler",Town of Gawler,Light,Spence,Message: unknown error: net::ERR_INTERNET_DISC...,True,,,False,False,
3492,199450,Cnr North Coast Rd and North Cape Rd Wisanger,Kangaroo Island Council,Mawson,Mayo,'str' object has no attribute 'get',True,,,False,False,
3494,199063,Lyndoch,The Barossa Council,Schubert,Barker,'str' object has no attribute 'get',True,,,False,False,


In [89]:
# Address is not recognised as a valid address by the source of scraping
# Update address to a valid one. Search the website or contact the person for address
address_not_found_df = output_records_df[output_records_df["scraped_text"].str.startswith("No results found")]
address_not_found_df.shape
invalid_address_count = address_not_found_df.shape[0]
print("Invalid address count ", invalid_address_count)

Invalid address count  690


In [90]:
# Council name mismatch
# Address has value and scraped data returned council name. This means wrongly recorded in the website
# Solution: Update the council name, and corresponding electoral state and federal state
council_name_mismatch_df = output_records_df[(output_records_df["is_council_correct"] == False) 
                                             & (output_records_df["address"] != "")
                                             & (~output_records_df["scraped_text"].str.startswith("No results found"))]
council_name_mismatch_df.shape
council_name_mismatch_count = council_name_mismatch_df.shape[0]
print("council_name_mismatch_count ",council_name_mismatch_count)

council_name_mismatch_count  940


In [91]:
council_name_mismatch_df.head()

Unnamed: 0,org_id,address,council,electorate_state,electorate_federal,error_message,has_error,council_scraped,electorate_state_scraped,is_council_correct,is_electorate_state_correct,scraped_text
2,193934,21 Sturt St,Berri Barmera Council,Chaffey,Barker,,False,City of Marion,WARRIPARINGA WARD,False,False,City of Marion\nCouncil Name City of Marion\nE...
10,193944,Edward St,District Council of Coober Pedy,Giles,Grey,,False,City of Marion,WOODLANDS WARD,False,False,City of Marion\nCouncil Name City of Marion\nE...
15,193949,2 Railway Pde,Regional Council of Goyder,Stuart,Grey,,False,Outback Communities,,False,False,Outback Communities\nCouncil Name Outback Comm...
16,193950,Hanson St,Light Regional Council,Schubert,Barker,,False,Regional Council of Goyder,BURRA WARD,False,False,Regional Council of Goyder\nCouncil Name Regio...
20,193955,7 Bruce Ave,Tatiara District Council,MacKillop,Barker,,False,District Council of Mount Remarkable,WILLOCHRA WARD,False,False,District Council of Mount Remarkable\nCouncil ...


In [92]:
# TODO: This seems some error
# https://www.lga.sa.gov.au/sa-councils/councils-listing#map returns electoral ward
# Need investigation betwen electoral state and electoral ward
# Electorate State mismatch
# Address has value and scraped data returned council name. Council name is correct, but electorate state is wrong
# Solution: Update the council name, and corresponding electoral state and federal state
# electorate_state_mismatch_df = output_records_df[(output_records_df["is_electorate_state_correct"] == False) 
#                                              & (output_records_df["address"] != "")
#                                              & (~output_records_df["scraped_text"].str.startswith("No results found"))]
# electorate_state_mismatch_df.shape

In [93]:
# electorate_state_mismatch_df.head(2)

In [94]:
# Summary
print("Total records ", total_records)
print("Empty address count ", empty_address_count)
print("Exceptions count ", errors_count)
print("Invalid address count ", invalid_address_count)
print("council_name_mismatch_count ",council_name_mismatch_count)

Total records  7067
Empty address count  50
Exceptions count  125
Invalid address count  690
council_name_mismatch_count  940
