In [None]:
# Analysis of scraped council data
import pandas as pd

# Load the full council export, then show which columns it provides.
df = pd.read_csv(filepath_or_buffer="./data/cu_export_all.csv")
df.columns

In [None]:
# Distinct values of the council column, each cast to str, shown in sorted order.
council_names = list(map(str, df['Organisati_Council'].unique().tolist()))
sorted(council_names)

In [None]:
from helpers.file_helper import FileHelper
import pandas as pd

# Read the scraped council data.
# NOTE(review): FileHelper is a project helper not shown here; read_jsonlines_all
# presumably returns every record of the JSON-lines file — confirm.
output_file_path= "./data/cu_export_all_scraped.jsonl"
file_helper = FileHelper()
output_records = file_helper.read_jsonlines_all(output_file_path)
# One DataFrame built from the loaded records; the later cells filter this frame.
output_records_df = pd.DataFrame(output_records)

In [None]:
# How many scraped records were loaded
total_records = len(output_records_df)
print("Total records ", total_records)

In [None]:
# Columns in the scraped data; the cells below filter on
# "address", "has_error", "scraped_text" and "is_council_correct".
output_records_df.columns

In [None]:
# 1. Errors
# 1.1 Records whose address is an empty string in the sacommunity data
# Solution: add the address. Search the website or contact the person for it.
empty_address_df = output_records_df.loc[output_records_df["address"].eq("")]
empty_address_count = len(empty_address_df)
print("Empty address count ", empty_address_count)

In [None]:
# Errors: exceptions
# Errors other than an empty address.
# These errors could be: scraping is forbidden for the URL, too many requests,
# internal server error, etc.
# Solution: check the error message, then retry scraping.
errors_df = output_records_df.loc[
    output_records_df["has_error"].eq(True) & output_records_df["address"].ne("")
]
errors_count = len(errors_df)
print("Exceptions count ", errors_count)

In [None]:
# Show the exception records so they can be reviewed before retrying the scrape
errors_df

In [None]:
# Address is not recognised as a valid address by the source of scraping.
# Solution: update the address to a valid one. Search the website or contact
# the person for the address.
#
# na=False makes rows whose scraped_text is missing or not a string count as
# "not matching". Without it .str.startswith yields NaN for those rows and
# boolean indexing with a NaN-containing mask raises ValueError.
address_not_found_df = output_records_df[
    output_records_df["scraped_text"].str.startswith("No results found", na=False)
]
invalid_address_count = address_not_found_df.shape[0]
print("Invalid address count ", invalid_address_count)

In [None]:
# Council name mismatch
# The address has a value and the scrape returned a council name, but the
# council flag is False. This means the council is wrongly recorded on the website.
# Solution: update the council name and the corresponding electoral state and
# federal state.
#
# na=True treats a missing or non-string scraped_text as "no result", so after
# the negation those rows are excluded instead of `~` failing on a NaN mask.
# NOTE(review): confirm that rows without scraped text should not count as
# mismatches.
# `== False` is kept as-is: a row whose flag is missing compares unequal to
# False and is therefore not counted.
council_name_mismatch_df = output_records_df[
    (output_records_df["is_council_correct"] == False)
    & (output_records_df["address"] != "")
    & (~output_records_df["scraped_text"].str.startswith("No results found", na=True))
]
council_name_mismatch_count = council_name_mismatch_df.shape[0]
print("council_name_mismatch_count ",council_name_mismatch_count)

In [None]:
# Preview the first rows of the mismatched records
council_name_mismatch_df.head()

In [None]:
# TODO: This seems to be an error
# https://www.lga.sa.gov.au/sa-councils/councils-listing#map returns the electoral ward
# Needs investigation of the difference between electoral state and electoral ward
# Electorate State mismatch
# Address has value and scraped data returned council name. Council name is correct, but electorate state is wrong
# Solution: Update the council name, and corresponding electoral state and federal state
# electorate_state_mismatch_df = output_records_df[(output_records_df["is_electorate_state_correct"] == False) 
#                                              & (output_records_df["address"] != "")
#                                              & (~output_records_df["scraped_text"].str.startswith("No results found"))]
# electorate_state_mismatch_df.shape

In [None]:
# electorate_state_mismatch_df.head(2)

In [None]:
# Summary: one line per metric computed in the cells above
summary_rows = [
    ("Total records ", total_records),
    ("Empty address count ", empty_address_count),
    ("Exceptions count ", errors_count),
    ("Invalid address count ", invalid_address_count),
    ("council_name_mismatch_count ", council_name_mismatch_count),
]
for label, value in summary_rows:
    print(label, value)