In [1]:
import pandas as pd
import numpy as np
import glob
import re

In [2]:
! mkdir 04_transform_aec_electorates

A subdirectory or file 04_transform_aec_electorates already exists.


In [3]:
OUTPUT_DIR = "04_transform_aec_electorates"

# Read in Scraped Data

All of the scraped electorates will be in the output folder of the prior notebook. With each electorate as an indivdual csv file.

In [4]:
exported_files = glob.glob("03_extract_aec_electorates/*.csv")

Let's look at the first electorate to get an idea of the transformation we may need to apply.

In [5]:
print(exported_files[0])
df = pd.read_csv(exported_files[0])

03_extract_aec_electorates\adelaide.csv


In [6]:
df.head()

Unnamed: 0,State,Locality/Suburb,Postcode,Electorate,Redistributed Electorate,Other Locality(s)
0,SA,ADELAIDE,5000,Adelaide,,
1,SA,ALLENBY GARDENS,5009,Adelaide,,
2,SA,ANGLE PARK,5010,Adelaide,,
3,SA,ASHFORD,5035,Adelaide,,
4,SA,BLAIR ATHOL,5084,Adelaide,,


In [7]:
df.shape

(81, 6)

# Fix Column Headers

Let's normalise the column headers for a start.

In [8]:
def fix_column_headers(df):
    df.columns = [re.sub(r"[^\w]", "_", x.lower()) for x in df.columns]
    return df

In [9]:
df = fix_column_headers(df)
df.head()

Unnamed: 0,state,locality_suburb,postcode,electorate,redistributed_electorate,other_locality_s_
0,SA,ADELAIDE,5000,Adelaide,,
1,SA,ALLENBY GARDENS,5009,Adelaide,,
2,SA,ANGLE PARK,5010,Adelaide,,
3,SA,ASHFORD,5035,Adelaide,,
4,SA,BLAIR ATHOL,5084,Adelaide,,


In [10]:
df.shape

(81, 6)

# Value Checks

Let's have a look at most frequently occuring values in the columns. 

I would expect that the `state` column would have 1 unique values, as I assume that suburbs in the same electorate are in the same state.

In [11]:
df.describe(include="all")

Unnamed: 0,state,locality_suburb,postcode,electorate,redistributed_electorate,other_locality_s_
count,81,81.0,81.0,81,4.0,4.0
unique,3,75.0,,3,,
top,SA,2.0,,Adelaide,,
freq,73,4.0,,73,,
mean,,,4607.790123,,1234.0,1234.0
std,,,1343.346127,,0.0,0.0
min,,,3.0,,1234.0,1234.0
25%,,,5008.0,,1234.0,1234.0
50%,,,5034.0,,1234.0,1234.0
75%,,,5081.0,,1234.0,1234.0


## Invalid `state` values

It appears we have 3 unique values for state, let's see whats happening here.

In [12]:
current_state = df.loc[0]["state"]
print("current_state", current_state)

current_state SA


In [13]:
df.loc[df["state"] != current_state]

Unnamed: 0,state,locality_suburb,postcode,electorate,redistributed_electorate,other_locality_s_
20,1234,1234,1234,1234,1234.0,1234.0
21,1,2,3,4,,
42,1234,1234,1234,1234,1234.0,1234.0
43,1,2,3,4,,
64,1234,1234,1234,1234,1234.0,1234.0
65,1,2,3,4,,
79,1234,1234,1234,1234,1234.0,1234.0
80,1,2,3,4,,


Looking at the indexes, I suspect that the invalid values are header or footer values found the table (potentially the page numbers?). Let's drop the rows that contain numeric values.

In [14]:
def drop_numeric_rows_in_column(df, column):
    return df.loc[~df[column].apply(lambda x: re.sub(r"[^a-z\s]", "", x.lower()).strip()).replace("", np.nan).isnull()]

In [15]:
df = drop_numeric_rows_in_column(df, "state")

In [16]:
df["state"].value_counts()

SA    73
Name: state, dtype: int64

In [17]:
df.describe(include="all")

Unnamed: 0,state,locality_suburb,postcode,electorate,redistributed_electorate,other_locality_s_
count,73,73,73.0,73,0.0,0.0
unique,1,73,,1,,
top,SA,CLEARVIEW,,Adelaide,,
freq,73,1,,73,,
mean,,,5044.972603,,,
std,,,30.66756,,,
min,,,5000.0,,,
25%,,,5010.0,,,
50%,,,5035.0,,,
75%,,,5081.0,,,


## Dropping Columns

The `redistributed_electorate` and `other_locality_s_` columns are for the most part empty or contain information that is not of interest. Let's drop them.

In [18]:
def drop_redundent_cols(df):
    return df.drop(["redistributed_electorate", "other_locality_s_"], axis = 1)

In [19]:
df = drop_redundent_cols(df)

In [20]:
df.head()

Unnamed: 0,state,locality_suburb,postcode,electorate
0,SA,ADELAIDE,5000,Adelaide
1,SA,ALLENBY GARDENS,5009,Adelaide
2,SA,ANGLE PARK,5010,Adelaide
3,SA,ASHFORD,5035,Adelaide
4,SA,BLAIR ATHOL,5084,Adelaide


In [21]:
df.dtypes

state              object
locality_suburb    object
postcode            int64
electorate         object
dtype: object

## Casing of `locality_suburb`

At present `locality_suburb` is all uppercase, I'd prefer to have the values in title case. The is a possibility of getting it wrong e.g. "Van De Merwe" rather than "van de Merwe" - but in the case of suburb names I feel this is an acceptable risk.

In [22]:
def title_case_locality_suburb(df):
    df["locality_suburb"] = df["locality_suburb"].astype(str).str.title()
    return df

In [23]:
df = title_case_locality_suburb(df)
df.head()

Unnamed: 0,state,locality_suburb,postcode,electorate
0,SA,Adelaide,5000,Adelaide
1,SA,Allenby Gardens,5009,Adelaide
2,SA,Angle Park,5010,Adelaide
3,SA,Ashford,5035,Adelaide
4,SA,Blair Athol,5084,Adelaide


# Combining The Transformations

We've thus fair transformed one data file, however we need to repeat the transformations on all of the files. Defining a function to run each of the transformations is a step towards having a transformation pipeline.

In [24]:
def transform_electorate_df(file_path):
    df = pd.read_csv(file_path)
    df = fix_column_headers(df)
    df = drop_numeric_rows_in_column(df, "state")
    df = drop_numeric_rows_in_column(df, "locality_suburb")
    df = drop_numeric_rows_in_column(df, "electorate")
    df = drop_redundent_cols(df)
    df = title_case_locality_suburb(df)
    
    df = df[["electorate", "locality_suburb", "postcode", "state"]]
    
    return df

In addition to fixing the individual datafiles, let's combine them into a single DataFrame.

In [25]:
transformed_dfs = list()

for exported_file in exported_files:
    transformed_dfs.append(transform_electorate_df(exported_file))

In [26]:
electorates_df = pd.concat(transformed_dfs)

In [27]:
electorates_df.shape

(8463, 4)

In [28]:
electorates_df.describe(include="all")

Unnamed: 0,electorate,locality_suburb,postcode,state
count,8463,8463,8463,8463
unique,151,7444,2231,8
top,O'Connor,Darlington,872,NSW
freq,100,7,59,2703


# Handling Postcodes

Although we will not be storing the postcodes as numbers they should be able to parsed as numbers.
The reason for this is that a postcode with leading `0`s (such as Darwin) would be improperly converted from:
`"0800"` to `800`

In [29]:
# Validate that our postcodes can all be parsed as numbers
try:
    pd.to_numeric(electorates_df["postcode"], errors = "raise")
except ValueError:
    raise("Postcodes not expected values")

In [30]:
electorates_df[["postcode"]].head()

Unnamed: 0,postcode
0,5000
1,5009
2,5010
3,5035
4,5084


In order to preserve leading `0`s we will convert the postcodes to strings.
Some additional effort has been taken to remove any decimal component of the numbers as strings.

In [31]:
electorates_df["postcode"] = electorates_df["postcode"].astype(str).str.split(".").str[0]

In [32]:
electorates_df.head()

Unnamed: 0,electorate,locality_suburb,postcode,state
0,Adelaide,Adelaide,5000,SA
1,Adelaide,Allenby Gardens,5009,SA
2,Adelaide,Angle Park,5010,SA
3,Adelaide,Ashford,5035,SA
4,Adelaide,Blair Athol,5084,SA


In [33]:
electorates_df.dtypes

electorate         object
locality_suburb    object
postcode           object
state              object
dtype: object

In [34]:
for col in electorates_df.columns:
    electorates_df[col] = electorates_df[col].astype(str).str.strip()

# Remove Duplicates

Given we're applied our transformations there may be some duplicated rows, let's drop them if any exist.

In [35]:
electorates_df.shape

(8463, 4)

In [36]:
electorates_df.drop_duplicates().shape

(8436, 4)

In [37]:
electorates_df = electorates_df.drop_duplicates()

# Write out Transformed Data

In [38]:
electorates_df.to_csv(f"{OUTPUT_DIR}/output.csv",index = False)