In [1]:
import glob
import os
import re

import numpy as np
import pandas as pd

In [2]:
# Folder that the final combined CSV is written into by the last cell.
OUTPUT_DIR = "04_transform_aec_electorates"

In [3]:
# Collect the CSVs produced by the previous (extract) step.
# glob returns paths in arbitrary, filesystem-dependent order, so sort them:
# this keeps the sample file picked via exported_files[0] and the row order of
# the combined output the same from run to run.
exported_files = sorted(glob.glob("03_extract_aec_electorates/*.csv"))

In [4]:
# Load the first exported file as a sample frame for trying out the cleaning
# steps in the cells below. Raises IndexError if no CSV files were found.
df = pd.read_csv(exported_files[0])

# Fix Column Headers

In [5]:
def fix_column_headers(df):
    """Normalise the column labels of ``df`` to lower-case, underscore-separated names.

    Every label is lower-cased and each character that is not a word
    character (letter, digit or underscore) is replaced with ``_``; for
    example ``"Other Locality(s)"`` becomes ``"other_locality_s_"``.

    The labels are rewritten on the frame that was passed in, and that same
    frame object is returned.
    """
    non_word = re.compile(r"[^\w]")
    renamed = []
    for label in df.columns:
        renamed.append(non_word.sub("_", label.lower()))
    df.columns = renamed
    return df

In [6]:
# Apply the header clean-up to the sample frame and preview the result.
df = fix_column_headers(df)
df.head()

Unnamed: 0,state,locality_suburb,postcode,electorate,redistributed_electorate,other_locality_s_
0,SA,ADELAIDE,5000,Adelaide,,
1,SA,ALLENBY GARDENS,5009,Adelaide,,
2,SA,ANGLE PARK,5010,Adelaide,,
3,SA,ASHFORD,5035,Adelaide,,
4,SA,BLAIR ATHOL,5084,Adelaide,,


In [7]:
# Summarise every column (numeric and non-numeric) to spot anomalies.
df.describe(include="all")

Unnamed: 0,state,locality_suburb,postcode,electorate,redistributed_electorate,other_locality_s_
count,81,81.0,81.0,81,4.0,4.0
unique,3,75.0,,3,,
top,SA,1234.0,,Adelaide,,
freq,73,4.0,,73,,
mean,,,4607.790123,,1234.0,1234.0
std,,,1343.346127,,0.0,0.0
min,,,3.0,,1234.0,1234.0
25%,,,5008.0,,1234.0,1234.0
50%,,,5034.0,,1234.0,1234.0
75%,,,5081.0,,1234.0,1234.0


In [8]:
# Inspect the rows of the sample file whose state is not "SA".
df.loc[df["state"] != "SA"]

Unnamed: 0,state,locality_suburb,postcode,electorate,redistributed_electorate,other_locality_s_
20,1234,1234,1234,1234,1234.0,1234.0
21,1,2,3,4,,
42,1234,1234,1234,1234,1234.0,1234.0
43,1,2,3,4,,
64,1234,1234,1234,1234,1234.0,1234.0
65,1,2,3,4,,
79,1234,1234,1234,1234,1234.0,1234.0
80,1,2,3,4,,


In [9]:
def drop_numeric_rows_in_column(df, column):
    """Return the rows of ``df`` whose ``column`` value contains at least one letter.

    Each value is lower-cased, stripped of everything except ``a-z`` and
    whitespace, and then trimmed; rows where nothing is left (e.g. ``"1234"``)
    are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the column to test.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that passed the test, with their original index
        labels.

    Notes
    -----
    Missing values and non-string values (for example a column parsed as
    integers) are treated as having no letters and are dropped, instead of
    raising ``AttributeError`` from calling ``.lower()`` on a non-string.
    """
    values = df[column]
    letters_only = (
        values.astype(str)
        .str.lower()
        .str.replace(r"[^a-z\s]", "", regex=True)
        .str.strip()
    )
    # astype(str) turns a missing value into the text "nan", which would
    # otherwise look alphabetic, so missing values are excluded explicitly.
    keep = values.notna() & letters_only.ne("")
    return df.loc[keep]

In [10]:
# Remove the rows whose state value contains no letters.
df = drop_numeric_rows_in_column(df, "state")

In [11]:
# Check which state values remain after the filter.
df["state"].value_counts()

SA    73
Name: state, dtype: int64

In [12]:
# Re-run the summary on the filtered sample frame.
df.describe(include="all")

Unnamed: 0,state,locality_suburb,postcode,electorate,redistributed_electorate,other_locality_s_
count,73,73,73.0,73,0.0,0.0
unique,1,73,,1,,
top,SA,MILE END SOUTH,,Adelaide,,
freq,73,1,,73,,
mean,,,5044.972603,,,
std,,,30.66756,,,
min,,,5000.0,,,
25%,,,5010.0,,,
50%,,,5035.0,,,
75%,,,5081.0,,,


In [13]:
def drop_redundent_cols(df):
    """Return a copy of ``df`` without the two columns this pipeline does not use.

    The removed columns are ``redistributed_electorate`` and
    ``other_locality_s_``. A ``KeyError`` is raised if either is missing.
    """
    unused_columns = ["redistributed_electorate", "other_locality_s_"]
    return df.drop(columns=unused_columns)

In [14]:
# Drop the two unused columns from the sample frame.
df = drop_redundent_cols(df)

In [15]:
# Preview the sample frame after the column drop.
df.head()

Unnamed: 0,state,locality_suburb,postcode,electorate
0,SA,ADELAIDE,5000,Adelaide
1,SA,ALLENBY GARDENS,5009,Adelaide
2,SA,ANGLE PARK,5010,Adelaide
3,SA,ASHFORD,5035,Adelaide
4,SA,BLAIR ATHOL,5084,Adelaide


In [16]:
# Check the dtype pandas inferred for each remaining column.
df.dtypes

state              object
locality_suburb    object
postcode            int64
electorate         object
dtype: object

In [17]:
def title_case_locality_suburb(df):
    """Convert the ``locality_suburb`` column of ``df`` to title case.

    Values are first cast to ``str`` and then title-cased, e.g.
    ``"ALLENBY GARDENS"`` becomes ``"Allenby Gardens"``.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    locality_text = df["locality_suburb"].astype(str)
    df["locality_suburb"] = locality_text.str.title()
    return df

In [18]:
# Title-case the locality names in the sample frame and preview the result.
df = title_case_locality_suburb(df)
df.head()

Unnamed: 0,state,locality_suburb,postcode,electorate
0,SA,Adelaide,5000,Adelaide
1,SA,Allenby Gardens,5009,Adelaide
2,SA,Angle Park,5010,Adelaide
3,SA,Ashford,5035,Adelaide
4,SA,Blair Athol,5084,Adelaide


In [19]:
def transform_electorate_df(file_path):
    """Read one exported electorate CSV and return it cleaned.

    The steps, in order:

    1. normalise the column headers;
    2. drop rows whose ``state``, ``locality_suburb`` or ``electorate`` value
       contains no letters;
    3. drop the unused columns;
    4. title-case ``locality_suburb``.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``electorate``, ``locality_suburb``,
        ``postcode`` and ``state``, in that order.
    """
    output_columns = ["electorate", "locality_suburb", "postcode", "state"]
    cleaned = (
        pd.read_csv(file_path)
        .pipe(fix_column_headers)
        .pipe(drop_numeric_rows_in_column, "state")
        .pipe(drop_numeric_rows_in_column, "locality_suburb")
        .pipe(drop_numeric_rows_in_column, "electorate")
        .pipe(drop_redundent_cols)
        .pipe(title_case_locality_suburb)
    )
    return cleaned[output_columns]

In [20]:
# Clean every exported file; the result is one frame per file, in the same
# order as exported_files.
transformed_dfs = [transform_electorate_df(path) for path in exported_files]



In [21]:
# Stack the per-file frames into one table. Each input frame keeps the row
# labels from its own file, so without ignore_index the combined frame would
# carry many duplicate index labels; renumber the rows 0..n-1 instead.
electorates_df = pd.concat(transformed_dfs, ignore_index=True)

In [22]:
# Row and column count of the combined frame.
electorates_df.shape

(17715, 4)

In [23]:
# Summarise every column of the combined frame.
electorates_df.describe(include="all")

Unnamed: 0,electorate,locality_suburb,postcode,state
count,17715,17715,17715,17715
unique,151,15631,2753,8
top,Lingiari,Red Hill,872,NSW
freq,1526,10,474,5024


Check that every postcode is numeric before converting the column to text.

In [24]:
# Fail fast if any postcode cannot be parsed as a number. The parsed values
# are discarded; only the validation matters here.
try:
    pd.to_numeric(electorates_df["postcode"], errors="raise")
except ValueError as err:
    # Raise a real exception: `raise "<text>"` is itself a TypeError because
    # a str is not an exception. Chaining keeps pandas' message, which names
    # the offending value.
    raise ValueError("Postcodes not expected values") from err

In [25]:
# Store postcodes as text: trim whitespace and drop any fractional part left
# by float parsing ("5000.0" -> "5000").
postcode_text = electorates_df["postcode"].astype(str).str.strip().str.split(".").str[0]
# Numeric parsing drops leading zeros (the summary above shows a postcode of
# 872), so left-pad all-digit values back to 4 characters. Values that are not
# purely digits (e.g. "nan") are left untouched.
# NOTE(review): assumes every postcode is meant to be 4 digits - confirm.
electorates_df["postcode"] = postcode_text.mask(
    postcode_text.str.isdigit(), postcode_text.str.zfill(4)
)

In [26]:
# Preview the combined frame after the postcode conversion.
electorates_df.head()

Unnamed: 0,electorate,locality_suburb,postcode,state
0,Adelaide,Adelaide,5000,SA
1,Adelaide,Allenby Gardens,5009,SA
2,Adelaide,Angle Park,5010,SA
3,Adelaide,Ashford,5035,SA
4,Adelaide,Blair Athol,5084,SA


In [27]:
# Cast every column to text and trim leading/trailing whitespace, so that
# values differing only by padding compare equal.
for col in electorates_df.columns:
    as_text = electorates_df[col].astype(str)
    electorates_df[col] = as_text.str.strip()

In [28]:
# Row count before de-duplication, for comparison with the next cell.
electorates_df.shape

(17715, 4)

In [29]:
# Row count once fully identical rows are removed (not yet assigned).
electorates_df.drop_duplicates().shape

(17668, 4)

In [30]:
# Keep only the first occurrence of each fully identical row.
electorates_df = electorates_df.drop_duplicates()

In [31]:
# Create the output folder if it is missing: to_csv does not create parent
# directories, and the write would fail on a fresh checkout without it.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Write the combined table without the row index.
electorates_df.to_csv(f"{OUTPUT_DIR}/output.csv", index=False)