In [1]:
import pandas as pd
import re
import json

pd.set_option('display.max_columns', 100)

In [2]:
class PlanningData:
    def __init__(self, year, f_path, sheet_number):
        self.year = year
        self.f_path = f_path
        self.sheet_number = sheet_number
        self.df = self.load_data()

    def load_data(self):
        file_path = f'data/{self.f_path}'
        return pd.read_excel(file_path, engine='pyxlsb', sheet_name=self.sheet_number)
    
    def clean_column_names(self):
        self.df.columns = [
            re.sub('__c$', '', c).lower().replace(' ', '_').replace('__', '_')
            for c in self.df.columns
        ]

    def rename_column_names(self, column_map):
        self.df = self.df.rename(columns=column_map)

    def drop_columns(self, columns_to_drop):
        self.df = self.df.drop(columns=columns_to_drop, errors='ignore')


In [3]:
planning_years = {
    2009: ('VBA-DataVic-Building-Permits-2009.xlsb', 1),
    2010: ('VBA-DataVic-Building-Permits-2010.xlsb', 0),
    2011: ('VBA-DataVic-Building-Permits-2011.xlsb', 0),
    2012: ('VBA-DataVic-Building-Permits-2012.xlsb', 0),
    2013: ('VBA-DataVic-Building-Permits-2013.xlsb', 0),
    2014: ('VBA-DataVic-Building-Permits-2014.xlsb', 0),
    2015: ('VBA-DataVic-Building-Permits-2015.xlsb', 0),
    2016: ('VBA-DataVic-Building-Permits-2016.xlsb', 0),
    2017: ('VBA-DataVic-Building-Permits-2017.xlsb', 1),
    2018: ('VBA-DataVic-Building-Permits-2018.xlsb', 1),
    2019: ('VBA-DataVic-Building-Permits-2019.xlsb', 1),
    2020: ('VBA-DataVic-Building-Permits-2020.xlsb', 1),
    2021: ('VBA-DataVic-Building-Permits-2021-Dec.xlsb', 1),
    2022: ('December-2022-Raw-data.xlsb', 1),
    2023: ('20240067-Raw-Data-December-2023.xlsb', 1),
    2024: ('20240219-Raw-Data-January-2024.xlsb', 1)
}

In [4]:
planning_dict = {year: PlanningData(year, f_path, sheet_number) for year, (f_path, sheet_number) in planning_years.items()}

In [5]:
# Clean column names
for year in planning_dict:
    planning_dict[year].clean_column_names()

### EDA on columns


In [6]:
def print_means(start, end, colname):
    print(f'--- {colname} ---')
    for i in range(start, end+1):
        print(i, ':', round(pd.to_numeric(planning_dict[i].df[colname], errors='coerce').mean(), 2))
    print('')
    

In [17]:
def print_value_counts(start, end, colname):
    print(f'--- {colname} ---')
    for i in range(start, end+1):
        print(i, ':', ', '.join(map(str, planning_dict[i].df[colname].value_counts().iloc[:3].index)))
    print('')    

In [18]:
print_value_counts(2011,2024, 'permit_stage_number')

--- permit_stage_number ---
2011 : 0, 1, 2
2012 : 0, 1, 2
2013 : 0, 1, 2
2014 : 0, 1, 2
2015 : 0, 1, 2
2016 : 0, 1, 2
2017 : 0, 1, 2
2018 : 0, 1, 2
2019 : 0, 1, 2
2020 : 0, 1, 2
2021 : 0, 1, 2
2022 : 0, 1, 2
2023 : 0, 1, 2
2024 : 0.0, 1.0, 2.0



In [19]:
print_value_counts(2011,2024, 'basis_month_y')

--- basis_month_y ---
2011 : 2011
2012 : 2012
2013 : 2013
2014 : 2014
2015 : 2015
2016 : 2016
2017 : 2017
2018 : 2018
2019 : 2019
2020 : 2020
2021 : 2021
2022 : 2022
2023 : 2023
2024 : 2024.0



In [20]:
print_value_counts(2011,2024, 'basis_month_m')

--- basis_month_m ---
2011 : 11, 3, 9
2012 : 10, 5, 8
2013 : 10, 5, 11
2014 : 10, 5, 9
2015 : 9, 10, 7
2016 : 11, 4, 5
2017 : 10, 8, 5
2018 : 10, 5, 11
2019 : 10, 6, 5
2020 : 11, 7, 10
2021 : 3, 5, 8
2022 : 3, 8, 5
2023 : 10, 11, 3
2024 : 1.0



In [21]:
print('LEVY PAID')
print_means(2011,2019, 'reported_levy_amount')
#print_means(2011,2019, 'calculated_levy_amount') # reported and calculated are very similar
print_means(2020,2024, 'original_levy_paid')

LEVY PAID
--- reported_levy_amount ---
2011 : 308.31
2012 : 313.46
2013 : 336.76
2014 : 326.4
2015 : 354.13
2016 : 370.87
2017 : 404.32
2018 : 447.52
2019 : 473.21

--- original_levy_paid ---
2020 : 805.47
2021 : 963.16
2022 : 1320.44
2023 : 1654.05
2024 : 1596.57



In [22]:
print('REPORTED COST OF WORKS')
print_means(2011,2024, 'reported_cost_of_works')

REPORTED COST OF WORKS
--- reported_cost_of_works ---
2011 : 238281.75
2012 : 238720.86
2013 : 243458.9
2014 : 258638.11
2015 : 277357.73
2016 : 291855.67
2017 : 317009.1
2018 : 349650.15
2019 : 375377.98
2020 : 359781.49
2021 : 349278.61
2022 : 418014.99
2023 : 482841.02
2024 : 956965.62



In [23]:
print('SITE STREET')
print_value_counts(2011,2019, 'site_street')
print_value_counts(2020,2023, 'site_street_name')
print_value_counts(2024,2024, 'cleaned_site_street_name')

SITE STREET
--- site_street ---
2011 : HIGH STREET, COLLINS STREET, BURWOOD HIGHWAY
2012 : HIGH STREET, COLLINS STREET, PRINCES HIGHWAY
2013 : HIGH STREET, COLLINS STREET, VICTORIA STREET
2014 : HIGH STREET, COLLINS STREET, STATION STREET
2015 : HIGH STREET, COLLINS STREET, BOURKE STREET
2016 : COLLINS STREET, HIGH STREET, MAIN STREET
2017 : COLLINS STREET, HIGH STREET, MAIN STREET
2018 : COLLINS STREET, HIGH STREET, BOURKE STREET
2019 : COLLINS STREET, HIGH STREET, Collins Street

--- site_street_name ---
2020 : High Street, Collins Street, Victoria Street
2021 : High Street, Collins Street, Station Street
2022 : Collins Street, High Street, Main Street
2023 : Collins Street, High Street, Main Street

--- cleaned_site_street_name ---
2024 : Collins Street, High Street, Nepean Highway



In [24]:
print('SITE SUBURB')
print_value_counts(2011,2019, 'site_suburb')
print_value_counts(2020,2024, 'site_town_suburb')

SITE SUBURB
--- site_suburb ---
2011 : POINT COOK, PAKENHAM, MELBOURNE
2012 : MELBOURNE, CRAIGIEBURN, POINT COOK
2013 : MELBOURNE, CRAIGIEBURN, POINT COOK
2014 : MELBOURNE, CRAIGIEBURN, POINT COOK
2015 : MELBOURNE, POINT COOK, CRAIGIEBURN
2016 : MELBOURNE, POINT COOK, CLYDE NORTH
2017 : MELBOURNE, CLYDE NORTH, POINT COOK
2018 : MELBOURNE, CLYDE NORTH, POINT COOK
2019 : MELBOURNE, WERRIBEE, CLYDE NORTH

--- site_town_suburb ---
2020 : Tarneit, Truganina, Wollert
2021 : Tarneit, Truganina, Clyde North
2022 : Tarneit, Clyde North, Melbourne
2023 : Tarneit, Clyde North, Melbourne
2024 : Tarneit, Wollert, Clyde North



In [25]:
print('MUNICIPALITY')
print_value_counts(2011,2019, 'municipal_name')
print_value_counts(2020,2024, 'site_municipality')

MUNICIPALITY
--- municipal_name ---
2011 : Wyndham, Greater Geelong, Whittlesea
2012 : Greater Geelong, Casey, Whittlesea
2013 : Greater Geelong, Casey, Wyndham
2014 : Greater Geelong, Casey, Wyndham
2015 : Greater Geelong, Casey, Wyndham
2016 : Casey, Greater Geelong, Wyndham
2017 : Wyndham, Casey, Greater Geelong
2018 : Wyndham, Casey, Greater Geelong
2019 : Wyndham, Greater Geelong, Casey

--- site_municipality ---
2020 : Wyndham , Greater Geelong , Casey 
2021 : Melton , Greater Geelong , Wyndham 
2022 : Greater Geelong, Melton, Casey
2023 : Casey , Greater Geelong , Melton 
2024 : Casey , Greater Geelong , Wyndham 



In [26]:
print_value_counts(2011,2024, 'municipal_full_name')

--- municipal_full_name ---
2011 : Wyndham, City of, Greater Geelong, City of, Whittlesea, City of
2012 : Greater Geelong, City of, Casey, City of, Whittlesea, City of
2013 : Greater Geelong, City of, Casey, City of, Wyndham, City of
2014 : Greater Geelong, City of, Casey, City of, Wyndham, City of
2015 : Greater Geelong, City of, Casey, City of, Wyndham, City of
2016 : Casey, City of, Greater Geelong, City of, Wyndham, City of
2017 : Wyndham, City of, Casey, City of, Greater Geelong, City of
2018 : Wyndham, City of, Casey, City of, Greater Geelong, City of
2019 : Wyndham, City of, Greater Geelong, City of, Casey, City of
2020 : Wyndham, City of, Greater Geelong, City of, Casey, City of
2021 : Melton, Shire of, Greater Geelong, City of, Wyndham, City of
2022 : Greater Geelong, City of, Melton, Shire of, Casey, City of
2023 : Greater Geelong, City of, Melton, Shire of, Casey, City of
2024 : Casey, City of, Greater Geelong, City of, Wyndham, City of



In [27]:
print_value_counts(2011,2024, 'region')

--- region ---
2011 : Metropolitan, Rural
2012 : Metropolitan, Rural
2013 : Metropolitan, Rural
2014 : Metropolitan, Rural
2015 : Metropolitan, Rural
2016 : Metropolitan, Rural
2017 : Metropolitan, Rural
2018 : Metropolitan, Rural
2019 : Metropolitan, Rural
2020 : Metropolitan, Rural
2021 : Metropolitan, Rural
2022 : Metropolitan, Rural
2023 : Metropolitan, Rural
2024 : Metropolitan, Rural



In [28]:
print_value_counts(2011,2024, 'sub_region')

--- sub_region ---
2011 : Outer Melbourne, Inner Melbourne, South West
2012 : Outer Melbourne, Inner Melbourne, South West
2013 : Outer Melbourne, Inner Melbourne, South West
2014 : Outer Melbourne, Inner Melbourne, South West
2015 : Outer Melbourne, Inner Melbourne, South West
2016 : Outer Melbourne, Inner Melbourne, South West
2017 : Outer Melbourne, Inner Melbourne, South West
2018 : Outer Melbourne, Inner Melbourne, South West
2019 : Outer Melbourne, Inner Melbourne, South West
2020 : Outer Melbourne, Inner Melbourne, South West
2021 : Outer Melbourne, Inner Melbourne, South West
2022 : Outer Melbourne, Inner Melbourne, 0x2a
2023 : Outer Melbourne, Inner Melbourne, South West
2024 : Outer Melbourne, Inner Melbourne, South West



In [29]:
print_value_counts(2011,2024, 'sub_region1')

--- sub_region1 ---
2011 : South Eastern, Eastern, Inner East
2012 : South Eastern, Eastern, Inner East
2013 : South Eastern, Inner East, Eastern
2014 : South Eastern, Inner East, Eastern
2015 : South Eastern, Eastern, Inner East
2016 : South Eastern, Eastern, Inner East
2017 : South Eastern, Eastern, Inner East
2018 : South Eastern, Eastern, South Western
2019 : South Eastern, South Western, Eastern
2020 : South Eastern, South Western, Western
2021 : South Eastern, Western, Eastern
2022 : South Eastern, 0x2a, Western
2023 : South Eastern, Western, Eastern
2024 : South Eastern, Inner North, Inner East



In [30]:
print('ALLOTMENT AREA')
print_means(2011,2024, 'allotment_area')

ALLOTMENT AREA
--- allotment_area ---
2011 : 11864.51
2012 : 15123.09
2013 : 17279.69
2014 : 16151.65
2015 : 20604.46
2016 : 21924.12
2017 : 33038.56
2018 : 37622.86
2019 : 40424.45
2020 : 84743.08
2021 : 114454.06
2022 : 72901.63
2023 : 97620.88
2024 : 133012.04



In [31]:
print('BUILDER SUBURB')
print_value_counts(2011,2019, 'builder_suburb')
print_value_counts(2020,2024, 'builder_town_suburb')

BUILDER SUBURB
--- builder_suburb ---
2011 : SOUTH MELBOURNE, MELBOURNE, MOUNT WAVERLEY
2012 : MOUNT WAVERLEY, SOUTH MELBOURNE, SYNDAL
2013 : TULLAMARINE, MULGRAVE, MOUNT WAVERLEY
2014 : MELBOURNE, TULLAMARINE, MULGRAVE
2015 : MELBOURNE, MOUNT WAVERLEY, PORT MELBOURNE
2016 : MELBOURNE, MOUNT WAVERLEY, DOCKLANDS
2017 : MOUNT WAVERLEY, DOCKLANDS, MELBOURNE
2018 : MELBOURNE, MOUNT WAVERLEY, DOCKLANDS
2019 : MELBOURNE, DOCKLANDS, MOUNT WAVERLEY

--- builder_town_suburb ---
2020 : Mount Waverley, Melbourne, Mulgrave
2021 : Mount Waverley, Melbourne, Derrimut
2022 : Mount Waverley, Melbourne, Derrimut
2023 : Mount Waverley, Derrimut, Melbourne
2024 : Mulgrave, Melbourne, Derrimut



In [32]:
print_value_counts(2011,2024, 'builder_state')

--- builder_state ---
2011 : VIC, NSW, Vic
2012 : VIC, NSW, Vic
2013 : VIC, NSW, Vic
2014 : VIC, NSW, Vic
2015 : VIC, NSW, QLD
2016 : VIC, NSW, QLD
2017 : VIC, NSW, Vic
2018 : VIC, NSW, QLD
2019 : VIC, Vic, Victoria
2020 : VIC, Vic, Victoria
2021 : VIC, Vic, Victoria
2022 : VIC, Vic, Victoria
2023 : VIC, Vic, Victoria
2024 : VIC, Vic, Victoria



In [33]:
print('BUILDER POSTCODE')
print_value_counts(2011,2019, 'builder_pcode')
print_value_counts(2020,2024, 'builder_postcode')

BUILDER POSTCODE
--- builder_pcode ---
2011 : 3149, 3205, 3030
2012 : 3149, 3030, 3205
2013 : 3149, 3030, 3043
2014 : 3149, 3030, 3043
2015 : 3149, 3030, 3207
2016 : 3149, 3030, 3008
2017 : 3149, 3008, 3030
2018 : 3149, 3008, 3030
2019 : 3149, 3149, 3008

--- builder_postcode ---
2020 : VIC, 3149, 3008
2021 : 3149, 3008, 3030
2022 : 3149, 3008, 3030
2023 : 3149, 3030, 3008
2024 : 3149.0, 3170.0, 3008.0



In [34]:
print('FLOOR MATERIAL')
print_value_counts(2011,2019, 'material_code_floor')
print_value_counts(2020,2024, 'floor_material')

FLOOR MATERIAL
--- material_code_floor ---
2011 : 0, 20, 40
2012 : 0, 20, 40
2013 : 0, 20, 40
2014 : 0, 20, 40
2015 : 0, 20, 40
2016 : 0, 20, 40
2017 : 0.0, 20.0, 40.0
2018 : 20.0, 0.0, 40.0
2019 : 20.0, 0.0, 40.0

--- floor_material ---
2020 : 20.0, 0.0, 40.0
2021 : 20.0, 0.0, 40.0
2022 : 20.0, 0.0, 40.0
2023 : 20.0, 0.0, 40.0
2024 : 20.0, 0.0, 40.0



In [35]:
print('FRAME MATERIAL')
print_value_counts(2011,2019, 'material_code_frame')
print_value_counts(2020,2024, 'frame_material')

FRAME MATERIAL
--- material_code_frame ---
2011 : 40, 0, 60
2012 : 40, 0, 60
2013 : 0, 40, 60
2014 : 0, 40, 60
2015 : 0, 40, 60
2016 : 0, 40, 60
2017 : 0.0, 40.0, 60.0
2018 : 40, 0, 60
2019 : 40.0, 0.0, 60.0

--- frame_material ---
2020 : 40.0, 60.0, 0.0
2021 : 40.0, 60.0, 0.0
2022 : 40.0, 60.0, 0.0
2023 : 40.0, 60.0, 0.0
2024 : 40.0, 60.0, 0.0



In [36]:
print('ROOF MATERIAL')
print_value_counts(2011,2019, 'material_code_roof')
print_value_counts(2020,2024, 'roof_cladding_material')

ROOF MATERIAL
--- material_code_roof ---
2011 : 0, 60, 10
2012 : 0, 60, 10
2013 : 0, 60, 10
2014 : 0, 60, 10
2015 : 0, 60, 10
2016 : 0, 60, 10
2017 : 0.0, 60.0, 10.0
2018 : 0.0, 60.0, 10.0
2019 : 60.0, 10.0, 0.0

--- roof_cladding_material ---
2020 : 60.0, 10.0, 0.0
2021 : 60.0, 10.0, 0.0
2022 : 60.0, 10.0, 0.0
2023 : 60.0, 10.0, 0.0
2024 : 60.0, 10.0, 0.0



In [37]:
print('WALL MATERIAL')
print_value_counts(2011,2019, 'material_code_walls')
print_value_counts(2020,2024, 'external_wall_material')

WALL MATERIAL
--- material_code_walls ---
2011 : 0, 12, 60
2012 : 0, 12, 60
2013 : 0, 12, 60
2014 : 0, 12, 60
2015 : 0, 12, 60
2016 : 0, 12, 60
2017 : 0, 12, 60
2018 : 0.0, 12.0, 60.0
2019 : 12.0, 0.0, 60.0

--- external_wall_material ---
2020 : 12.0, 0.0, 60.0
2021 : 12.0, 0.0, 60.0
2022 : 12.0, 0.0, 80.0
2023 : 12.0, 0.0, 80.0
2024 : 12.0, 0.0, 80.0



In [38]:
print('PRE-WORK DWELLINGS')
print_means(2011,2019, 'dwellings_before_work')
print_means(2020,2024, 'number_of_existing_dwellings')

PRE-WORK DWELLINGS
--- dwellings_before_work ---
2011 : 0.36
2012 : 0.48
2013 : 0.37
2014 : 0.37
2015 : 0.4
2016 : 0.34
2017 : 0.36
2018 : 0.34
2019 : 0.47

--- number_of_existing_dwellings ---
2020 : 0.43
2021 : 0.43
2022 : 0.41
2023 : 0.42
2024 : 0.38



In [39]:
print('POST-WORK DWELLINGS')
print_means(2011,2019, 'dwellings_after_work')
print_means(2020,2024, 'number_of_new_dwellings')

POST-WORK DWELLINGS
--- dwellings_after_work ---
2011 : 0.77
2012 : 1.1
2013 : 0.6
2014 : 0.94
2015 : 0.78
2016 : 0.77
2017 : 0.75
2018 : 0.85
2019 : 1.06

--- number_of_new_dwellings ---
2020 : 0.93
2021 : 0.85
2022 : 0.79
2023 : 0.79
2024 : 0.67



In [40]:
print_means(2011,2024, 'number_of_storeys')

--- number_of_storeys ---
2011 : 0.93
2012 : 0.94
2013 : 0.95
2014 : 0.97
2015 : 0.97
2016 : 0.97
2017 : 1.0
2018 : 1.06
2019 : 1.75
2020 : 1.45
2021 : 1.51
2022 : 1.38
2023 : 2.33
2024 : 1.55



In [42]:
planning_dict[2011].df['permit_date']

0         35465
1         35466
2         35478
3         35478
4         35478
          ...  
101618    40908
101619    40908
101620    40908
101621    40908
101622    40908
Name: permit_date, Length: 101623, dtype: int64

In [43]:
print('DEMOLISHED')
print_means(2011,2019, 'number_demolished')
print_means(2020,2024, 'number_of_dwellings_demolished')

DEMOLISHED
--- number_demolished ---
2011 : 0.05
2012 : 0.39
2013 : 0.09
2014 : 0.08
2015 : 0.09
2016 : 0.1
2017 : 0.12
2018 : 0.1
2019 : 0.12

--- number_of_dwellings_demolished ---
2020 : 0.1
2021 : 0.08
2022 : 0.1
2023 : 0.09
2024 : 0.08



In [44]:
print('FLOOR AREA')
print_means(2011,2019, 'floor_area')
print_means(2020,2024, 'total_floor_area')

FLOOR AREA
--- floor_area ---
2011 : 202.48
2012 : 200.05
2013 : 215.15
2014 : 216.07
2015 : 226.07
2016 : 227.4
2017 : 258.78
2018 : 291.31
2019 : 421.34

--- total_floor_area ---
2020 : 419.52
2021 : 346.69
2022 : 419.24
2023 : 463.87
2024 : 403.38



In [45]:
print_value_counts(2011,2019, 'multiple_dwellings')

--- multiple_dwellings ---
2011 : 1
2012 : 1
2013 : 1
2014 : 1
2015 : 1, 2
2016 : 1
2017 : 1, 0
2018 : 1, 0, 2
2019 : 1.0, 0.0



In [46]:
print_means(2011,2019, 'cost_of_works_domestic')

--- cost_of_works_domestic ---
2011 : 39406.34
2012 : 35007.71
2013 : 32672.58
2014 : 39305.3
2015 : 48161.79
2016 : 50679.85
2017 : 53005.71
2018 : 57815.78
2019 : 59605.8



In [47]:
print('BACV/DBDRV LEVY')
print_means(2011,2019, 'calculated_levy_bacv')
print_means(2020,2022, 'dbdrv_levy')
print_means(2023,2024, 'dbdrv_amount')

BACV/DBDRV LEVY
--- calculated_levy_bacv ---
2011 : 151.42
2012 : 152.25
2013 : 154.8
2014 : 164.02
2015 : 176.66
2016 : 185.59
2017 : 202.04
2018 : 223.1
2019 : 243.65

--- dbdrv_levy ---
2020 : 219.28
2021 : 224.56
2022 : 275.34

--- dbdrv_amount ---
2023 : 285.52
2024 : 333.03



In [48]:
print('SOLAR HOT WATER')
print_means(2011,2019, 'solar_hot_water')
print_value_counts(2020,2024, 'solar_hot_water_indicator')

SOLAR HOT WATER
--- solar_hot_water ---
2011 : 0.19
2012 : 0.18
2013 : 0.16
2014 : 0.18
2015 : 0.19
2016 : 0.17
2017 : 0.17
2018 : 0.19
2019 : 0.19

--- solar_hot_water_indicator ---
2020 : N, Y
2021 : N, Y
2022 : N, Y
2023 : N, Y
2024 : N, Y



In [49]:
print('RAINWATER TANK')
print_means(2011,2019, 'rainwater_tank')
print_value_counts(2020,2024, 'rainwater_tank_indicator')

RAINWATER TANK
--- rainwater_tank ---
2011 : 0.1
2012 : 0.09
2013 : 0.08
2014 : 0.08
2015 : 0.08
2016 : 0.07
2017 : 0.07
2018 : 0.07
2019 : 0.08

--- rainwater_tank_indicator ---
2020 : N, Y
2021 : N, Y
2022 : N, Y
2023 : N, Y
2024 : N, Y



In [50]:
print('ESTIMATED PROJECT COST')
print_means(2011,2019, 'est_cost_project')
print_means(2020,2024, 'total_estimated_cost_of_works')

ESTIMATED PROJECT COST
--- est_cost_project ---
2011 : 754957.48
2012 : 640676.33
2013 : 748896.04
2014 : 832217.77
2015 : 1011460.9
2016 : 920012.9
2017 : 1049517.23
2018 : 1460535.99
2019 : 1781637.64

--- total_estimated_cost_of_works ---
2020 : 1683766.95
2021 : 1442301.04
2022 : 2037934.37
2023 : 3156740.78
2024 : 2463148.84



In [51]:
print('BUILDING USE')
print_value_counts(2011,2018, 'basis_zone')
print_value_counts(2019,2024, 'basis_building_use')

BUILDING USE
--- basis_zone ---
2011 : Domestic, Commercial, Retail
2012 : Domestic, Commercial, Retail
2013 : Domestic, Commercial, Retail
2014 : Domestic, Commercial, Retail
2015 : Domestic, Commercial, Retail
2016 : Domestic, Commercial, Retail
2017 : Domestic, Commercial, Retail
2018 : Domestic, Commercial, Retail

--- basis_building_use ---
2019 : Domestic, Commercial, Retail
2020 : Domestic, Commercial, Public Buildings
2021 : Domestic, Commercial, Public Buildings
2022 : Domestic, Commercial, Public Buildings
2023 : Domestic, Commercial, Retail
2024 : Domestic, Commercial, Public Buildings



In [52]:
print('BASIS NOW')
print_value_counts(2011,2024, 'basis_now')

BASIS NOW
--- basis_now ---
2011 : 1, 3, 4
2012 : 1, 3, 4
2013 : 1, 3, 4
2014 : 1, 3, 4
2015 : 1, 4, 3
2016 : 1, 4, 3
2017 : 1, 3, 4
2018 : 1, 3, 4
2019 : 1, 4, 3
2020 : 1, 4, 3
2021 : 1, 4, 3
2022 : 1, 4, 8
2023 : 1, 4, 3
2024 : 1.0, 4.0, 8.0



In [53]:
print_value_counts(2011,2024, 'basis_bca')

--- basis_bca ---
2011 : 1AI, 10A, 1A
2012 : 1AI, 10A, 1A
2013 : 1AI, 10A, 1A
2014 : 1AI, 10A, 1A
2015 : 1AI, 10A, 1A
2016 : 1AI, 10A, 1A
2017 : 1AI, 10A, 1A
2018 : 1AI, 10A, 1A
2019 : 1AI, 1ai, 10A
2020 : 1a(a), 1ai, 10a
2021 : 1a(a), 10a, 10b
2022 : 1a(a), 10a, 10b
2023 : 1a(a), 10a, 10b
2024 : 1a(a), 10a, 10b



In [54]:
print_value_counts(2011,2019, 'basis_ownershipsector')
print_value_counts(2020,2024, 'basis_ownership_sector')

--- basis_ownershipsector ---
2011 : P, S, L
2012 : P, S, L
2013 : P, S, L
2014 : P, S, L
2015 : P, S, L
2016 : P, S, L
2017 : P, S, L
2018 : P, S, L
2019 : P, S, L

--- basis_ownership_sector ---
2020 : P, S, L
2021 : P, S, L
2022 : P, S, L
2023 : P, S, L
2024 : P, S, L



In [55]:
print_value_counts(2011,2019, 'basis_ownerbuilder')
print_value_counts(2020,2024, 'basis_owner_builder')

--- basis_ownerbuilder ---
2011 : 0.0, -1.0, 2.0
2012 : 0.0, -1.0, 2.0
2013 : 0.0, -1.0, 2.0
2014 : 0.0, -1.0, 2.0
2015 : 0.0, -1.0, 2.0
2016 : 0.0, -1.0, 2.0
2017 : 0.0, -1.0, 2.0
2018 : 0.0, -1.0, 2.0
2019 : 0.0, -1.0, 2.0

--- basis_owner_builder ---
2020 : 0.0, -1.0, 2.0
2021 : 0.0, -1.0, 2.0
2022 : 0.0, -1.0, 2.0
2023 : 0.0, -1.0, 2.0
2024 : 0.0, -1.0, 2.0



In [56]:
print_value_counts(2024,2024, 'permit_name_(vba_use_only)')

--- permit_name_(vba_use_only) ---
2024 : P-00638804, P-00638399, P-00232620



In [57]:
print_value_counts(2024,2024, 'site_municipality_code')

--- site_municipality_code ---
2024 : 13.0, 27.0, 75.0



### COLUMN NAME CHANGE

In [58]:
col_name_change = {
    'site_street': 'site_street_name',
    'cleaned_site_street_name': 'site_street_name',
    'site_suburb': 'site_town_suburb',
    'site_pcode': 'site_postcode',
    'site_suburb': 'site_town_suburb',
    'municipal_name': 'site_municipality',
    'builder_suburb': 'builder_town_suburb',
    'builder_pcode': 'builder_postcode',
    'material_code_floor': 'floor_material',
    'material_code_frame': 'frame_material',
    'material_code_roof': 'roof_cladding_material',
    'material_code_walls': 'external_wall_material',
    'dwellings_before_work': 'number_of_existing_dwellings',
    'dwellings_after_work': 'number_of_new_dwellings',
    'number_demolished': 'number_of_dwellings_demolished',
    'floor_area': 'total_floor_area',
    'permit_app_date': 'building_permit_application_date',
    'calculated_levy_bacv': 'dbdrv_amount',
    'dbdrv_levy': 'dbdrv_amount',
    'solar_hot_water': 'solar_hot_water_indicator',
    'rainwater_tank': 'rainwater_tank_indicator',
    'est_cost_project': 'total_estimated_cost_of_works',
    'basis_zone': 'basis_building_use',
    'basis_ownershipsector': 'basis_ownership_sector',
    'basis_ownerbuilder': 'basis_owner_builder'
}

In [59]:
for year in planning_dict:
    planning_dict[year].rename_column_names(col_name_change)

In [60]:
planning_df = pd.concat([planning_dict[year].df for year in planning_dict])

In [61]:
# Cleaning up memory a bit
del planning_dict

In [62]:
planning_df = planning_df.reset_index(drop=True)

In [63]:
round(planning_df.isna().mean().sort_values(ascending=False),5)

site_municipality_code              0.99635
permit_name_(vba_use_only)          0.99635
original_levy_paid                  0.71577
sub_region1                         0.33000
bacv_applicable_flag                0.31445
cost_of_works_domestic              0.31445
multiple_dwellings                  0.31445
calculated_levy_amount              0.31444
reported_levy_amount                0.28424
basis_owner_builder                 0.16019
external_wall_material              0.04952
floor_material                      0.04935
frame_material                      0.04492
roof_cladding_material              0.04444
allotment_area                      0.04391
total_floor_area                    0.04025
number_of_storeys                   0.03668
dbdrv_amount                        0.03411
number_of_dwellings_demolished      0.01763
number_of_new_dwellings             0.01726
number_of_existing_dwellings        0.01609
solar_hot_water_indicator           0.01535
rainwater_tank_indicator        

In [64]:
cols_to_drop = ['permit_name_(vba_use_only)',
                'site_municipality_code',
                'bacv_applicable_flag',
                'cost_of_works_domestic',
                'multiple_dwellings',
                # 'original_levy_paid',  # Do I really care about some $1000 levy paid. It might have correlation with renovation amount?
                # 'calculated_levy_amount',
                # 'reported_levy_amount',
                ]

In [65]:
planning_df = planning_df.drop(columns=cols_to_drop)

### Last bits of cleaning before joining to dwelling data

In [66]:
bool_map = {'N': 0, 'Y': 1}

In [67]:
pd.set_option('future.no_silent_downcasting', True)
planning_df['solar_hot_water_indicator'] = planning_df['solar_hot_water_indicator'].replace(bool_map).astype(float)
planning_df['rainwater_tank_indicator'] = planning_df['rainwater_tank_indicator'].replace(bool_map).astype(float)

In [68]:
planning_df['permit_date'] = pd.to_datetime(pd.to_numeric(planning_df['permit_date'], errors='coerce'), origin='1900-01-01', unit='D')
planning_df['building_permit_application_date'] = pd.to_datetime(pd.to_numeric(planning_df['building_permit_application_date'], errors='coerce'), origin='1900-01-01', unit='D')


In [69]:
planning_df['dbdrv_amount'] = pd.to_numeric(planning_df['dbdrv_amount'], errors='coerce')

In [70]:
planning_df.isna().sum()

permit_stage_number                       1
permit_date                           12357
basis_month_y                             1
basis_month_m                             1
reported_levy_amount                 460799
calculated_levy_amount               509761
reported_cost_of_works                    0
site_street_name                        299
site_town_suburb                          8
site_postcode                             7
site_municipality                        10
municipal_full_name                       8
region                                    8
sub_region                                8
sub_region1                          534995
allotment_area                        71182
builder_town_suburb                    8875
builder_state                          4106
builder_postcode                        325
floor_material                        80011
frame_material                        72826
roof_cladding_material                72046
external_wall_material          

In [71]:
# Remove whitespace
for i in planning_df:
    if planning_df[i].dtype == 'O':
        planning_df[i] = planning_df[i].str.strip()

In [72]:
municipal_full_name_fixes = {
    'Colac Otway, Shire of': 'Colac-Otway, Shire of',
    'Moreland, City of': 'Merri-bek, City of', # Council name change in 2022
    }

site_municipality_fixes = {
    'Colac Otway': 'Colac-Otway',
    'City of Greater Geelong': 'Greater Geelong',
    '1+X2': 'Melbourne',
    'Queenscliff (B)': 'Queenscliffe',
    'Queenscliff': 'Queenscliffe',
    'Mt Sterling Alpine Resort': 'Mt Stirling Alpine Resort',
    'Central Goldfield': 'Central Goldfields',
    'La Trobe': 'Latrobe',
    'Mount alexander': 'Mount Alexander',
    'Mornington': 'Mornington Peninsula',
    'Port Philip': 'Port Phillip',
    'Warnambool': 'Warrnambool',
    'Moreland': 'Merri-bek', # Council name change in 2022
    }

In [73]:
planning_df['municipal_full_name'] = planning_df['municipal_full_name'].replace(municipal_full_name_fixes)
planning_df['site_municipality'] = planning_df['site_municipality'].replace(site_municipality_fixes)

In [74]:
planning_df.pivot_table(columns='municipal_full_name', index='basis_month_y', values='permit_stage_number', aggfunc=len)

municipal_full_name,"Alpine, Shire of","Ararat, Rural City of","Ballarat, City of","Banyule, City of","Bass Coast, Shire of","Baw Baw, Shire of","Bayside, City of","Benalla, Rural City of","Boroondara, City of","Brimbank, City of","Buloke, Shire of","Campaspe, Shire of","Cardinia, Shire of","Casey, City of","Central Goldfields, Shire of","Colac-Otway, Shire of","Corangamite, Shire of","Darebin, City of","Delatite, City of (ceased)","East Gippsland, Shire of",Falls Creek Alpine Resort,"Frankston, City of","Gannawarra, Shire of","Glen Eira, City of","Glenelg, Shire of","Golden Plains, Shire of","Greater Bendigo, City of","Greater Dandenong, City of","Greater Geelong, City of","Greater Shepparton, City of","Hepburn, Shire of","Hindmarsh, Shire of","Hobsons Bay, City of","Horsham, Rural City of","Hume, City of","Indigo, Shire of","Kingston, City of","Knox, City of",Lake Mountain Alpine Resort,"Latrobe, City of","Loddon, Shire of","Macedon Ranges, Shire of","Manningham, City of","Mansfield ,Shire of","Maribyrnong, City of","Maroondah, City of","Melbourne, City of","Melton, Shire of","Merri-bek, City of","Mildura, Rural City of","Mitchell, Shire of","Moira, Shire of","Monash, City of","Moonee Valley, City of","Moorabool, Shire of","Mornington Peninsula, Shire of","Mount Alexander, Shire of","Moyne, Shire of",Mt Baw Baw Alpine Resort,Mt Buller Alpine Resort,Mt Hotham Alpine Resort,Mt Stirling Alpine Resort,"Murrindindi, Shire of","Nillumbik, Shire of","Northern Grampians, Shire of","Port Phillip, City of","Pyrenees, Shire of","Queenscliffe, Borough of","South Gippsland, Shire of","Southern Grampians, Shire of","Stonnington, City of","Strathbogie, Shire of","Surf Coast, Shire of","Swan Hill, Rural City of","Towong, Shire of","Wangaratta, Rural City of","Warrnambool, City of","Wellington, Shire of","West Wimmera, Shire of","Whitehorse, City of","Whittlesea, City of","Wodonga, Rural City of","Wyndham, City of","Yarra Ranges, Shire of","Yarra, City of","Yarriambiack, Shire of"
basis_month_y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1
2009.0,371.0,218.0,2181.0,1450.0,1203.0,1489.0,1808.0,289.0,2796.0,2732.0,88.0,763.0,2437.0,4809.0,272.0,521.0,364.0,1954.0,,1054.0,8.0,1780.0,224.0,1987.0,378.0,543.0,2507.0,1433.0,4768.0,1399.0,360.0,102.0,1174.0,506.0,3024.0,364.0,2099.0,1866.0,1.0,1650.0,156.0,1063.0,1153.0,271.0,1183.0,1426.0,2756.0,3094.0,1853.0,1114.0,1044.0,708.0,2371.0,1736.0,745.0,3581.0,469.0,413.0,1.0,4.0,2.0,1.0,943.0,893.0,220.0,1084.0,149.0,139.0,725.0,365.0,1659.0,236.0,985.0,418.0,107.0,710.0,636.0,1130.0,67.0,2061.0,4322.0,925.0,5874.0,2385.0,1318.0,101.0
2010.0,377.0,193.0,2159.0,1602.0,1401.0,1516.0,2044.0,273.0,3360.0,2453.0,92.0,747.0,2577.0,4678.0,270.0,501.0,353.0,2022.0,,1228.0,5.0,2479.0,191.0,2229.0,390.0,622.0,2641.0,1764.0,4994.0,1381.0,441.0,93.0,1343.0,535.0,2970.0,403.0,2166.0,1965.0,,1505.0,168.0,1098.0,1413.0,274.0,1202.0,1493.0,2816.0,3080.0,2004.0,1105.0,1127.0,784.0,2602.0,1842.0,819.0,4079.0,456.0,402.0,2.0,6.0,3.0,,786.0,987.0,175.0,1269.0,174.0,159.0,808.0,322.0,1695.0,297.0,1104.0,505.0,105.0,600.0,656.0,1166.0,59.0,2423.0,4726.0,890.0,7014.0,2325.0,1319.0,117.0
2011.0,309.0,184.0,2211.0,1432.0,1223.0,1350.0,1714.0,263.0,2917.0,2135.0,138.0,817.0,2516.0,4118.0,254.0,455.0,302.0,1821.0,,1114.0,4.0,1734.0,170.0,2010.0,299.0,631.0,2788.0,1804.0,4767.0,1237.0,480.0,75.0,1097.0,385.0,2683.0,374.0,1856.0,1762.0,,1207.0,140.0,1028.0,1287.0,331.0,1217.0,1452.0,2680.0,2658.0,1926.0,1026.0,1102.0,708.0,2364.0,1749.0,682.0,3678.0,449.0,365.0,2.0,4.0,3.0,,514.0,762.0,180.0,1194.0,145.0,120.0,773.0,339.0,1517.0,254.0,1071.0,376.0,90.0,513.0,693.0,1174.0,63.0,2526.0,4517.0,776.0,5041.0,2152.0,1249.0,96.0
2012.0,380.0,158.0,1995.0,1538.0,1061.0,1182.0,1568.0,233.0,2753.0,1669.0,120.0,750.0,1933.0,4182.0,279.0,447.0,296.0,1695.0,,1009.0,10.0,1689.0,205.0,1717.0,306.0,683.0,3027.0,1624.0,4794.0,1115.0,396.0,83.0,1180.0,348.0,2946.0,316.0,1768.0,1618.0,,1108.0,160.0,949.0,1124.0,284.0,1108.0,1562.0,2552.0,2102.0,1940.0,1044.0,926.0,567.0,2325.0,1641.0,818.0,3382.0,437.0,381.0,3.0,16.0,7.0,,407.0,779.0,176.0,1126.0,201.0,118.0,703.0,310.0,1541.0,258.0,963.0,350.0,118.0,512.0,613.0,1002.0,76.0,2258.0,3862.0,785.0,3776.0,1886.0,1272.0,105.0
2013.0,372.0,193.0,1999.0,1561.0,1084.0,1161.0,1740.0,244.0,3126.0,1719.0,184.0,691.0,1834.0,4500.0,283.0,439.0,245.0,1915.0,,922.0,14.0,1584.0,181.0,1914.0,357.0,901.0,2434.0,1699.0,5136.0,963.0,398.0,111.0,1195.0,408.0,3180.0,357.0,1707.0,1692.0,,1043.0,196.0,1031.0,1221.0,246.0,1203.0,1662.0,2858.0,2317.0,2132.0,967.0,870.0,617.0,2443.0,1737.0,676.0,3448.0,407.0,346.0,,11.0,,,371.0,843.0,192.0,1257.0,194.0,128.0,674.0,331.0,1576.0,263.0,916.0,366.0,111.0,459.0,524.0,946.0,46.0,2437.0,3448.0,756.0,3519.0,1980.0,1394.0,116.0
2014.0,283.0,172.0,2075.0,1607.0,1101.0,1205.0,1931.0,264.0,3520.0,1715.0,126.0,697.0,2192.0,4920.0,256.0,416.0,312.0,2131.0,,1049.0,9.0,1481.0,198.0,2128.0,330.0,611.0,3006.0,1787.0,5491.0,1105.0,440.0,106.0,1332.0,400.0,3243.0,335.0,1920.0,1832.0,,1228.0,179.0,1110.0,1404.0,282.0,1169.0,1707.0,3213.0,2501.0,2135.0,1057.0,988.0,658.0,2784.0,2015.0,718.0,3728.0,494.0,374.0,,4.0,13.0,,349.0,769.0,173.0,1352.0,171.0,118.0,678.0,284.0,1788.0,303.0,922.0,356.0,102.0,524.0,568.0,927.0,56.0,2575.0,3765.0,927.0,4243.0,2091.0,1458.0,128.0
2015.0,340.0,179.0,2077.0,1631.0,1125.0,1243.0,2094.0,266.0,3452.0,1791.0,118.0,632.0,2351.0,5387.0,289.0,479.0,328.0,2198.0,,984.0,16.0,1429.0,200.0,2084.0,305.0,634.0,2615.0,1866.0,5460.0,1161.0,453.0,72.0,1370.0,430.0,3544.0,376.0,2082.0,1933.0,,1217.0,156.0,1030.0,1573.0,245.0,1280.0,1781.0,3133.0,2710.0,2324.0,1078.0,959.0,687.0,2975.0,1879.0,778.0,3807.0,430.0,390.0,,7.0,3.0,,390.0,795.0,186.0,1341.0,165.0,128.0,687.0,300.0,1820.0,296.0,1054.0,411.0,95.0,498.0,523.0,988.0,55.0,2875.0,3983.0,982.0,4664.0,2196.0,1372.0,101.0
2016.0,321.0,166.0,1955.0,1838.0,1235.0,1294.0,2115.0,257.0,3230.0,1612.0,119.0,230.0,2525.0,6077.0,239.0,475.0,301.0,2032.0,,1090.0,11.0,1412.0,188.0,2121.0,298.0,616.0,2389.0,1711.0,5646.0,1145.0,868.0,80.0,1404.0,407.0,3872.0,357.0,2140.0,2088.0,1.0,1130.0,140.0,1052.0,1564.0,303.0,1237.0,1723.0,3289.0,3088.0,2351.0,1060.0,1214.0,724.0,2870.0,1898.0,738.0,4082.0,414.0,362.0,,9.0,4.0,,341.0,873.0,209.0,1329.0,179.0,125.0,720.0,301.0,2047.0,267.0,1019.0,364.0,89.0,506.0,566.0,985.0,61.0,2749.0,3639.0,991.0,5523.0,2310.0,1318.0,105.0
2017.0,359.0,174.0,1984.0,1679.0,1388.0,1458.0,2050.0,330.0,3034.0,1761.0,107.0,221.0,3121.0,6512.0,189.0,476.0,308.0,1976.0,,1218.0,13.0,1582.0,153.0,2274.0,296.0,592.0,2401.0,1707.0,5776.0,1175.0,988.0,72.0,1327.0,314.0,4631.0,376.0,2347.0,1997.0,5.0,1021.0,174.0,1178.0,1747.0,272.0,1217.0,1467.0,3402.0,3308.0,2098.0,1090.0,1111.0,689.0,2899.0,1859.0,789.0,3972.0,451.0,344.0,2.0,12.0,5.0,,437.0,847.0,179.0,1283.0,167.0,131.0,742.0,294.0,1851.0,305.0,1154.0,393.0,101.0,542.0,592.0,997.0,67.0,2496.0,3377.0,1009.0,6615.0,2276.0,1425.0,124.0
2018.0,373.0,184.0,2255.0,1694.0,1498.0,1645.0,2021.0,307.0,2705.0,1638.0,115.0,401.0,2581.0,6512.0,177.0,480.0,317.0,1807.0,,1231.0,7.0,1546.0,191.0,2082.0,303.0,641.0,2321.0,1629.0,6074.0,1149.0,778.0,81.0,1345.0,306.0,4799.0,405.0,2261.0,1869.0,1.0,1217.0,189.0,1190.0,1653.0,314.0,1212.0,1478.0,3372.0,3720.0,1964.0,1117.0,1331.0,669.0,2866.0,1894.0,877.0,4067.0,527.0,333.0,1.0,15.0,3.0,,386.0,827.0,191.0,1213.0,174.0,122.0,731.0,310.0,1857.0,330.0,1337.0,371.0,86.0,521.0,554.0,1006.0,73.0,2341.0,3455.0,1059.0,6939.0,2153.0,1409.0,104.0


In [75]:
planning_df.pivot_table(columns='site_municipality', index='basis_month_y', values='permit_stage_number', aggfunc=len)

site_municipality,Alpine,Ararat,Ballarat,Banyule,Bass Coast,Baw Baw,Bayside,Benalla,Boroondara,Brimbank,Buloke,Campaspe,Cardinia,Casey,Central Goldfields,Colac-Otway,Corangamite,Darebin,East Gippsland,Falls Creek Alpine Resort,Frankston,Gannawarra,Glen Eira,Glenelg,Golden Plains,Greater Bendigo,Greater Dandenong,Greater Geelong,Greater Shepparton,Hepburn,Hindmarsh,Hobsons Bay,Horsham,Hume,Indigo,Kingston,Knox,Lake Mountain Alpine Resort,Latrobe,Loddon,Macedon Ranges,Manningham,Mansfield,Maribyrnong,Maroondah,Melbourne,Melton,Merri-bek,Mildura,Mitchell,Moira,Monash,Moonee Valley,Moorabool,Mornington Peninsula,Mount Alexander,Moyne,Mt Baw Baw Alpine Resort,Mt Buller Alpine Resort,Mt Hotham Alpine Resort,Mt Stirling Alpine Resort,Murrindindi,Nillumbik,Northern Grampians,Port Phillip,Pyrenees,Queenscliffe,South Gippsland,Southern Grampians,Stonnington,Strathbogie,Surf Coast,Swan Hill,Towong,Wangaratta,Warrnambool,Wellington,West Wimmera,Whitehorse,Whittlesea,Wodonga,Wyndham,Yarra,Yarra Ranges,Yarriambiack
basis_month_y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1
2009.0,371.0,218.0,2181.0,1450.0,1203.0,1489.0,1808.0,289.0,2796.0,2732.0,88.0,763.0,2437.0,4809.0,272.0,521.0,364.0,1954.0,1054.0,8.0,1780.0,224.0,1987.0,378.0,543.0,2507.0,1433.0,4768.0,1399.0,360.0,102.0,1174.0,506.0,3024.0,364.0,2099.0,1866.0,1.0,1650.0,156.0,1063.0,1153.0,271.0,1183.0,1426.0,2756.0,3094.0,1853.0,1114.0,1044.0,708.0,2371.0,1736.0,745.0,3581.0,469.0,413.0,1.0,4.0,2.0,1.0,943.0,893.0,220.0,1084.0,149.0,139.0,725.0,365.0,1659.0,236.0,985.0,418.0,107.0,710.0,636.0,1130.0,67.0,2061.0,4322.0,925.0,5874.0,1318.0,2385.0,101.0
2010.0,377.0,193.0,2159.0,1602.0,1401.0,1516.0,2044.0,273.0,3360.0,2453.0,92.0,747.0,2577.0,4678.0,270.0,501.0,353.0,2022.0,1228.0,5.0,2479.0,191.0,2229.0,390.0,622.0,2641.0,1764.0,4994.0,1381.0,441.0,93.0,1343.0,535.0,2970.0,403.0,2166.0,1965.0,,1505.0,168.0,1098.0,1413.0,274.0,1202.0,1493.0,2816.0,3080.0,2004.0,1105.0,1127.0,784.0,2602.0,1842.0,819.0,4079.0,456.0,402.0,2.0,6.0,3.0,,786.0,987.0,175.0,1269.0,174.0,159.0,808.0,322.0,1695.0,297.0,1104.0,505.0,105.0,600.0,656.0,1166.0,59.0,2423.0,4726.0,890.0,7014.0,1319.0,2325.0,117.0
2011.0,309.0,184.0,2211.0,1432.0,1223.0,1350.0,1714.0,263.0,2917.0,2135.0,138.0,817.0,2516.0,4118.0,254.0,455.0,302.0,1821.0,1114.0,4.0,1734.0,170.0,2010.0,299.0,631.0,2788.0,1804.0,4767.0,1237.0,480.0,75.0,1097.0,385.0,2683.0,374.0,1856.0,1762.0,,1207.0,140.0,1028.0,1287.0,331.0,1217.0,1452.0,2680.0,2658.0,1926.0,1026.0,1102.0,708.0,2364.0,1749.0,682.0,3678.0,449.0,365.0,2.0,4.0,3.0,,514.0,762.0,180.0,1194.0,145.0,120.0,773.0,339.0,1517.0,254.0,1071.0,376.0,90.0,513.0,693.0,1174.0,63.0,2526.0,4517.0,776.0,5041.0,1249.0,2152.0,96.0
2012.0,380.0,158.0,1995.0,1538.0,1061.0,1182.0,1568.0,233.0,2753.0,1669.0,120.0,750.0,1933.0,4182.0,279.0,447.0,296.0,1695.0,1009.0,10.0,1689.0,205.0,1717.0,306.0,683.0,3027.0,1624.0,4794.0,1115.0,396.0,83.0,1180.0,348.0,2946.0,316.0,1768.0,1618.0,,1108.0,160.0,949.0,1124.0,284.0,1108.0,1562.0,2552.0,2102.0,1940.0,1044.0,926.0,567.0,2325.0,1641.0,818.0,3382.0,437.0,381.0,3.0,16.0,7.0,,407.0,779.0,176.0,1126.0,201.0,118.0,703.0,310.0,1541.0,258.0,963.0,350.0,118.0,512.0,613.0,1002.0,76.0,2258.0,3862.0,785.0,3776.0,1272.0,1886.0,105.0
2013.0,372.0,193.0,1999.0,1561.0,1084.0,1161.0,1740.0,244.0,3126.0,1719.0,184.0,691.0,1834.0,4500.0,283.0,439.0,245.0,1915.0,922.0,14.0,1584.0,181.0,1914.0,357.0,901.0,2434.0,1699.0,5136.0,963.0,398.0,111.0,1195.0,408.0,3180.0,357.0,1707.0,1692.0,,1043.0,196.0,1031.0,1221.0,246.0,1203.0,1662.0,2858.0,2317.0,2132.0,967.0,870.0,617.0,2443.0,1737.0,676.0,3448.0,407.0,346.0,,11.0,,,371.0,843.0,192.0,1257.0,194.0,128.0,674.0,331.0,1576.0,263.0,916.0,366.0,111.0,459.0,524.0,946.0,46.0,2437.0,3448.0,756.0,3519.0,1394.0,1980.0,116.0
2014.0,283.0,172.0,2075.0,1607.0,1101.0,1205.0,1931.0,264.0,3520.0,1715.0,126.0,697.0,2192.0,4920.0,256.0,416.0,312.0,2131.0,1049.0,9.0,1481.0,198.0,2128.0,330.0,611.0,3006.0,1787.0,5491.0,1105.0,440.0,106.0,1332.0,400.0,3243.0,335.0,1920.0,1832.0,,1228.0,179.0,1110.0,1404.0,282.0,1169.0,1707.0,3213.0,2501.0,2135.0,1057.0,988.0,658.0,2784.0,2015.0,718.0,3728.0,494.0,374.0,,4.0,13.0,,349.0,769.0,173.0,1352.0,171.0,118.0,678.0,284.0,1788.0,303.0,922.0,356.0,102.0,524.0,568.0,927.0,56.0,2575.0,3765.0,927.0,4243.0,1458.0,2091.0,128.0
2015.0,340.0,179.0,2077.0,1631.0,1125.0,1243.0,2094.0,266.0,3452.0,1791.0,118.0,632.0,2351.0,5387.0,289.0,479.0,328.0,2198.0,984.0,16.0,1429.0,200.0,2084.0,305.0,634.0,2615.0,1866.0,5460.0,1161.0,453.0,72.0,1370.0,430.0,3544.0,376.0,2082.0,1933.0,,1217.0,156.0,1030.0,1573.0,245.0,1280.0,1781.0,3133.0,2710.0,2324.0,1078.0,959.0,687.0,2975.0,1879.0,778.0,3807.0,430.0,390.0,,7.0,3.0,,390.0,795.0,186.0,1341.0,165.0,128.0,687.0,300.0,1820.0,296.0,1054.0,411.0,95.0,498.0,523.0,988.0,55.0,2875.0,3983.0,982.0,4664.0,1372.0,2196.0,101.0
2016.0,321.0,166.0,1955.0,1838.0,1235.0,1294.0,2115.0,257.0,3230.0,1612.0,119.0,230.0,2525.0,6077.0,239.0,475.0,301.0,2032.0,1090.0,11.0,1412.0,188.0,2121.0,298.0,616.0,2389.0,1711.0,5646.0,1145.0,868.0,80.0,1404.0,407.0,3872.0,357.0,2140.0,2088.0,1.0,1130.0,140.0,1052.0,1564.0,303.0,1237.0,1723.0,3289.0,3088.0,2351.0,1060.0,1214.0,724.0,2870.0,1898.0,738.0,4082.0,414.0,362.0,,9.0,4.0,,341.0,873.0,209.0,1329.0,179.0,125.0,720.0,301.0,2047.0,267.0,1019.0,364.0,89.0,506.0,566.0,985.0,61.0,2749.0,3639.0,991.0,5523.0,1318.0,2310.0,105.0
2017.0,359.0,174.0,1984.0,1679.0,1388.0,1458.0,2050.0,330.0,3034.0,1761.0,107.0,221.0,3121.0,6512.0,189.0,476.0,308.0,1976.0,1218.0,13.0,1582.0,153.0,2274.0,296.0,592.0,2401.0,1707.0,5776.0,1175.0,988.0,72.0,1327.0,314.0,4631.0,376.0,2347.0,1997.0,5.0,1021.0,174.0,1178.0,1747.0,272.0,1217.0,1467.0,3402.0,3308.0,2098.0,1090.0,1111.0,689.0,2899.0,1859.0,789.0,3972.0,451.0,344.0,2.0,12.0,5.0,,437.0,847.0,179.0,1283.0,167.0,131.0,742.0,294.0,1851.0,305.0,1154.0,393.0,101.0,542.0,592.0,997.0,67.0,2496.0,3377.0,1009.0,6615.0,1425.0,2276.0,124.0
2018.0,373.0,184.0,2255.0,1694.0,1498.0,1645.0,2021.0,307.0,2705.0,1638.0,115.0,401.0,2581.0,6512.0,177.0,480.0,317.0,1807.0,1231.0,7.0,1546.0,191.0,2082.0,303.0,641.0,2321.0,1629.0,6074.0,1149.0,778.0,81.0,1345.0,306.0,4799.0,405.0,2261.0,1869.0,1.0,1217.0,189.0,1190.0,1653.0,314.0,1212.0,1478.0,3372.0,3720.0,1964.0,1117.0,1331.0,669.0,2866.0,1894.0,877.0,4067.0,527.0,333.0,1.0,15.0,3.0,,386.0,827.0,191.0,1213.0,174.0,122.0,731.0,310.0,1857.0,330.0,1337.0,371.0,86.0,521.0,554.0,1006.0,73.0,2341.0,3455.0,1059.0,6939.0,1409.0,2153.0,104.0


In [76]:
# Need to still clean the site_street_name and the site_town_suburb
planning_df['site_street_name']

0           ROBERTS STREET
1              WARREN ROAD
2           - JANET STREET
3          - WAVERLEY ROAD
4            - HALL STREET
                ...       
1621182     SANGSTERS ROAD
1621183     Dandenong Road
1621184     Treasury Place
1621185    Lower Esplanade
1621186                NaN
Name: site_street_name, Length: 1621187, dtype: object

In [77]:
planning_df['site_street_name'] = planning_df['site_street_name'].str.replace('- ', '').str.strip().str.upper()

In [78]:
with open('street_type_map.json', 'r') as json_file:
    loaded_street_types = json.load(json_file)

In [79]:
# Function to replace street types
def replace_street_type(street):
    if pd.isna(street):
        return street

    words = street.split()
    last_word = words[-1]
    
    # Check if the last word is a street type shortcode
    if last_word in loaded_street_types:
        words[-1] = loaded_street_types[last_word]
    
    return ' '.join(words)

# Apply the function to the 'street' column
planning_df['site_street_name'] = planning_df['site_street_name'].apply(replace_street_type)

# Display the DataFrame
planning_df

Unnamed: 0,permit_stage_number,permit_date,basis_month_y,basis_month_m,reported_levy_amount,calculated_levy_amount,reported_cost_of_works,site_street_name,site_town_suburb,site_postcode,site_municipality,municipal_full_name,region,sub_region,sub_region1,allotment_area,builder_town_suburb,builder_state,builder_postcode,floor_material,frame_material,roof_cladding_material,external_wall_material,number_of_existing_dwellings,number_of_new_dwellings,number_of_storeys,number_of_dwellings_demolished,total_floor_area,building_permit_application_date,dbdrv_amount,solar_hot_water_indicator,rainwater_tank_indicator,total_estimated_cost_of_works,basis_building_use,basis_now,basis_bca,basis_ownership_sector,basis_owner_builder,original_levy_paid
0,1.0,2009-01-21,2009.0,1.0,0.0,76.8,60000,ROBERTS STREET,ESSENDON,,Moonee Valley,"Moonee Valley, City of",Metropolitan,Inner Melbourne,Inner West,507.0,PRAHRAN,VIC,,20.0,40.0,60.0,40.0,1.0,1.0,2.0,0.0,288.0,2008-12-17,38.4,0.0,0.0,636752.0,Domestic,,1A,P,0.0,
1,0.0,2009-01-25,2009.0,1.0,19.2,19.2,15000,WARREN ROAD,MORDIALLOC,,Kingston,"Kingston, City of",Metropolitan,Outer Melbourne,South Eastern,900.0,RESERVOIR,VIC,,20.0,40.0,10.0,12.0,1.0,0.0,0.0,1.0,0.0,2009-01-24,9.6,0.0,0.0,15000.0,Domestic,,1A,S,0.0,
2,0.0,2009-01-31,2009.0,1.0,98.4,98.4,76875,JANET STREET,BLACKBURN,,Whitehorse,"Whitehorse, City of",Metropolitan,Inner Melbourne,Mid East,0.0,PRESTON,VIC,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2008-11-19,49.2,0.0,0.0,76875.0,Public Buildings,,9B,S,,
3,0.0,2009-01-31,2009.0,1.0,128.0,128.0,100000,WAVERLEY ROAD,MALVERN EAST,,Stonnington,"Stonnington, City of",Metropolitan,Inner Melbourne,Inner East,0.0,TULLAMARINE,VIC,,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,165.0,2009-01-18,64.0,0.0,0.0,0.0,Retail,,6A,P,,
4,0.0,2009-01-23,2009.0,1.0,108.8,108.8,85000,HALL STREET,MOONEE PONDS,,Moonee Valley,"Moonee Valley, City of",Metropolitan,Inner Melbourne,Inner West,0.0,ASHBURTON,VIC,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2009-01-21,54.4,0.0,0.0,85000.0,Retail,,6A,P,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1621182,0.0,2024-01-14,2024.0,1.0,,,22100,SANGSTERS ROAD,WODONGA,,Wodonga,"Wodonga, Rural City of",Rural,North East,,51382.0,STAGHORN FLAT,VIC,,20.0,60.0,60.0,60.0,0.0,0.0,0.0,0.0,31.0,2023-12-14,,0.0,0.0,22100.0,Domestic,,10a,N,0.0,0.0
1621183,0.0,2024-01-27,2024.0,1.0,,,35000,DANDENONG ROAD,Chadstone,,Monash,"Monash, City of",Metropolitan,Inner Melbourne,Mid East,0.0,Burleigh Heads,QLD,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-01-21,,0.0,0.0,35000.0,Retail,,6,P,,0.0
1621184,0.0,2024-01-18,2024.0,1.0,,,3239428,TREASURY PLACE,East Melbourne,,Melbourne,"Melbourne, City of",Metropolitan,Inner Melbourne,Melbourne,,North Melbourne,VIC,,0.0,0.0,0.0,0.0,,,,,,2024-01-17,,0.0,0.0,3239428.0,Commercial,,5,N,,0.0
1621185,3.0,2024-01-24,2024.0,1.0,,,7600000,LOWER ESPLANADE,St Kilda,,Port Phillip,"Port Phillip, City of",Metropolitan,Inner Melbourne,Central Bay,11880.0,Warrandyte South,VIC,,80.0,60.0,60.0,80.0,0.0,0.0,3.0,0.0,2392.0,2023-09-28,,0.0,0.0,10000000.0,Public Buildings,,9b,N,,0.0


In [80]:
i = 40000
planning_df['site_street_name'][i:i+30]

40000             CEDAR STREET
40001             CEDAR STREET
40002             CEDAR STREET
40003          CEDARWOOD COURT
40004          CEDARWOOD DRIVE
40005          CEDARWOOD DRIVE
40006        CELEBRATION DRIVE
40007        CELEBRATION DRIVE
40008            CEMETERY LANE
40009            CEMETERY ROAD
40010            CEMETERY ROAD
40011          CENTENARY DRIVE
40012         CENTENARY STREET
40013           CENTRAL AVENUE
40014        CENTRAL BOULEVARD
40015      CENTRAL KIALLA ROAD
40016      CENTRAL PARK AVENUE
40017             CENTRAL ROAD
40018           CENTRAL STREET
40019    CENTRE DANDENONG ROAD
40020    CENTRE DANDENONG ROAD
40021              CENTRE ROAD
40022              CENTRE ROAD
40023              CENTRE ROAD
40024              CENTRE ROAD
40025              CENTRE ROAD
40026              CENTRE ROAD
40027              CENTRE ROAD
40028              CENTRE ROAD
40029              CENTRE ROAD
Name: site_street_name, dtype: object

In [81]:
# Some annoying streets with numbers but mostly pre-2019 so we won't worry
numbers_in_street_name = planning_df[planning_df['site_street_name'].str.contains(r'\d', na=False)]
numbers_in_street_name['basis_month_y'].value_counts()

basis_month_y
2010.0    753
2012.0    729
2011.0    727
2013.0    722
2014.0    684
2018.0    302
2017.0     19
2022.0     10
2021.0      7
2023.0      6
2020.0      5
2019.0      2
2015.0      1
2016.0      1
2024.0      1
Name: count, dtype: int64

In [82]:
no_nums_or_letters_street_df = planning_df[planning_df['site_street_name'].str.contains(r'[^a-zA-Z0-9\s-]', na=False)]
no_nums_or_letters_street_df['basis_month_y'].value_counts()

basis_month_y
2010.0    3057
2011.0    2926
2014.0    2447
2013.0    2439
2012.0    2438
2009.0    2363
2018.0    2252
2015.0    2238
2016.0    2142
2017.0    1845
2021.0    1548
2019.0    1468
2020.0    1416
2022.0    1034
2023.0     770
2024.0      30
Name: count, dtype: int64

In [83]:
no_nums_or_letters_street_df

Unnamed: 0,permit_stage_number,permit_date,basis_month_y,basis_month_m,reported_levy_amount,calculated_levy_amount,reported_cost_of_works,site_street_name,site_town_suburb,site_postcode,site_municipality,municipal_full_name,region,sub_region,sub_region1,allotment_area,builder_town_suburb,builder_state,builder_postcode,floor_material,frame_material,roof_cladding_material,external_wall_material,number_of_existing_dwellings,number_of_new_dwellings,number_of_storeys,number_of_dwellings_demolished,total_floor_area,building_permit_application_date,dbdrv_amount,solar_hot_water_indicator,rainwater_tank_indicator,total_estimated_cost_of_works,basis_building_use,basis_now,basis_bca,basis_ownership_sector,basis_owner_builder,original_levy_paid
27,0.0,2009-02-01,2009.0,1.0,709.40,709.40,554240,A'BECKETT ROAD,NARRE WARREN,,Casey,"Casey, City of",Metropolitan,Outer Melbourne,South Eastern,0.0,SOUTH MELBOURNE,VIC,,20.0,60.0,10.0,12.0,0.0,1.0,2.0,0.0,777.0,2009-01-21,354.7136,1.0,0.0,554240.0,Domestic,,1A,P,0.0,
28,0.0,2008-12-13,2009.0,1.0,0.00,0.00,5900,A'BECKETT STREET,RUSHWORTH,,Campaspe,"Campaspe, Shire of",Rural,North Central,,0.0,MURCHISON,VIC,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2008-12-11,0.0000,0.0,0.0,0.0,Domestic,,1AI,P,0.0,
29,0.0,2009-02-01,2009.0,1.0,2304.00,2304.00,1800000,A'BECKETT STREET,INVERLOCH,,Bass Coast,"Bass Coast, Shire of",Rural,Gippsland,,3709.0,CLAYTON SOUTH,VIC,,20.0,60.0,60.0,30.0,0.0,0.0,1.0,0.0,1039.0,2008-08-13,1152.0001,0.0,0.0,1800000.0,Public Buildings,,9B,L,,
30,1.0,2009-01-09,2009.0,1.0,89.60,89.60,70000,A'BECKETT STREET,MELBOURNE,,Melbourne,"Melbourne, City of",Metropolitan,Inner Melbourne,Melbourne,0.0,PORT MELBOURNE,VIC,,12.0,60.0,60.0,11.0,0.0,0.0,2.0,0.0,0.0,2008-12-20,44.8000,0.0,0.0,53500000.0,Residential,,,P,,
37,0.0,2009-01-17,2009.0,1.0,355.05,355.05,277384,A'BROWNES ROAD,TARWIN,,South Gippsland,"South Gippsland, Shire of",Rural,Gippsland,,3.0,INVERLOCH,VIC,,20.0,40.0,60.0,12.0,0.0,1.0,1.0,0.0,321.0,2009-01-14,177.5258,1.0,1.0,277384.0,Domestic,,1AI,P,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620328,2.0,2024-01-14,2024.0,1.0,,,1099357,"EDGECOMBE ROAD,",Kyneton,,Macedon Ranges,"Macedon Ranges, Shire of",Rural,North Central,,3675.0,Alfredton,VIC,,20.0,60.0,60.0,20.0,0.0,0.0,1.0,0.0,1846.0,2023-05-25,703.5900,0.0,1.0,1099357.0,Commercial,,7b,P,,1407.18
1620347,0.0,2024-01-14,2024.0,1.0,,,1133489,E. GIBBONS ROAD,Hamilton,,Southern Grampians,"Southern Grampians, Shire of",Rural,South West,,13702.0,Hamilton,VIC,,20.0,40.0,60.0,40.0,0.0,1.0,1.0,0.0,349.0,2023-12-03,725.4300,0.0,1.0,1133489.0,Domestic,,1a(a),P,0.0,1450.86
1620379,0.0,2024-01-26,2024.0,1.0,,,1207750,BRADSHAW STREET (AKA MAGPIE STREET),Golden Point,,Ballarat,"Ballarat, City of",Rural,North West,,257672.0,Cardigan,VIC,,20.0,40.0,60.0,60.0,0.0,0.0,1.0,0.0,118.0,2024-01-05,772.9600,0.0,0.0,1207750.0,Retail,,6a,S,,1545.92
1620636,4.0,2024-01-10,2024.0,1.0,,,4080861,"LEICESTER STREET,",Melbourne,,Melbourne,"Melbourne, City of",Metropolitan,Inner Melbourne,Melbourne,242.0,Melbourne,VIC,,,,,,0.0,0.0,,0.0,,2023-11-24,2611.7500,0.0,0.0,24094233.0,Public Buildings,,9b,P,,5223.50


In [84]:
planning_df['site_street_name'] = [re.sub(',$', '', str(c).split(' (')[0]).replace('.', '') if pd.notna(c) else c for c in planning_df['site_street_name']]

In [85]:
no_nums_or_letters_suburb_df = planning_df[planning_df['site_town_suburb'].str.contains(r'[^a-zA-Z0-9\s]', na=False)]

# Not worth worrying about for now but will create a small systematic bias
no_nums_or_letters_suburb_df['basis_month_y'].value_counts()

basis_month_y
2022.0    60
2021.0    52
2023.0    34
2020.0    32
2019.0     9
2011.0     7
2014.0     4
2017.0     3
2009.0     2
2012.0     2
2013.0     2
2010.0     1
2015.0     1
2016.0     1
2018.0     1
Name: count, dtype: int64

In [86]:
# Can clean these up if needed
planning_df['site_town_suburb'].value_counts()

site_town_suburb
MELBOURNE          21476
POINT COOK         16439
TARNEIT            13867
PAKENHAM           12969
CRAIGIEBURN        12920
                   ...  
Yarravilee West        1
Willams Landing        1
Diggest Rest           1
BEACOSNFIELD           1
Murphys Creek          1
Name: count, Length: 6770, dtype: int64

In [87]:
planning_df['site_town_suburb'] = planning_df['site_town_suburb'].str.upper()

In [90]:
planning_df.to_parquet('data/planning_df.parquet')