In [1]:
import re
import numpy as np
import pandas as pd
import re
import geopandas as gpd
import os



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
spark = (
    SparkSession.builder.appName("MAST30034 Project 2 Preprocessing")
    .config("spark.driver.memory", '4g')
    .config("spark.executor.memory", '8g')
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.sql.parquet.enableVectorizedReader","false")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.parquet.writeLegacyFormat", 'true')
    .getOrCreate()
)

22/10/05 16:36:25 WARN Utils: Your hostname, DESKTOP-3NQ3PQI resolves to a loopback address: 127.0.1.1; using 172.17.23.167 instead (on interface eth0)
22/10/05 16:36:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/05 16:36:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Mapping Postcode to ABS Postal Areas

In [3]:
def postcode_to_str(col):
    return col.astype(str).str.zfill(4)

In [None]:
# Read in data
postal_areas_gdf = gpd.read_file('../data/raw/postcodes/abs_postal_areas.zip')
consumer_details_df = pd.read_parquet('../data/curated/cleaned_consumers.parquet')
postcode_df = pd.read_csv('../data/raw/postcodes/postcodes.csv').drop_duplicates('postcode')

consumer_details_df['postcode'] = postcode_to_str(consumer_details_df['postcode'])
postcode_df['postcode'] = postcode_to_str(postcode_df['postcode'])

# Convert postcode dataframe to geodataframe
postcode_gdf = gpd.GeoDataFrame(
    postcode_df, geometry=gpd.points_from_xy(postcode_df['long'], postcode_df['lat'])
)
postcode_gdf.crs = postal_areas_gdf.crs

# Get list of postcodes not listed as abs postal areas and filter geodataframe to just these postcodes
unmapped = consumer_details_df[~consumer_details_df['postcode'].astype(str).str.zfill(4).isin(postal_areas_gdf['POA_CODE21'])]['postcode'].unique()
postcodes_gdf = postcode_gdf[postcode_gdf['postcode'].isin(unmapped)]

# Spatially join unmapped postcodes and abs postal areas
postcode_poa_gdf = postcodes_gdf.sjoin(postal_areas_gdf, how = 'inner')

# Remove and rename columns 
postcode_poa_df = postcode_poa_gdf[['postcode', 'POA_CODE21']]
postcode_poa_df = postcode_poa_df.rename(columns = {'POA_CODE21' : 'poa'})

# Combine abs mapped postcodes with unmapped postcodes
postcode_poa_df = pd.concat([postcode_poa_df, postal_areas_gdf[['POA_CODE21', 'POA_CODE21']].set_axis(['postcode', 'poa'], axis = 1)], ignore_index = True).reset_index(drop = True)

In [None]:
output_dir = '../data/curated/census/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

postcode_poa_df.to_parquet(output_dir + 'postcode_poa.parquet', index = False)

All but 2 postcodes could be mapped to abs postal areas. Niether of these could be found in the Australia post website. https://postcodes-australia.com/postcodes/6958 says 6958 is a Western Australian postcode reserved for non standard use<br><br>

In [None]:
removed_consumers = consumer_details_df[~consumer_details_df['postcode'].astype(str).str.zfill(4).isin(postcode_poa_df['postcode'])]
len(removed_consumers), removed_consumers['postcode'].unique()

# Age/Gender Weighting

Read in data

In [17]:

age_df = pd.read_parquet(
    '../data/curated/census/age_data.parquet'
)

In [8]:
age_df

Unnamed: 0,age_yr_0_m,age_yr_0_f,age_yr_0_p,age_yr_1_m,age_yr_1_f,age_yr_1_p,age_yr_2_m,age_yr_2_f,age_yr_2_p,age_yr_3_m,...,age_yr_95_99_m,age_yr_95_99_f,age_yr_95_99_p,age_yr_100_yr_over_m,age_yr_100_yr_over_f,age_yr_100_yr_over_p,tot_m,tot_f,tot_p,poa
0,93,80,175,76,83,157,66,68,136,60,...,7,8,10,0,0,0,14223,13713,27936,2000
1,19,27,52,24,23,45,26,14,42,26,...,0,5,4,0,0,0,3763,3644,7410,2007
2,35,25,62,23,19,43,22,15,36,18,...,0,3,3,0,0,0,5315,5083,10400,2008
3,72,63,138,72,54,128,66,69,136,53,...,0,3,6,0,0,0,6391,6267,12658,2009
4,88,82,167,64,70,138,69,42,111,55,...,5,19,28,0,0,0,15356,11082,26443,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2638,6,6,15,13,6,20,11,5,15,7,...,0,0,0,0,0,0,1060,1130,2188,2899
2639,7,5,13,11,3,14,9,7,17,12,...,0,0,0,0,0,0,1006,685,1692,6798
2640,5,0,4,3,0,8,4,10,10,6,...,0,0,0,0,0,0,302,292,593,6799
2641,189,172,363,216,198,422,259,182,443,238,...,11,12,16,0,0,0,28851,23214,52069,9494


Generate population by age intervals (18-24, 25-34, 35-44, 45-54, 55-64, 65+) at POA and national level

In [18]:
cols = []
for start_yr, end_yr in zip([18,25,35,45,55], [25,35,45,55,65]):
    for g in ['m', 'f', 'p']:
        col = f'age_{start_yr}_{end_yr - 1}_{g}'
        cols.append(col)
        age_df[col] = age_df.filter(regex = '|'.join([f'(age_yr_{x}_{g})'for x in range(start_yr,end_yr)])).astype(int).sum(axis = 1)


for g in ['m', 'f', 'p']:
    col = f'age_65+_{g}'
    cols.append(col)
    age_df[col] = age_df.filter(regex = '|'.join([f'(age_yr_{x}_{x+4}_{g})'for x in range(65,100, 5)] + ['age_yr_100_yr_over_[mf]'])).astype(int).sum(axis = 1)


age_df = age_df[['poa'] + cols].melt(id_vars = 'poa')

def get_gender(variable):
    g = variable[-1]
    if g == 'm':
        return 'Male'
    if g == 'f':
        return 'Female'
    if g == 'p':
        return 'Person'


age_df['gender'] = age_df['variable'].apply(get_gender)
age_df['variable'] = age_df['variable'].apply(lambda x : x[:-2])
age_df = pd.pivot_table(age_df, values = 'value', index =['poa', 'gender'], columns='variable')


idx = pd.IndexSlice
aus_age = age_df.loc[idx[:, 'Person'], :].sum()
aus_age_m = age_df.loc[idx[:, 'Male'], :].sum()
aus_age_f = age_df.loc[idx[:, 'Female'], :].sum()

aus_prob_m = aus_age_m.sum()/(aus_age.sum())

aus_age *= 1/aus_age.sum()
aus_age_m *= 1/aus_age_m.sum()
aus_age_f *= 1/aus_age_f.sum()

age_df = age_df.apply(lambda x : x/x.sum(), axis = 1)
age_df = age_df.fillna(age_df.mean())


In [19]:
age_df

Unnamed: 0_level_0,variable,age_18_24,age_25_34,age_35_44,age_45_54,age_55_64,age_65+
poa,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0800,Female,0.117745,0.395264,0.201801,0.110740,0.108072,0.066378
0800,Male,0.110508,0.350942,0.198172,0.141062,0.113649,0.085665
0800,Person,0.112971,0.373315,0.197737,0.128467,0.110491,0.077018
0810,Female,0.096834,0.256085,0.213678,0.163655,0.121771,0.147977
0810,Male,0.109156,0.240813,0.212239,0.164513,0.128309,0.144970
...,...,...,...,...,...,...,...
9494,Male,0.091518,0.198472,0.196818,0.181263,0.169843,0.162086
9494,Person,0.103749,0.216115,0.190244,0.169565,0.170338,0.149990
9797,Female,0.269231,0.307692,0.134615,0.115385,0.144231,0.028846
9797,Male,0.060686,0.215040,0.335092,0.237467,0.124011,0.027704


In [10]:
age_df.reset_index().to_parquet('../data/curated/census/age_proportions.parquet')

Caculates weights for each combination of gender and postcode

In [20]:
# Constants for probability calculations
prob_bnpl = 0.05

prob_female_g_bnpl = 0.57
prob_male_g_bnpl = 0.43
prob_age_g_bnpl = pd.Series(data = {'age_18_24' : 0.26, 'age_25_34' : 0.35, 'age_35_44' : 0.2, 'age_45_54' : 0.12, 'age_55_64' : 0.04,'age_65+' : 0.01})

def get_prob_bnpl(row):
    if row.name[1] == 'Male':
        return (row*prob_age_g_bnpl*prob_male_g_bnpl*prob_bnpl/aus_age_m/aus_prob_m).sum()
    if row.name[1] == 'Female':
        return (row*prob_age_g_bnpl*prob_female_g_bnpl*prob_bnpl/aus_age_f/(1-aus_prob_m)).sum()
    if row.name[1] == 'Person':
        return (row*prob_age_g_bnpl*prob_bnpl/aus_age).sum()

age_df['weight'] = age_df.apply(get_prob_bnpl, axis = 1)
age_df = age_df[['weight']]


In [21]:
age_df

Unnamed: 0_level_0,variable,weight
poa,gender,Unnamed: 2_level_1
0800,Female,0.078522
0800,Male,0.056507
0800,Person,0.067098
0810,Female,0.063853
0810,Male,0.048913
...,...,...
9494,Male,0.043877
9494,Person,0.052927
9797,Female,0.086617
9797,Male,0.050038


In [13]:
prob_age_g_bnpl*prob_bnpl/aus_age

age_18_24    0.120315
age_25_34    0.096074
age_35_44    0.057053
age_45_54    0.036768
age_55_64    0.013220
age_65+      0.002272
dtype: float64

In [174]:
px.histogram(age_df.loc[idx[:, 'Female'], :], x = 'weight')

In [175]:
px.histogram(age_df.loc[idx[:, 'Male'], :], x = 'weight')

Transform index and write to disk

In [111]:
age_df = age_df.reset_index()
age_df['gender'] = age_df['gender'].apply(lambda x : 'Undisclosed' if x == 'Person' else x)

age_df.to_parquet('../data/curated/demographic_weights.parquet', index = False)