# The following code is for external data processing

In [3]:
from pyspark.sql import SparkSession
import re
from pyspark.sql import functions as F
import numpy as np
import pandas as pd
import re

# Build (or reuse) the Spark session used by the rest of the notebook.
# Settings are applied in the order listed; names and values are unchanged.
spark_builder = SparkSession.builder.appName("MAST30034 Project 2 Preprocessing")

spark_settings = [
    ("spark.driver.memory", '4g'),
    ("spark.executor.memory", '8g'),
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ("spark.sql.parquet.enableVectorizedReader", "false"),
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.parquet.writeLegacyFormat", 'true'),
]
for setting_name, setting_value in spark_settings:
    spark_builder = spark_builder.config(setting_name, setting_value)

spark = spark_builder.getOrCreate()

22/09/13 19:51:36 WARN Utils: Your hostname, DESKTOP-3NQ3PQI resolves to a loopback address: 127.0.1.1; using 172.24.53.113 instead (on interface eth0)
22/09/13 19:51:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/13 19:51:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Median Age

In [4]:
# Transaction snapshots, listed in the order they are stacked on top of each other.
transaction_snapshot_paths = [
    '../data/tables/transactions_20210228_20210827_snapshot/',
    '../data/tables/transactions_20210828_20220227_snapshot/',
    '../data/tables/transactions_20220228_20220828_snapshot/',
]

snapshot_frames = [
    spark.read.parquet(snapshot_path)
    for snapshot_path in transaction_snapshot_paths
]

# Positional union of the snapshots, first to last.
transactions_sdf = snapshot_frames[0]
for later_snapshot in snapshot_frames[1:]:
    transactions_sdf = transactions_sdf.union(later_snapshot)

# Table joined to transactions on user_id and to consumers on consumer_id further down.
ids_sdf = spark.read.parquet('../data/tables/consumer_user_details.parquet')

# Consumer table: pipe-delimited CSV with a header row.
pipe_csv_reader = spark.read.options(header = True, delimiter = '|')
consumers_sdf = pipe_csv_reader.csv('../data/tables/tbl_consumer.csv')

# Curated census age table: CSV with a header row (no schema given, so values load as strings).
header_csv_reader = spark.read.options(header = True)
age_sdf = header_csv_reader.csv('../data/curated/census/age_data.csv')

                                                                                

In [3]:
# Persist a 1% sample of the transactions for faster iteration.
# - A fixed seed (arbitrary value) makes the sample, and everything computed
#   from it, reproducible between runs.
# - Overwrite mode lets this cell run again when the sample already exists;
#   the default save mode raises if the target path is present.
# NOTE(review): run this while transactions_sdf still points at the full
# snapshots; Spark refuses to overwrite a path it is currently reading from.
transactions_sdf.sample(
    fraction = 0.01,
    seed = 30034
).write.mode(
    'overwrite'
).parquet('../data/raw/samples/transaction_sample.parquet')

                                                                                

In [4]:
# Replace the full transaction table with the saved sample from the samples folder.
sample_reader = spark.read
transactions_sdf = sample_reader.parquet('../data/raw/samples/transaction_sample.parquet')

In [None]:

# Total number of transactions per merchant (denominator of propn).
merchant_sizes_sdf = transactions_sdf.groupBy(
    'merchant_abn'
).count().withColumnRenamed(
    'count',
    'size'
)

# One row per (merchant, postcode) holding propn: the share of the merchant's
# transactions made by consumers from that postcode. The numerator only counts
# transactions that survive the inner joins to ids_sdf and consumers_sdf.
merchant_postcode_sdf = transactions_sdf.join(
    ids_sdf,
    on = 'user_id'
).join(
    consumers_sdf,
    on = 'consumer_id'
).groupBy(
    'merchant_abn', 'postcode'
).count().join(
    merchant_sizes_sdf,
    on = 'merchant_abn'
).withColumn(
    'propn',
    F.col('count')/F.col('size')
).drop(
    'count',
    'size'
)

# Joins merchant and postcode data with abs data for population by age
# (inner join: postcodes that are missing from age_sdf drop out here).
merchant_age_sdf = merchant_postcode_sdf.join(
    age_sdf,
    on = 'postcode'
)

# Scaled version of each population metric, built as ONE projection.
# Adding the columns with withColumn inside a loop adds a projection per
# column and can produce very large query plans; a single select avoids that
# while yielding the same `<column>_scaled` columns in the same order.
scaled_columns = [
    (F.col(age_col)*F.col('propn')).alias(age_col+'_scaled')
    for age_col in age_sdf.columns
    if age_col != 'postcode'
]

# Keeps only merchant_abn and the scaled columns, then sums them per merchant:
# a weighted sum of each population metric by propn of customers per postcode.
merchants_sdf = merchant_age_sdf.select(
    'merchant_abn',
    *scaled_columns
).groupBy(
    'merchant_abn'
).sum()

merchants_df = merchants_sdf.toPandas()

# Renames columns and sets index.
# The aggregate columns come back as 'sum(<name>)'; [4:-1] strips the leading
# 'sum(' and the trailing ')'. The rename mapping is built from the columns
# before the drop/set_index; keys that no longer exist are ignored by rename.
merchants_df = merchants_df.drop(
    'sum(merchant_abn)',
    axis = 1
).set_index(
    'merchant_abn'
).rename(
    columns = {summed_name : summed_name[4:-1] for summed_name in merchants_df.columns}
)

In [15]:
def get_median_col(df):
    """Return, for every row, the label of the column that contains the median.

    Each row is treated as a histogram: a cell holds the count for its
    column's label, and columns are accumulated in their existing
    left-to-right order.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per group, one column per bin of counts.

    Returns
    -------
    pandas.Series
        Indexed like ``df``; each value is the first column label whose
        running total is strictly greater than half of the row total. When
        no column qualifies (e.g. an all-zero row) the first column label
        is returned.
    """
    half_total = df.sum(axis = 1)/2
    running_total = df.cumsum(axis = 1)

    # Compared one column at a time so that each column lines up with
    # half_total row by row.
    past_half = running_total.apply(
        lambda running_col : (running_col > half_total)
    )

    # On booleans, idxmax picks the first True in each row.
    return past_half.idxmax(
        axis = 1
    )

# Executes get_median_col for males (m), females (f) and persons (p), storing
# the result in merchants_df as median_age_m / median_age_f / median_age_p.
# Both patterns are raw strings so that `\d` reaches the regex engine as
# written instead of being an invalid Python string escape.
for person_type in ['m', 'f', 'p']:
    merchants_df[f'median_age_{person_type}'] = get_median_col(
        merchants_df.filter(
            # Single-year age columns plus the grouped 80_84 ... 100_yr_over columns
            # for this person type.
            regex = rf'age_yr_(\d+|(80_84)|(85_89)|(90_94)|(95_99)|(100_yr_over))_{person_type}_scaled',
            axis = 1
        )
    ).apply(
        # First run of digits in the column label; the value stays a string.
        lambda x : re.findall(r'\d+', x)[0]
    )


                                                                                

In [133]:
# NOTE(review): `df` is not assigned in any cell visible in this notebook, so
# this cell fails on a fresh kernel. The saved output shows only the
# age_yr_*_m_scaled columns indexed by merchant_abn - presumably a filtered
# view of merchants_df; confirm and either define it here or delete the cell.
df

Unnamed: 0_level_0,age_yr_0_m_scaled,age_yr_1_m_scaled,age_yr_2_m_scaled,age_yr_3_m_scaled,age_yr_4_m_scaled,age_yr_5_m_scaled,age_yr_6_m_scaled,age_yr_7_m_scaled,age_yr_8_m_scaled,age_yr_9_m_scaled,...,age_yr_75_m_scaled,age_yr_76_m_scaled,age_yr_77_m_scaled,age_yr_78_m_scaled,age_yr_79_m_scaled,age_yr_80_84_m_scaled,age_yr_85_89_m_scaled,age_yr_90_94_m_scaled,age_yr_95_99_m_scaled,age_yr_100_yr_over_m_scaled
merchant_abn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
83412691377,42.303030,43.212121,41.833333,43.166667,43.583333,44.280303,44.250000,45.742424,45.280303,45.098485,...,29.196970,26.143939,24.643939,20.166667,20.090909,77.500000,42.060606,18.113636,3.969697,0.212121
24406529929,59.078947,57.394737,58.447368,58.052632,55.842105,59.684211,58.894737,56.921053,57.236842,57.078947,...,29.815789,27.184211,23.894737,19.631579,21.684211,80.815789,44.263158,21.736842,5.684211,0.552632
73256306726,39.849057,42.792453,43.377358,43.490566,46.000000,47.188679,48.773585,46.962264,47.886792,46.603774,...,29.584906,28.283019,25.622642,21.396226,21.018868,77.433962,41.584906,18.924528,4.150943,0.226415
35344855546,27.956522,27.478261,30.000000,29.695652,32.391304,33.130435,32.956522,33.608696,34.000000,34.000000,...,19.173913,16.043478,15.434783,12.260870,13.391304,49.652174,29.260870,11.652174,2.913043,0.391304
38700038932,41.513514,42.189189,42.594595,42.702703,42.797297,45.729730,43.783784,46.283784,47.689189,47.675676,...,23.283784,20.945946,19.243243,16.905405,16.689189,61.932432,31.202703,14.094595,3.148649,0.243243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56030772622,7.000000,6.000000,4.000000,3.000000,9.000000,6.000000,3.000000,6.000000,3.000000,11.000000,...,3.000000,5.000000,5.000000,5.000000,4.000000,10.000000,3.000000,0.000000,0.000000,0.000000
41305045812,115.500000,112.000000,123.500000,129.000000,128.000000,118.000000,138.000000,136.000000,128.000000,135.500000,...,110.500000,116.500000,99.500000,93.000000,95.000000,335.000000,203.500000,90.000000,25.000000,0.000000
81906511933,36.500000,32.000000,31.000000,24.000000,31.500000,33.000000,33.000000,34.000000,32.500000,31.000000,...,26.000000,23.500000,16.500000,16.500000,18.000000,76.000000,62.000000,26.500000,10.000000,0.000000
72296826112,42.000000,44.333333,42.000000,34.333333,43.333333,47.000000,38.000000,34.333333,36.000000,34.333333,...,12.666667,11.666667,14.000000,9.333333,13.000000,27.333333,12.333333,3.333333,0.000000,0.000000


In [147]:
# Display the merchant table, which now includes the median_age_m/f/p columns.
merchants_df

Unnamed: 0_level_0,age_yr_0_m_scaled,age_yr_0_f_scaled,age_yr_0_p_scaled,age_yr_1_m_scaled,age_yr_1_f_scaled,age_yr_1_p_scaled,age_yr_2_m_scaled,age_yr_2_f_scaled,age_yr_2_p_scaled,age_yr_3_m_scaled,...,age_yr_95_99_p_scaled,age_yr_100_yr_over_m_scaled,age_yr_100_yr_over_f_scaled,age_yr_100_yr_over_p_scaled,tot_m_scaled,tot_f_scaled,tot_p_scaled,median_age_m,median_age_f,median_age_p
merchant_abn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
83412691377,42.303030,41.037879,83.242424,43.212121,39.272727,82.431818,41.833333,39.492424,81.325758,43.166667,...,13.992424,0.212121,1.340909,1.628788,3691.560606,3798.916667,7490.553030,38,40,39
24406529929,59.078947,54.552632,112.789474,57.394737,57.052632,114.236842,58.447368,52.526316,111.631579,58.052632,...,16.789474,0.552632,1.684211,2.263158,5081.710526,5219.000000,10300.657895,34,35,35
73256306726,39.849057,39.320755,79.169811,42.792453,42.207547,85.094340,43.377358,41.471698,85.094340,43.490566,...,14.452830,0.226415,1.301887,1.528302,3675.018868,3821.433962,7496.867925,39,41,40
35344855546,27.956522,27.000000,54.391304,27.478261,29.217391,56.913043,30.000000,28.130435,58.000000,29.695652,...,7.565217,0.391304,0.652174,1.086957,2437.217391,2504.782609,4941.173913,39,40,40
38700038932,41.513514,40.202703,81.405405,42.189189,39.391892,81.662162,42.594595,39.472973,82.337838,42.702703,...,11.216216,0.243243,1.081081,1.337838,3517.837838,3594.378378,7112.202703,37,39,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56030772622,7.000000,3.000000,4.000000,6.000000,3.000000,8.000000,4.000000,3.000000,12.000000,3.000000,...,0.000000,0.000000,0.000000,0.000000,433.000000,392.000000,825.000000,44,39,42
41305045812,115.500000,126.000000,243.500000,112.000000,109.500000,222.500000,123.500000,114.000000,237.500000,129.000000,...,84.500000,0.000000,4.000000,9.000000,11153.500000,12173.500000,23323.000000,43,46,44
81906511933,36.500000,24.000000,59.000000,32.000000,33.500000,62.500000,31.000000,26.000000,53.500000,24.000000,...,32.500000,0.000000,1.500000,1.500000,3722.500000,4015.500000,7739.000000,32,36,34
72296826112,42.000000,47.000000,89.333333,44.333333,42.000000,87.666667,42.000000,43.000000,82.333333,34.333333,...,2.000000,0.000000,0.000000,0.000000,2446.333333,2457.666667,4901.666667,34,34,34


In [None]:
# Load the raw income table (CSV with a header row) and shorten the name of
# its income column to "Income".
income_csv_reader = spark.read.option("header", "true")
external_sdf = income_csv_reader.csv(
    "../data/raw/external/income.csv"
).withColumnRenamed(
    "INCP Total Personal Income (weekly)",
    "Income"
)

In [None]:
# Display the income table after the column rename.
external_sdf

In [None]:
# Distinct values of the Income column; the order of the resulting list is not meaningful.
all_income_labels = external_sdf.select(F.collect_list("Income")).first()[0]
income_factors = list(set(all_income_labels))

In [None]:
# Display the distinct Income labels.
income_factors

As we can see, each location appears to have one row holding its total 'Count' (the row whose `Income` value is "Total"). We extract these rows into a separate dataset so that the totals are easy to look up.

In [None]:
# Rows whose Income label is "Total", kept as their own table.
is_total_row = F.col("Income") == "Total"
location_total = external_sdf.filter(is_total_row)

In [None]:
# Display the rows whose Income label is "Total".
location_total

In [None]:
# Drop the "Total" rows from the working table (external_sdf is rebound here).
not_total_row = F.col("Income") != "Total"
external_sdf = external_sdf.where(not_total_row)

# Use regular expression to find amount range

In [None]:
# Bring only the Income column over to pandas for the regex parsing below.
income_only_sdf = external_sdf.select('Income')
temp_df = income_only_sdf.toPandas()

In [None]:
# Pull the bracketed amount out of every Income label: either a
# "$low-$high" range or a "$amount or more" value. Labels without a
# bracketed amount are kept as they are.
#
# Both alternatives live inside ONE capture group. With the alternation
# outside the groups, "or more" labels matched the second group while the
# first (empty) group was the one being read, so they were stored as ''.
# The pattern is a raw string so the backslashes are regex escapes only.
output_col = []
pattern = r"\((\$\d*,?\d+-\$\d*,*\d*|\$\d*,?\d* or more)\)"

for income in temp_df["Income"]:
    matched = re.search(pattern, income)
    if matched:
        output_col.append(matched.group(1))
    else:
        output_col.append(income)

In [None]:
# Attach the parsed amounts next to the original labels.
temp_df = temp_df.assign(**{'Income Parsed' : output_col})

In [None]:
# Spot-check a single parsed value (position 14).
output_col[14]

In [None]:
# Turn the pandas frame (labels plus parsed amounts) back into a Spark DataFrame and display it.
test = spark.createDataFrame(data = temp_df)
test

Ben's preprocessing

In [None]:
# Read the raw income CSV with pandas and discard its last four rows.
raw_income_df = pd.read_csv('../data/raw/external/income.csv')
row_total = len(raw_income_df)
trailing_row_labels = range(row_total - 4, row_total)
income_df = raw_income_df.drop(index=trailing_row_labels)

In [None]:
# Shorter names for the region and income columns.
income_column_names = {
    "SA2 (UR)" : "Region",
    "INCP Total Personal Income (weekly)" : "Income"
}
income_df = income_df.rename(columns = income_column_names)

In [None]:
def convert_income(row):
    """Split the ``Income`` label of one row into weekly and yearly parts.

    A label shaped like ``<weekly> (<yearly>)`` is split into the text before
    the bracket and the text inside it. Every other label - the fixed
    categories listed below, any other text without a bracketed part, and
    missing (non-string) values - is copied unchanged into both fields.

    Parameters
    ----------
    row : mapping with an ``'Income'`` entry, e.g. a DataFrame row when used
        through ``DataFrame.apply(convert_income, axis=1)``. Modified in place.

    Returns
    -------
    The same ``row`` with ``'weekly_income'`` and ``'yearly_income'`` set.
    """
    plain_categories = ['Negative income', 'Nil income', 'Total', 'Not stated', 'Not applicable']
    income = row['Income']

    matched = None
    if isinstance(income, str) and income not in plain_categories:
        # Raw string: `\s`, `\(` and `\)` are regex escapes, not string escapes.
        matched = re.search(r'(.*)\s\((.*)\)', income)

    if matched is None:
        # No "<weekly> (<yearly>)" structure: keep the label as it is instead
        # of failing on an empty match list.
        row['weekly_income'] = income
        row['yearly_income'] = income
    else:
        row['weekly_income'], row['yearly_income'] = matched.groups()
    return row

In [None]:
# Run convert_income on every row, adding the weekly_income and yearly_income columns.
income_df = income_df.apply(convert_income, axis = 'columns')

    

In [None]:
# Display the income table with the split weekly/yearly columns.
income_df