In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F

# Spark settings applied to the session builder, in the order listed.
spark_settings = {
    "spark.driver.memory": '4g',
    "spark.executor.memory": '8g',
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.sql.parquet.enableVectorizedReader": "false",
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
}

# Name the application, apply every setting, then get (or create) the session.
session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

22/09/26 20:01:17 WARN Utils: Your hostname, DESKTOP-3NQ3PQI resolves to a loopback address: 127.0.1.1; using 172.31.182.220 instead (on interface eth0)
22/09/26 20:01:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/26 20:01:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/26 20:01:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Read in data

In [10]:
# Demographic weights and cleaned consumer records (curated inputs)
weights_sdf = spark.read.parquet(
    '../data/curated/demographic_weights.parquet'
)
consumers_sdf = spark.read.parquet(
    '../data/curated/cleaned_consumers.parquet'
)

# Lookup between user_id and consumer_id
user_details_sdf = spark.read.parquet(
    '../data/tables/consumer_user_details.parquet'
)

# Lookup between postcode and poa
postcode_poa_sdf = spark.read.parquet(
    '../data/curated/census/postcode_poa.parquet'
)

# Transactions: the sample file is read by default. Set USE_TRANSACTION_SAMPLE
# to False to read the three full snapshots instead; passing several paths to
# spark.read.parquet loads them into a single DataFrame.
USE_TRANSACTION_SAMPLE = True
TRANSACTION_SAMPLE_PATH = '../data/raw/samples/transaction_sample.parquet'
TRANSACTION_SNAPSHOT_PATHS = [
    '../data/tables/transactions_20210228_20210827_snapshot',
    '../data/tables/transactions_20210828_20220227_snapshot',
    '../data/tables/transactions_20220228_20220828_snapshot',
]

if USE_TRANSACTION_SAMPLE:
    transactions_sdf = spark.read.parquet(TRANSACTION_SAMPLE_PATH)
else:
    transactions_sdf = spark.read.parquet(*TRANSACTION_SNAPSHOT_PATHS)

In [11]:
# Preview one row of the postcode/poa lookup to check its column names
postcode_poa_sdf.limit(1)

postcode,poa
200,2601


In [7]:
# Preview one transaction row to check its column names
transactions_sdf.limit(1)

user_id,merchant_abn,dollar_value,order_id,order_datetime
14963,75089928159,157.395489968075,230f0253-31ac-43b...,2021-11-26


In [15]:
# Preview one row of the user_id/consumer_id lookup to check its column names
user_details_sdf.limit(1)

user_id,consumer_id
1,1195503


### Join transactional data with consumer data

In [12]:
# Only these consumer columns are carried onto the transactions.
consumer_demographics_sdf = consumers_sdf.select('consumer_id', 'postcode', 'gender')

# Left joins keep every transaction; rows with no match get nulls in the
# columns brought in by that join.
transactions_sdf = (
    transactions_sdf
    .join(user_details_sdf, on='user_id', how='left')
    .join(consumer_demographics_sdf, on='consumer_id', how='left')
    .join(postcode_poa_sdf, on='postcode', how='left')
    .join(weights_sdf, on=['poa', 'gender'], how='left')
)

Display the number of null values in each column (nulls arise where the left joins above found no match).

In [20]:
# One aggregate per column: count the rows in which that column is null.
null_count_columns = [
    F.count(F.when(F.col(column_name).isNull(), column_name))
    for column_name in transactions_sdf.columns
]
transactions_sdf.select(null_count_columns)

count(CASE WHEN (poa IS NULL) THEN poa END),count(CASE WHEN (gender IS NULL) THEN gender END),count(CASE WHEN (postcode IS NULL) THEN postcode END),count(CASE WHEN (consumer_id IS NULL) THEN consumer_id END),count(CASE WHEN (user_id IS NULL) THEN user_id END),count(CASE WHEN (merchant_abn IS NULL) THEN merchant_abn END),count(CASE WHEN (dollar_value IS NULL) THEN dollar_value END),count(CASE WHEN (order_id IS NULL) THEN order_id END),count(CASE WHEN (order_datetime IS NULL) THEN order_datetime END),count(CASE WHEN (weight IS NULL) THEN weight END)
449,345,345,0,0,0,0,0,0,449


In [9]:
# Join the user details to the transactions on user_id, then join the consumer
# table on consumer_id. No `how` is given, so both joins use join()'s default
# join type.
# NOTE(review): sdf_userdetails, sdf_transactions and sdf_consumer are not
# defined in any cell visible in this notebook (the cells above create
# user_details_sdf, transactions_sdf and consumers_sdf) — confirm where they
# come from, otherwise this cell raises NameError on a fresh kernel.
sdf_usertransaction = sdf_userdetails.join(sdf_transactions, on='user_id')
sdf_consumer_transaction = sdf_usertransaction.join(sdf_consumer, on='consumer_id')
# Show the first 5 rows of the joined result
sdf_consumer_transaction.show(5)

                                                                                

+-----------+-------+------------+------------------+--------------------+--------------+-----+--------+------+
|consumer_id|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|state|postcode|gender|
+-----------+-------+------------+------------------+--------------------+--------------+-----+--------+------+
|     651338|  18478| 62191208634|63.255848959735246|949a63c8-29f7-4ab...|    2021-08-20|  TAS|    7001|  Male|
|     179208|      2| 15549624934| 130.3505283105634|6a84c3cf-612a-457...|    2021-08-20|  NSW|    2782|Female|
|     467663|  18479| 64403598239|120.15860593212783|b10dcc33-e53f-425...|    2021-08-20|  TAS|    7010|Female|
|    1194530|      3| 60956456424| 136.6785200286976|0f09c5a5-784e-447...|    2021-08-20|   NT|    0862|Female|
|     467663|  18479| 94493496784| 72.96316578355305|f6c78c1a-4600-4c5...|    2021-08-20|  TAS|    7010|Female|
+-----------+-------+------------+------------------+--------------------+--------------+-----+--------+

In [10]:
# Check the row count of the joined DataFrame to confirm the joins behaved as expected
sdf_consumer_transaction.count()

                                                                                

14164820

In [11]:
# Print the column names, types and nullability of the joined DataFrame
sdf_consumer_transaction.printSchema()

root
 |-- consumer_id: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- gender: string (nullable = true)



### Look at gender

In [12]:
# Count the rows in each gender category
sdf_consumer_transaction.groupBy('gender').count()

                                                                                

gender,count
Undisclosed,1438927
Female,6296649
Male,6429244


Gender may be a promising feature that could be used further along the line, so we want to keep it for future reference. However, some people prefer not to disclose their gender, so an 'Undisclosed' category appears in the gender feature. The following code attempts to find whether customers recorded as 'Undisclosed' have provided their gender in any past records.

For the first set of data, there doesn't seem to be any overlap, so this analysis did not turn up anything useful.

In [13]:
# Count rows per state to check that only valid state codes appear
sdf_consumer_transaction.groupBy('state').count()

                                                                                

state,count
NT,202178
ACT,124454
SA,1612955
TAS,525947
WA,2238788
QLD,2100381
VIC,3278493
NSW,4081624


### Round dollar values to 2dp and filter to a valid range

In [14]:
# Inclusive bounds on the transaction amounts that are kept
min_value = 5
max_value = 10000

# Round first, then filter, so the bounds are tested against the rounded amount.
rounded_dollar_value = F.round(F.col('dollar_value'), 2)
sdf_consumer_transaction = (
    sdf_consumer_transaction
    .withColumn('dollar_value', rounded_dollar_value)
    .where(F.col('dollar_value').between(min_value, max_value))
)

### Check ABN validity

In [15]:
# Make sure ABN is valid; accepts an integer or a string

def validateABN(merchant_abn):
    """Return True when ``merchant_abn`` is written as exactly 11 digits.

    The value is converted with ``str()`` first, so integers and strings are
    both accepted. Only the format is checked (11 ASCII digits).

    Args:
        merchant_abn: the ABN to check, e.g. ``75089928159`` or ``"75089928159"``.

    Returns:
        bool: True if the string form is 11 ASCII digits, otherwise False.
    """
    str_abn = str(merchant_abn)

    # A length check alone would accept values such as -1234567890 or
    # "7508992815a", so every character must also be an ASCII digit.
    return len(str_abn) == 11 and str_abn.isascii() and str_abn.isdigit()

In [16]:
# Collect the distinct merchant ABNs to the driver for validation.
# De-duplicating first means one Row per distinct ABN is collected instead of
# one Row per transaction.

sdf_list = sdf_consumer_transaction.select("merchant_abn").distinct().collect()

                                                                                

In [17]:
# Find any merchants without a valid ABN

# Each ABN is converted to a string before validation, so invalidABN holds
# the string form of every ABN that validateABN rejects.
invalidABN = [
    abn
    for abn in (str(row['merchant_abn']) for row in sdf_list)
    if not validateABN(abn)
]

In [18]:
# Display the ABNs that failed validation; an empty list means none failed
invalidABN

[]

No invalid ABNs were found.

In [19]:
# Keep only transactions whose order date falls inside the range (both ends inclusive)
start_date = '2021-02-28'
end_date = '2022-08-28'

in_date_range = F.col('order_datetime').between(start_date, end_date)
sdf_consumer_transaction = sdf_consumer_transaction.where(in_date_range)

In [20]:
# Row count after the dollar-value and date-range filters
sdf_consumer_transaction.count()

                                                                                

11965964

In [21]:
# Write the cleaned transactions to parquet, replacing any existing output at this path
sdf_consumer_transaction.write.parquet(
    '../data/curated/cleaned_transactions.parquet',
    mode='overwrite',
)

                                                                                