In [1]:
from pyspark.sql import SparkSession

# Session settings, applied to the builder in the order listed.
spark_settings = [
    ("spark.driver.memory", '4g'),
    ("spark.executor.memory", '8g'),
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ("spark.sql.parquet.enableVectorizedReader", "false"),
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
]

builder = SparkSession.builder.appName("MAST30034 Project 2")
for option, value in spark_settings:
    builder = builder.config(option, value)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = builder.getOrCreate()

22/09/19 17:06:08 WARN Utils: Your hostname, DESKTOP-5U9FK5P resolves to a loopback address: 127.0.1.1; using 172.28.47.33 instead (on interface eth0)
22/09/19 17:06:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/19 17:06:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/19 17:06:12 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
from pyspark.sql.functions import *
from pyspark.sql import functions as F

### Load consumer data

In [3]:
# Read the curated consumer table and preview the first rows without truncation.
consumer_path = '../data/curated/cleaned_consumers.parquet'
sdf_consumer = spark.read.parquet(consumer_path)
sdf_consumer.show(n=5, truncate=False)

[Stage 1:>                                                          (0 + 1) / 1]

+-----+--------+------+-----------+
|state|postcode|gender|consumer_id|
+-----+--------+------+-----------+
|NSW  |1143    |Male  |389526     |
|NSW  |1427    |Female|152935     |
|NSW  |1172    |Female|225141     |
|NSW  |2480    |Male  |39631      |
|NSW  |2340    |Male  |1168227    |
+-----+--------+------+-----------+
only showing top 5 rows



                                                                                

In [4]:
# Number of rows in the cleaned consumer table
sdf_consumer.count()

                                                                                

498790

### Load transactional data

In [5]:
# Read the three transaction snapshots and stack them into one DataFrame
sdf_transactions1 = spark.read.parquet('../data/tables/transactions_20210228_20210827_snapshot')
sdf_transactions2 = spark.read.parquet('../data/tables/transactions_20210828_20220227_snapshot')
sdf_transactions3 = spark.read.parquet('../data/tables/transactions_20220228_20220828_snapshot')

# unionByName lines columns up by name. Plain union() matches them by position,
# so a snapshot written with a different column order would be combined
# incorrectly without raising an error.
sdf_transactions = (
    sdf_transactions1
    .unionByName(sdf_transactions2)
    .unionByName(sdf_transactions3)
)

sdf_transactions.show(10, truncate=False)

                                                                                

+-------+------------+------------------+------------------------------------+--------------+
|user_id|merchant_abn|dollar_value      |order_id                            |order_datetime|
+-------+------------+------------------+------------------------------------+--------------+
|18478  |62191208634 |63.255848959735246|949a63c8-29f7-4ab0-ada4-99ac50a88952|2021-08-20    |
|2      |15549624934 |130.3505283105634 |6a84c3cf-612a-4574-835b-144a47353eff|2021-08-20    |
|18479  |64403598239 |120.15860593212783|b10dcc33-e53f-4254-863c-de5266810cbc|2021-08-20    |
|3      |60956456424 |136.6785200286976 |0f09c5a5-784e-4477-b049-8ee4dd069b7b|2021-08-20    |
|18479  |94493496784 |72.96316578355305 |f6c78c1a-4600-4c5f-8e97-6e9eb534b586|2021-08-20    |
|3      |76819856970 |448.529684285612  |5ace6a24-cdf0-4aa3-b571-1d9406b352b5|2021-08-20    |
|18479  |67609108741 |86.4040605836911  |d0e180f0-cb06-42a3-bd1a-c47dca15bc55|2021-08-20    |
|3      |34096466752 |301.5793450525113 |6fb1ff48-24bb-4f97-

In [6]:
# Total number of transactions across the three snapshots
sdf_transactions.count()

                                                                                

14195505

### Read consumer details (joining table)

In [7]:
# Read the user_id -> consumer_id lookup table and preview it.
userdetails_path = '../data/tables/consumer_user_details.parquet'
sdf_userdetails = spark.read.parquet(userdetails_path)
sdf_userdetails.show(n=10, truncate=False)

+-------+-----------+
|user_id|consumer_id|
+-------+-----------+
|1      |1195503    |
|2      |179208     |
|3      |1194530    |
|4      |154128     |
|5      |712975     |
|6      |407340     |
|7      |511685     |
|8      |448088     |
|9      |650435     |
|10     |1058499    |
+-------+-----------+
only showing top 10 rows



In [8]:
# Number of rows in the user_id -> consumer_id lookup table
sdf_userdetails.count()

499999

### Join transactional data with consumer data

In [9]:
# Step 1: attach consumer_id to each transaction through the user_id lookup.
sdf_usertransaction = sdf_userdetails.join(
    sdf_transactions, on='user_id', how='inner'
)

# Step 2: bring in the consumer attributes through consumer_id.
# Both joins are inner joins, so rows without a match on either side are dropped.
sdf_consumer_transaction = sdf_usertransaction.join(
    sdf_consumer, on='consumer_id', how='inner'
)

sdf_consumer_transaction.show(n=5)

                                                                                

+-----------+-------+------------+------------------+--------------------+--------------+-----+--------+------+
|consumer_id|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|state|postcode|gender|
+-----------+-------+------------+------------------+--------------------+--------------+-----+--------+------+
|     651338|  18478| 62191208634|63.255848959735246|949a63c8-29f7-4ab...|    2021-08-20|  TAS|    7001|  Male|
|     179208|      2| 15549624934| 130.3505283105634|6a84c3cf-612a-457...|    2021-08-20|  NSW|    2782|Female|
|     467663|  18479| 64403598239|120.15860593212783|b10dcc33-e53f-425...|    2021-08-20|  TAS|    7010|Female|
|    1194530|      3| 60956456424| 136.6785200286976|0f09c5a5-784e-447...|    2021-08-20|   NT|    0862|Female|
|     467663|  18479| 94493496784| 72.96316578355305|f6c78c1a-4600-4c5...|    2021-08-20|  TAS|    7010|Female|
+-----------+-------+------------+------------------+--------------------+--------------+-----+--------+

In [10]:
# Row count after both joins, to compare with the transaction count before joining
sdf_consumer_transaction.count()

                                                                                

14164820

In [11]:
# Inspect the column names and types of the joined table
sdf_consumer_transaction.printSchema()

root
 |-- consumer_id: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- gender: string (nullable = true)



### Look at gender

In [12]:
# Checking gender categories: counts are joined transaction rows per gender value
sdf_consumer_transaction.groupBy('gender').count()

                                                                                

gender,count
Undisclosed,1438927
Female,6296649
Male,6429244


Gender may be a promising feature that could be used further along the line, so we want to keep it for future reference. However, some customers chose not to disclose their gender, so an 'Undisclosed' category appears in the data. The following code attempts to find out whether customers recorded as 'Undisclosed' have provided their gender in any past record.

For the first set of data, there doesn't seem to be any overlap, so this analysis did not turn out to be useful.

In [13]:
# Count transaction rows per state to check that only valid state codes are present
sdf_consumer_transaction.groupBy('state').count()

                                                                                

state,count
NT,202178
ACT,124454
SA,1612955
TAS,525947
WA,2238788
QLD,2100381
VIC,3278493
NSW,4081624


### Round dollar values to 2dp and filter to the accepted range

In [14]:
# Round dollar values to 2 decimal places, then keep only transactions whose
# rounded value lies inside the accepted range (both bounds inclusive).
min_value = 5
max_value = 10000
rounded_value = F.round(F.col('dollar_value'), 2)
sdf_consumer_transaction = (
    sdf_consumer_transaction
    .withColumn('dollar_value', rounded_value)
    .where(F.col('dollar_value').between(min_value, max_value))
)

### Check ABN validity

In [15]:
# Make sure ABN is valid; takes in a long (or a string of digits)

def validateABN(merchant_abn):
    """Return True when ``merchant_abn`` is written as exactly 11 decimal digits.

    Only the format is checked (character count and digit-only content);
    no ABN checksum is verified.

    Args:
        merchant_abn: The ABN as an int or a string. Any other value is
            converted with ``str()`` before checking.

    Returns:
        bool: True if the text form is 11 ASCII digits, otherwise False.
    """
    str_abn = str(merchant_abn)

    # Length alone is not enough: a sign or decimal point can pad a shorter
    # number up to 11 characters (e.g. -1234567890), so require digits only.
    return len(str_abn) == 11 and str_abn.isascii() and str_abn.isdigit()

In [16]:
# Create a list of the distinct merchant ABNs, used for validating ABN.
# Taking distinct values first means the driver receives one row per merchant
# instead of one row per transaction.

sdf_list = sdf_consumer_transaction.select("merchant_abn").distinct().collect()

                                                                                

In [17]:
# Find any merchants without a valid ABN

# Each collected row is checked with validateABN; failing ABNs are kept as strings.
invalidABN = [
    str(row['merchant_abn'])
    for row in sdf_list
    if not validateABN(row['merchant_abn'])
]

In [18]:
# Show the ABNs that failed validation (an empty list means none failed)
invalidABN

[]

No invalid ABNs were found.

In [19]:
# Keep only orders dated between start_date and end_date (both ends inclusive)
start_date = '2021-02-28'
end_date = '2022-08-28'
order_date = F.col('order_datetime')
sdf_consumer_transaction = sdf_consumer_transaction.filter(
    order_date.between(start_date, end_date)
)

In [20]:
# Row count remaining after the dollar-value and date-range filters
sdf_consumer_transaction.count()

                                                                                

11965964

In [21]:
# Export cleaned data, replacing any existing output at this path
output_path = '../data/curated/cleaned_transactions.parquet'
sdf_consumer_transaction.write.parquet(output_path, mode='overwrite')

                                                                                