In [1]:
from pyspark.sql import SparkSession

# Settings applied to the session builder, in insertion order.
spark_settings = {
    "spark.driver.memory": "4g",
    "spark.executor.memory": "8g",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.sql.parquet.enableVectorizedReader": "false",
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
}

# Name the application, attach each setting, then create (or reuse) the session.
session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

22/09/19 12:31:59 WARN Utils: Your hostname, dash_surface resolves to a loopback address: 127.0.1.1; using 172.31.0.166 instead (on interface eth0)
22/09/19 12:31:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/19 12:32:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.sql.functions import *
from pyspark.sql import functions as F

### Load consumer data

In [3]:
# The curated consumer table is produced by consumerPreprocessing.ipynb; run that first.
consumer_path = '../data/curated/cleaned_consumers'
sdf_consumer = spark.read.parquet(consumer_path)

# Preview the first five rows without truncating long values.
sdf_consumer.show(n=5, truncate=False)

[Stage 1:>                                                          (0 + 1) / 1]

+------------------+----------------------------+-----+--------+-----------+-----------+
|name              |address                     |state|postcode|gender     |consumer_id|
+------------------+----------------------------+-----+--------+-----------+-----------+
|Jonathan Compton  |0815 Alicia Centers         |NSW  |2594    |Male       |650538     |
|Alice Roberts     |76407 Audrey Falls Apt. 548 |NSW  |1630    |Female     |1401331    |
|Michael Leach DVM |16427 Webster Orchard       |NSW  |1108    |Male       |3341       |
|Michelle Rodriguez|805 Justin Landing Suite 305|NSW  |2506    |Undisclosed|624863     |
|Tara Harris       |0528 Andrea Ferry           |NSW  |2442    |Female     |193993     |
+------------------+----------------------------+-----+--------+-----------+-----------+
only showing top 5 rows



                                                                                

In [4]:
# Number of rows in the curated consumer table.
sdf_consumer.count()

                                                                                

498933

### Load transactional data

In [5]:
# Load the three transaction snapshots and stack them into a single DataFrame.
snapshot_paths = (
    '../data/tables/transactions_20210228_20210827_snapshot',
    '../data/tables/transactions_20210828_20220227_snapshot',
    '../data/tables/transactions_20220228_20220828_snapshot',
)
sdf_transactions1, sdf_transactions2, sdf_transactions3 = [
    spark.read.parquet(path) for path in snapshot_paths
]

# union() appends rows by column position, so the snapshots are assumed to share a schema.
sdf_transactions = sdf_transactions1.union(sdf_transactions2).union(sdf_transactions3)

                                                                                

In [6]:
# Total number of rows across the three unioned transaction snapshots.
sdf_transactions.count()

                                                                                

14195505

### Read consumer details (joining table)

In [7]:
# Joining table: used below to link transactions (via user_id) to consumers (via consumer_id).
sdf_userdetails = spark.read.parquet('../data/tables/consumer_user_details.parquet')

### Join transactional data with consumer data

In [8]:
# Step 1: attach transactions to the user-details table via user_id (inner join).
sdf_usertransaction = sdf_userdetails.join(
    sdf_transactions,
    on='user_id',
    how='inner',
)

# Step 2: attach the consumer attributes via consumer_id (inner join).
sdf_consumer_transaction = sdf_usertransaction.join(
    sdf_consumer,
    on='consumer_id',
    how='inner',
)

In [9]:
# Restrict the joined table to the columns that are kept for later steps.
kept_columns = [
    "consumer_id",
    "user_id",
    'merchant_abn',
    'dollar_value',
    'order_datetime',
    'state',
    'postcode',
    'gender',
]
sdf_consumer_transaction = sdf_consumer_transaction.select(*kept_columns)

### Look at gender

In [10]:
# Count the rows of the joined table per gender value to see which categories occur.
sdf_consumer_transaction.groupBy('gender').count()

                                                                                

gender,count
Undisclosed,1440093
Female,6296649
Male,6433949


Gender may be a promising feature that could be used further down the line, so we want to keep it for future reference. However, some customers chose not to disclose their gender, so an 'Undisclosed' category appears in the gender feature. The following code attempts to find out whether customers recorded as 'Undisclosed' have provided a gender in any of their past records.

For the first set of data, there doesn't seem to be any overlap, so this analysis did not turn up anything useful.

### Round dollar values to 2dp and filter to a valid range

In [12]:
# Round dollar_value to 2 decimal places, then keep rows whose rounded value
# lies in the half-open range (min_value, max_value].
min_value = 0
max_value = 10000

rounded_value = F.round(F.col('dollar_value'), 2)
sdf_consumer_transaction = sdf_consumer_transaction.withColumn('dollar_value', rounded_value)

# The filter runs on the rounded column: strictly above min_value, at most max_value.
above_minimum = F.col('dollar_value') > min_value
within_maximum = F.col('dollar_value') <= max_value
sdf_consumer_transaction = sdf_consumer_transaction.where(above_minimum & within_maximum)

### Check ABN validity

In [13]:
# Make sure ABN is valid, takes in long

def validateABN(merchant_abn):
    """Return True when ``merchant_abn`` is written as exactly 11 decimal digits.

    Args:
        merchant_abn: The ABN as an int or a string; it is converted with ``str()``.

    Returns:
        bool: True for an 11-character, all-digit value, otherwise False.

    Note:
        This is a format check only; the ABN checksum is not verified.
    """
    str_abn = str(merchant_abn)

    # Length alone would accept values such as -1234567890 or 123456789.0
    # (both 11 characters), so every character must also be an ASCII digit.
    return len(str_abn) == 11 and str_abn.isascii() and str_abn.isdigit()

In [14]:
# Collect the distinct merchant ABNs for validation. Validity depends only on the
# ABN value, so de-duplicating in Spark first avoids pulling one Row per transaction
# onto the driver.
sdf_list = sdf_consumer_transaction.select("merchant_abn").distinct().collect()

                                                                                

In [15]:
# Find any merchants without a valid ABN

# Keep the string form of every collected ABN that fails validateABN.
invalidABN = [
    str(row['merchant_abn'])
    for row in sdf_list
    if not validateABN(row['merchant_abn'])
]

In [16]:
# Show the ABNs that failed validation in the previous cell; an empty list means none failed.
invalidABN

[]

No invalid ABNs were found.

In [17]:
# Keep only orders whose order_datetime lies in the inclusive window [start_date, end_date].
start_date = '2021-02-28'
end_date = '2022-08-28'

# NOTE(review): if order_datetime is a timestamp rather than a date, the string upper
# bound presumably compares as midnight on end_date — confirm the column type.
order_date = F.col('order_datetime')
in_window = order_date.between(start_date, end_date)
sdf_consumer_transaction = sdf_consumer_transaction.where(in_window)

# Row count after the date filter.
sdf_consumer_transaction.count()

                                                                                

12534085

In [19]:
# Export cleaned data. Overwrite mode lets the notebook be re-run from a fresh kernel;
# the default write mode raises an error when the output path already exists.
sdf_consumer_transaction.write.mode("overwrite").parquet("../data/curated/cleaned_transactions.parquet")

                                                                                