In [66]:
import pandas as pd
from collections import Counter
from pyspark.sql.functions import *

In [7]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.driver.memory", '4g')
    .config("spark.executor.memory", '8g')
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.sql.parquet.enableVectorizedReader","false")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .getOrCreate()
)

22/09/13 10:19:07 WARN Utils: Your hostname, LAPTOP-4PS96C35 resolves to a loopback address: 127.0.1.1; using 172.24.42.143 instead (on interface eth0)
22/09/13 10:19:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/13 10:19:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/13 10:19:19 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [28]:
merchants_df = spark.read.parquet('../data/curated/merchants_df.parquet')

In [29]:
merchants_df.show(5)

+------------+--------------------+--------------------+-------------+---------+
|merchant_abn|                name|            category|revenue_level|take_rate|
+------------+--------------------+--------------------+-------------+---------+
| 10023283211|       Felis Limited|[furniture, home ...|            e|     0.18|
| 10142254217|Arcu Ac Orci Corp...|[cable, satellite...|            b|     4.22|
| 10165489824|    Nunc Sed Company|[jewelry, watch, ...|            b|      4.4|
| 10187291046|Ultricies Digniss...|[watch, clock, je...|            b|     3.29|
| 10192359162| Enim Condimentum PC|[music shops - mu...|            a|     6.33|
+------------+--------------------+--------------------+-------------+---------+
only showing top 5 rows



In [8]:
transactions_1 = spark.read.parquet('../data/curated/transactions_1.parquet')

                                                                                

In [9]:
transactions_1.limit(5)

                                                                                

consumer_id,user_id,merchant_abn,dollar_value,order_datetime,state,postcode,gender
561,14108,32709545238,361.7919109492337,2021-08-21,NSW,2293,Male
561,14108,19839532017,157.0,2021-08-21,NSW,2293,Male
561,14108,14639489823,217.70536080483544,2021-08-21,NSW,2293,Male
561,14108,69666829657,178.1273076070476,2021-08-21,NSW,2293,Male
561,14108,26148653604,15.598745581918507,2021-08-21,NSW,2293,Male


In [89]:
company_total_1 = transactions_1.groupBy("merchant_abn").sum('dollar_value')
company_total_1 = company_total_1.withColumnRenamed('sum(dollar_value)', 'total_transactions')
# company_total_1.show(5)



+------------+------------------+
|merchant_abn|total_transactions|
+------------+------------------+
| 19839532017|           29830.0|
| 83412691377|127216.11957102377|
| 38700038932|2434872.0408709096|
| 35344855546|33886.502992264264|
| 73256306726| 403476.5997903018|
+------------+------------------+
only showing top 5 rows



                                                                                

In [74]:
# Testing cell
# transactions_1.select(countDistinct('merchant_abn'))
# transactions_1.groupBy('merchant_abn').count()

In [83]:
transactions_per_company = transactions_1.groupBy('merchant_abn').count()
transactions_per_company = transactions_per_company.withColumnRenamed('merchant_abn', 'abn')
# transactions_per_company.show(5)

In [90]:
joined_merchants_df = merchants_df.join(company_total_1, merchants_df.merchant_abn == company_total_1.merchant_abn, 'inner').drop(company_total_1['merchant_abn'])
# joined_merchants_df.show(5)

In [91]:
joined_merchants_df = joined_merchants_df.join(transactions_per_company, joined_merchants_df.merchant_abn == transactions_per_company.abn, 'inner').drop('abn')
joined_merchants_df.show(5)



+------------+--------------------+--------------------+-------------+---------+------------------+-----+
|merchant_abn|                name|            category|revenue_level|take_rate|total_transactions|count|
+------------+--------------------+--------------------+-------------+---------+------------------+-----+
| 19839532017|Pellentesque Habi...|[cable, satellite...|            b|     4.94|           29830.0|  190|
| 83412691377|Suspendisse Sagit...|[watch, clock, je...|            c|     2.94|127216.11957102377| 3645|
| 38700038932|Etiam Bibendum In...|[tent, awning shops]|            a|     6.31|2434872.0408709096| 1846|
| 35344855546|Quis Tristique Ac...|[watch, clock, je...|            c|     2.92|33886.502992264264|  384|
| 73256306726|              Id LLP|[health, beauty s...|            b|     4.81| 403476.5997903018| 1378|
+------------+--------------------+--------------------+-------------+---------+------------------+-----+
only showing top 5 rows



                                                                                

In [94]:
joined_merchants_df = joined_merchants_df.withColumnRenamed('count', 'no_transactions')
joined_merchants_df.withColumn('avg_transaction', joined_merchants_df.total_transactions / joined_merchants_df.no_transactions).show(5)

                                                                                

+------------+--------------------+--------------------+-------------+---------+------------------+---------------+------------------+
|merchant_abn|                name|            category|revenue_level|take_rate|total_transactions|no_transactions|   avg_transaction|
+------------+--------------------+--------------------+-------------+---------+------------------+---------------+------------------+
| 19839532017|Pellentesque Habi...|[cable, satellite...|            b|     4.94|           29830.0|            190|             157.0|
| 83412691377|Suspendisse Sagit...|[watch, clock, je...|            c|     2.94|127216.11957102377|           3645| 34.90154172044548|
| 38700038932|Etiam Bibendum In...|[tent, awning shops]|            a|     6.31|2434872.0408709096|           1846|1318.9989387166358|
| 35344855546|Quis Tristique Ac...|[watch, clock, je...|            c|     2.92|33886.502992264264|            384| 88.24610154235485|
| 73256306726|              Id LLP|[health, beauty s...

In [79]:
joined_merchants_df.sort(col('sum(dollar_value)').desc()).show(20)

                                                                                

+------------+--------------------+--------------------+-------------+---------+------------------+
|merchant_abn|                name|            category|revenue_level|take_rate| sum(dollar_value)|
+------------+--------------------+--------------------+-------------+---------+------------------+
| 39649557865|Arcu Morbi Institute|[artist supply, c...|            c|     1.47|2586772.5714433747|
| 79827781481|     Amet Risus Inc.|[furniture, home ...|            a|     6.82|  2569945.73735937|
| 31334588839|Lacus Aliquam Cor...|[antique shops - ...|            b|     4.22| 2506651.862665567|
| 27093785141|Placerat Orci Ins...|[stationery, offi...|            c|     2.73|2505283.7549576736|
| 96680767841|      Ornare Limited|[motor vehicle su...|            a|     5.91|2471313.3819261594|
| 86578477987|   Leo In Consulting|[watch, clock, je...|            a|     6.43|2451541.3464945666|
| 32709545238|Tempor Est Founda...|[stationery, offi...|            c|     3.04|2448476.5028270427|
