# Observe top and bottom merchants from ranking system

In [8]:
from pyspark.sql.functions import *
from pyspark.sql import functions as F
import pandas as pd

In [2]:
# Curated inputs: the per-transaction predictions and the merchant lookup table.
predictions_path = '../data/curated/transaction_predictions.parquet'
merchants_path = '../data/curated/merchants.parquet'

predictions = pd.read_parquet(predictions_path)
merchants_df = pd.read_parquet(merchants_path)

In [3]:
# Keep only the merchant attributes needed to label the rankings.
merchants_df = merchants_df[['merchant_abn', 'name', 'segment']]

# Total predicted value per merchant, highest first.
# Only the 'value' column is aggregated: summing every column of `predictions`
# does needless work and can misbehave on non-numeric columns.
total_ranking = (
    predictions.groupby('merchant_abn')[['value']]
    .sum()
    .sort_values(by='value', ascending=False)
)

# how='right' keeps every merchant that has a predicted value, matching the
# merchants' 'merchant_abn' column against the index of the per-merchant totals.
segments_ranking = merchants_df.join(total_ranking, on='merchant_abn', how='right')
total_ranking = segments_ranking[['merchant_abn', 'name', 'value']]


def _segment_ranking(segment):
    """Return the rows of `segments_ranking` for `segment`, highest value first."""
    in_segment = segments_ranking[segments_ranking['segment'] == segment]
    return in_segment.sort_values(by='value', ascending=False)


# One ranking per merchant segment.
misc_df = _segment_ranking('Miscellaneous')
home_df = _segment_ranking('Home')
acc_df = _segment_ranking('Accessories')
art_df = _segment_ranking('Art')
tech_df = _segment_ranking('Media & Technology')

In [4]:
# Observing best merchants: order by total value, largest first, and show the top 20.
best_first = total_ranking.sort_values('value', ascending=False)
best_first.head(20)

Unnamed: 0,merchant_abn,name,value
3154,79827781481,Amet Risus Inc.,8040.913574
1034,32361057556,Orci In Consequat Corporation,8030.433594
1766,48534649627,Dignissim Maecenas Foundation,7974.459473
1329,38700038932,Etiam Bibendum Industries,7731.416016
3443,86578477987,Leo In Consulting,7717.433105
3881,96680767841,Ornare Limited,7418.352051
519,21439773999,Mauris Non Institute,7387.193848
1637,45629217853,Lacus Consulting,7343.370605
2386,63123845164,Odio Phasellus Institute,7316.287109
2446,64403598239,Lobortis Ultrices Company,7203.78125


In [13]:
# Observing worst merchants: order by total value, smallest first, and show the bottom 20.
worst_first = total_ranking.sort_values('value', ascending=True)
worst_first.head(20)

Unnamed: 0,merchant_abn,name,value
2060,55403018592,Elit Limited,5.77317e-15
1851,50532670634,Accumsan Laoreet Ipsum Company,5.77317e-15
2640,68591542501,Felis Ltd,5.77317e-15
2,10165489824,Nunc Sed Company,5.77317e-15
594,22853038342,Semper Pretium Limited,5.77317e-15
3428,86201937910,Libero Et LLC,5.77317e-15
3184,80426072728,Massa Non Ante Industries,5.77317e-15
3029,76866488151,Euismod Urna Company,5.77317e-15
381,18261886835,Massa Rutrum LLP,5.77317e-15
2067,55555661470,Nullam Scelerisque LLC,5.77317e-15


# Observe relevant information for selected merchants

In [2]:
from pyspark.sql import SparkSession


# Session settings, applied to the builder in the order listed.
spark_settings = {
    "spark.driver.memory": '4g',
    "spark.executor.memory": '8g',
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.sql.parquet.enableVectorizedReader": "false",
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
}

session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an already-running session if there is one, otherwise starts a new one.
spark = session_builder.getOrCreate()

22/10/09 20:47:29 WARN Utils: Your hostname, dash_surface resolves to a loopback address: 127.0.1.1; using 172.24.195.6 instead (on interface eth0)
22/10/09 20:47:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/09 20:47:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Cleaned transactions from the curated data folder, loaded as a Spark DataFrame.
transactions_path = '../data/curated/cleaned_transactions.parquet/'
transactions_sdf = spark.read.parquet(transactions_path)

                                                                                

In [4]:
# Merchant lookup table from the curated data folder, loaded as a Spark DataFrame.
merchant_info_path = '../data/curated/merchants.parquet'
merchant_info = spark.read.parquet(merchant_info_path)

### Info on best merchant

In [12]:
# Transactions of the number 1 ranked merchant
top_abn = 79827781481
topMerchant = transactions_sdf.filter(F.col('merchant_abn') == top_abn)

In [9]:
# Find take rate: display the top merchant's record from the merchant table
merchant_info.filter(F.col('merchant_abn') == 79827781481)

merchant_abn,name,category,revenue_level,take_rate,category_indexed,segment
79827781481,Amet Risus Inc.,"[furniture, home ...",a,0.0682,11,Home


In [9]:
# Gender breakdown of the top merchant's transactions.
# A single grouped aggregation replaces four separate count() jobs over the same data.
topGenderCounts = {
    gender: n for gender, n in topMerchant.groupBy('gender').count().collect()
}
# Summing every group (including any null or unexpected gender value) gives the row count.
total = sum(topGenderCounts.values())
maleCount = topGenderCounts.get("Male", 0)
femaleCount = topGenderCounts.get("Female", 0)
undisCount = topGenderCounts.get("Undisclosed", 0)

                                                                                

In [10]:
# Share of the top merchant's transactions made by each gender group.
top_male_share = maleCount / total
top_female_share = femaleCount / total
top_undisclosed_share = undisCount / total

print(f"Top merchant male proportion: {top_male_share}")
print(f"Top merchant female proportion: {top_female_share}")
print(f"Top merchant undisclosed proportion: {top_undisclosed_share}")
print(f"Top merchant total count: {total}")

Top merchant male proportion: 0.45229681978798586
Top merchant female proportion: 0.44617196702002354
Top merchant undisclosed proportion: 0.10153121319199057
Top merchant total count: 4245


In [15]:
# Find total revenue for merchant: sum of dollar_value over the merchant's transactions
topMerchant.groupBy('merchant_abn').agg(F.sum('dollar_value'))

                                                                                

merchant_abn,sum(dollar_value)
79827781481,8629467.760000002


In [18]:
# Revenue made by BNPL service = merchant's total sales * merchant's take rate.
# Both factors are read from the data rather than re-typed from earlier cell
# outputs, so the figure cannot drift from the underlying tables.
topSales = topMerchant.agg(F.sum('dollar_value')).first()[0]
topTakeRate = (
    merchant_info.where(F.col('merchant_abn') == 79827781481).first()['take_rate']
)
topRev = float(topSales) * float(topTakeRate)
topRev

588529.7012319999

### Info on 2nd best merchant

In [19]:
# Transactions of the 2nd best merchant
second_abn = 32361057556
secondMerchant = transactions_sdf.filter(F.col('merchant_abn') == second_abn)

In [10]:
# Display the 2nd best merchant's record (includes its take rate)
merchant_info.filter(F.col('merchant_abn') == 32361057556)

merchant_abn,name,category,revenue_level,take_rate,category_indexed,segment
32361057556,Orci In Consequat...,"[gift, card, nove...",a,0.0661,12,Miscellaneous


In [12]:
# Gender breakdown of the 2nd merchant's transactions.
# A single grouped aggregation replaces four separate count() jobs over the same data.
secondGenderCounts = {
    gender: n for gender, n in secondMerchant.groupBy('gender').count().collect()
}
# Summing every group (including any null or unexpected gender value) gives the row count.
stotal = sum(secondGenderCounts.values())
smaleCount = secondGenderCounts.get("Male", 0)
sfemaleCount = secondGenderCounts.get("Female", 0)
sundisCount = secondGenderCounts.get("Undisclosed", 0)

                                                                                

In [17]:
# Share of the 2nd merchant's transactions made by each gender group.
second_male_share = smaleCount / stotal
second_female_share = sfemaleCount / stotal
second_undisclosed_share = sundisCount / stotal

print(f"2nd merchant male proportion: {second_male_share}")
print(f"2nd merchant female proportion: {second_female_share}")
print(f"2nd merchant undisclosed proportion: {second_undisclosed_share}")
print(f"2nd merchant total count: {stotal}")

2nd merchant male proportion: 0.4527753560011625
2nd merchant female proportion: 0.44601199439909117
2nd merchant undisclosed proportion: 0.10121264959974638
2nd merchant total count: 75702


In [20]:
# Find total revenue for merchant: sum of dollar_value over the merchant's transactions
secondMerchant.groupBy('merchant_abn').agg(F.sum('dollar_value'))

                                                                                

merchant_abn,sum(dollar_value)
32361057556,8323057.879999995


In [22]:
# Revenue made by BNPL service = merchant's total sales * merchant's take rate.
# Both factors are read from the data rather than re-typed from earlier cell
# outputs, so the figure cannot drift from the underlying tables.
secondSales = secondMerchant.agg(F.sum('dollar_value')).first()[0]
secondTakeRate = (
    merchant_info.where(F.col('merchant_abn') == 32361057556).first()['take_rate']
)
secondRev = float(secondSales) * float(secondTakeRate)
secondRev

550154.1258680001

In [23]:
# Does the top-ranked merchant earn the BNPL service more than the 2nd-ranked one?
secondRev < topRev

True

### Info on 20th merchant

In [25]:
# Transactions of the 20th best merchant
twenty_abn = 76767266140
twentyMerchant = transactions_sdf.filter(F.col('merchant_abn') == twenty_abn)

In [27]:
# Display the 20th best merchant's record (includes its take rate)
merchant_info.filter(F.col('merchant_abn') == 76767266140)

merchant_abn,name,category,revenue_level,take_rate,category_indexed,segment
76767266140,Phasellus At Limited,"[furniture, home ...",b,0.0465,11,Home


In [22]:
# Gender breakdown of the 20th merchant's transactions.
# A single grouped aggregation replaces four separate count() jobs over the same data.
twentyGenderCounts = {
    gender: n for gender, n in twentyMerchant.groupBy('gender').count().collect()
}
# Summing every group (including any null or unexpected gender value) gives the row count.
ttotal = sum(twentyGenderCounts.values())
tmaleCount = twentyGenderCounts.get("Male", 0)
tfemaleCount = twentyGenderCounts.get("Female", 0)
tundisCount = twentyGenderCounts.get("Undisclosed", 0)

                                                                                

In [23]:
# Share of the 20th merchant's transactions made by each gender group.
twenty_male_share = tmaleCount / ttotal
twenty_female_share = tfemaleCount / ttotal
twenty_undisclosed_share = tundisCount / ttotal

print(f"20th merchant male proportion: {twenty_male_share}")
print(f"20th merchant female proportion: {twenty_female_share}")
print(f"20th merchant undisclosed proportion: {twenty_undisclosed_share}")
print(f"20th merchant total count: {ttotal}")

20th merchant male proportion: 0.4524472431110302
20th merchant female proportion: 0.4479977101818844
20th merchant undisclosed proportion: 0.09955504670708543
20th merchant total count: 38431


In [26]:
# Find total revenue for merchant: sum of dollar_value over the merchant's transactions
twentyMerchant.groupBy('merchant_abn').agg(F.sum('dollar_value'))

                                                                                

merchant_abn,sum(dollar_value)
76767266140,8243403.799999999


In [28]:
# Revenue made by BNPL service = merchant's total sales * merchant's take rate.
# Both factors are read from the data rather than re-typed from earlier cell
# outputs, so the figure cannot drift from the underlying tables.
twentySales = twentyMerchant.agg(F.sum('dollar_value')).first()[0]
twentyTakeRate = (
    merchant_info.where(F.col('merchant_abn') == 76767266140).first()['take_rate']
)
twentyRev = float(twentySales) * float(twentyTakeRate)
twentyRev

383318.2767

In [29]:
# Does the top-ranked merchant earn the BNPL service more than the 20th-ranked one?
twentyRev < topRev

True

### Info on worst ranked merchant

In [31]:
# Transactions of the worst ranked merchant
bad_abn = 55403018592
badMerchant = transactions_sdf.filter(F.col('merchant_abn') == bad_abn)

In [30]:
# Display the worst ranked merchant's record (includes its take rate)
merchant_info.filter(F.col('merchant_abn') == 55403018592)

merchant_abn,name,category,revenue_level,take_rate,category_indexed,segment
55403018592,Elit Limited,[antique shops - ...,b,0.0484,0,Home


In [15]:
# Gender breakdown of the worst merchant's transactions.
# A single grouped aggregation replaces four separate count() jobs over the same data.
badGenderCounts = {
    gender: n for gender, n in badMerchant.groupBy('gender').count().collect()
}
# Summing every group (including any null or unexpected gender value) gives the row count.
btotal = sum(badGenderCounts.values())
bmaleCount = badGenderCounts.get("Male", 0)
bfemaleCount = badGenderCounts.get("Female", 0)
bundisCount = badGenderCounts.get("Undisclosed", 0)

In [16]:
# Share of the worst merchant's transactions made by each gender group.
bad_male_share = bmaleCount / btotal
bad_female_share = bfemaleCount / btotal
bad_undisclosed_share = bundisCount / btotal

print(f"Worst merchant male proportion: {bad_male_share}")
print(f"Worst merchant female proportion: {bad_female_share}")
print(f"Worst merchant undisclosed proportion: {bad_undisclosed_share}")
print(f"Worst merchant total count: {btotal}")

Worst merchant male proportion: 1.0
Worst merchant female proportion: 0.0
Worst merchant undisclosed proportion: 0.0
Worst merchant total count: 1


In [32]:
# Find total revenue for merchant: sum of dollar_value over the merchant's transactions
badMerchant.groupBy('merchant_abn').agg(F.sum('dollar_value'))

                                                                                

merchant_abn,sum(dollar_value)
55403018592,6667.84


In [33]:
# Revenue made by BNPL service = merchant's total sales * merchant's take rate.
# Both factors are read from the data rather than re-typed from earlier cell
# outputs, so the figure cannot drift from the underlying tables.
worstSales = badMerchant.agg(F.sum('dollar_value')).first()[0]
worstTakeRate = (
    merchant_info.where(F.col('merchant_abn') == 55403018592).first()['take_rate']
)
worstRev = float(worstSales) * float(worstTakeRate)
worstRev

322.723456

From these results, our ranking system appears to be working as intended: among the merchants inspected (ranked 1st, 2nd, 20th and last), the highest-ranked merchant brings the BNPL service the most revenue (about 588,530), and each lower-ranked merchant returns progressively less (about 550,154, 383,318 and 323 respectively).