In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.ml.feature import Imputer
import numpy as np
import pandas as pd

# Session-level Spark settings used by this notebook; each entry is passed to
# the builder unchanged.
spark_options = {
    "spark.driver.memory": '8g',
    "spark.executor.memory": '8g',
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.sql.parquet.enableVectorizedReader": "false",
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
}

session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

# Reuses an existing session when one is already running, otherwise starts one.
spark = session_builder.getOrCreate()

22/10/05 21:58:05 WARN Utils: Your hostname, DESKTOP-3NQ3PQI resolves to a loopback address: 127.0.1.1; using 172.17.23.167 instead (on interface eth0)
22/10/05 21:58:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/05 21:58:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Read in data

In [2]:
def read_fraud_probabilities(path, prob_col):
    """Load a fraud-probability CSV and prepare it for joining.

    Parameters
    ----------
    path : str
        Location of a CSV file with a header row that contains (at least)
        the columns `order_datetime` and `fraud_probability`.
    prob_col : str
        Name given to the rescaled `fraud_probability` column.

    Returns
    -------
    pyspark.sql.DataFrame
        The CSV contents with `order_datetime` parsed to a date and
        `fraud_probability` divided by 100 (presumably percentage ->
        fraction; confirm against the data dictionary) and renamed to
        `prob_col`. All other columns are left as read, i.e. as strings.
    """
    return (
        spark.read.option('header', True).csv(path)
        .withColumn('order_datetime', F.to_date('order_datetime'))
        # The CSV is read without schema inference, so the value arrives as a
        # string; cast explicitly instead of relying on implicit coercion.
        .withColumn(
            'fraud_probability',
            F.col('fraud_probability').cast('double') / 100
        )
        .withColumnRenamed('fraud_probability', prob_col)
    )


# Curated / raw lookup tables
weights_sdf = spark.read.parquet(
    '../data/curated/demographic_weights.parquet'
)
consumers_sdf = spark.read.parquet(
    '../data/curated/cleaned_consumers.parquet'
)

user_details_sdf = spark.read.parquet(
    '../data/tables/consumer_user_details.parquet'
)

merchants_sdf = spark.read.parquet(
    '../data/curated/merchants.parquet'
)

# Fraud probabilities (same file layout for consumers and merchants)
consumer_fraud_prob_sdf = read_fraud_probabilities(
    '../data/tables/consumer_fraud_probability.csv',
    'consumer_fraud_prob'
)

merchant_fraud_prob_sdf = read_fraud_probabilities(
    '../data/tables/merchant_fraud_probability.csv',
    'merchant_fraud_prob'
)

postcode_poa_sdf = spark.read.parquet(
    '../data/curated/census/postcode_poa.parquet'
)

# Transactions are delivered as three consecutive snapshots; stack them into
# a single table. Columns are matched by name so that a difference in column
# order between snapshots cannot silently misalign the data.
transaction_snapshot_paths = [
    '../data/tables/transactions_20210228_20210827_snapshot',
    '../data/tables/transactions_20210828_20220227_snapshot',
    '../data/tables/transactions_20220228_20220828_snapshot',
]

transactions_sdf = spark.read.parquet(transaction_snapshot_paths[0])
for snapshot_path in transaction_snapshot_paths[1:]:
    transactions_sdf = transactions_sdf.unionByName(
        spark.read.parquet(snapshot_path)
    )



                                                                                

### Remove transactions outside the valid BNPL range

In [3]:
# Row count of the combined transaction table before any filtering.
print('Total transactions:')
total_transactions = transactions_sdf.count()
total_transactions

Total transactions:


                                                                                

14195505

In [4]:
# Keep only transactions whose dollar value lies inside the accepted range
# (both bounds inclusive) and report how many rows were discarded.
min_value = 10
max_value = 10000
pre_count = transactions_sdf.count()

in_valid_range = F.col('dollar_value').between(min_value, max_value)
transactions_sdf = transactions_sdf.filter(in_valid_range)

print('Total removed:')
pre_count - transactions_sdf.count()

                                                                                

Total removed:


                                                                                

1363555

### Join transaction data with consumer data and weights

In [5]:
# --- Stage 1: attach consumer attributes, demographic weight and consumer
# --- fraud probability to every transaction (all left joins, so no
# --- transaction is dropped here).
consumer_lookup_sdf = consumers_sdf.select('consumer_id', 'postcode', 'gender')

consumer_enriched_sdf = (
    transactions_sdf
    .join(user_details_sdf, on = 'user_id', how = 'left')
    .join(consumer_lookup_sdf, on = 'consumer_id', how = 'left')
    .join(postcode_poa_sdf, on = 'postcode', how = 'left')
    .join(weights_sdf, on = ['poa', 'gender'], how = 'left')
    .join(consumer_fraud_prob_sdf, on = ['user_id', 'order_datetime'], how = 'left')
    # Transactions with no matching fraud record get a probability of 0
    .fillna(0, subset = 'consumer_fraud_prob')
)

# --- Stage 2: replace missing weights with the mean of the observed weights
imputer = Imputer(strategy = 'mean', inputCol = 'weight', outputCol = 'weight')
consumer_enriched_sdf = imputer.fit(consumer_enriched_sdf).transform(consumer_enriched_sdf)

# --- Stage 3: transaction-level weighting
consumer_weighted_value = (
    F.col('dollar_value')*F.col('weight')*(1 - F.col('consumer_fraud_prob'))
)
consumer_enriched_sdf = consumer_enriched_sdf.withColumn(
    'weighted_dollar_value', consumer_weighted_value
)

# --- Stage 4: daily totals per merchant
merchant_daily_sdf = consumer_enriched_sdf.groupby(
    'merchant_abn', 'order_datetime'
).agg(
    F.sum('weighted_dollar_value').alias('weighted_dollar_value'),
    F.sum('dollar_value').alias('dollar_value')
)

# --- Stage 5: attach merchant take rate and merchant fraud probability.
# The take-rate join is an inner join, so merchants absent from
# merchants_sdf are removed at this point.
merchant_rates_sdf = merchants_sdf.select('merchant_abn', 'take_rate')

merchant_daily_sdf = (
    merchant_daily_sdf
    .join(merchant_rates_sdf, on = 'merchant_abn', how = 'inner')
    .join(merchant_fraud_prob_sdf, on = ['merchant_abn', 'order_datetime'], how = 'left')
    # Merchant-days with no matching fraud record get a probability of 0
    .fillna(0, subset = 'merchant_fraud_prob')
)

# --- Stage 6: merchant-level weighting, keeping only the output columns
merchant_weighted_value = (
    F.col('weighted_dollar_value')*F.col('take_rate')*(1 - F.col('merchant_fraud_prob'))
)
transactions_sdf = merchant_daily_sdf.withColumn(
    'weighted_dollar_value', merchant_weighted_value
).select(
    'merchant_abn', 'order_datetime', 'weighted_dollar_value', 'dollar_value'
)

                                                                                

In [6]:
# Collect the aggregated Spark DataFrame into a pandas DataFrame; the
# remaining cells work in pandas.
transactions_df = transactions_sdf.toPandas()

                                                                                

In [7]:
# Build the full merchant x day grid so that merchant-days without any
# transactions appear explicitly (with a value of 0) instead of being absent.
time_steps = transactions_df[['order_datetime']].drop_duplicates()
merchants = transactions_df[['merchant_abn']].drop_duplicates()

# A cross merge yields every (merchant, day) pair directly, without adding a
# dummy join key to the de-duplicated frames.
merchant_time_steps = merchants.merge(time_steps, how = 'cross')

transactions_df = pd.merge(
    transactions_df,
    merchant_time_steps,
    on = ['merchant_abn', 'order_datetime'],
    how = 'outer'
)

# Grid rows with no matching transactions have missing values; set them to 0.
# The fill is restricted to the value columns so the key columns are never
# altered.
value_cols = ['weighted_dollar_value', 'dollar_value']
transactions_df[value_cols] = transactions_df[value_cols].fillna(0)

In [8]:
# Index every row by the number of whole weeks elapsed since the earliest
# order date. The dates are converted to datetime64 first so the subtraction
# is vectorised rather than performed element-wise on Python date objects;
# the stored `order_datetime` column itself is left unchanged here.
order_dates = pd.to_datetime(transactions_df['order_datetime'])
transactions_df['week_idx'] = (order_dates - order_dates.min()).dt.days // 7

# Drop the final week (presumably because it is only partially covered by the
# data - confirm). NOTE: this always removes the highest week index, so
# re-running this cell on its own output removes one more week each time.
last_week_idx = transactions_df['week_idx'].max()
transactions_df = transactions_df[transactions_df['week_idx'] != last_week_idx]

In [9]:
# Collapse the daily rows to one row per merchant and week: the earliest
# order date in the week plus the weekly totals of both value columns.
weekly_groups = transactions_df.groupby(['merchant_abn', 'week_idx'])
transactions_df = weekly_groups.agg(
    order_datetime = ('order_datetime', 'min'),
    weighted_dollar_value = ('weighted_dollar_value', 'sum'),
    dollar_value = ('dollar_value', 'sum'),
).reset_index()

In [10]:
# Store the dates as pandas datetimes and derive the ISO week number
# (kept as a string) from them.
week_dates = pd.to_datetime(transactions_df['order_datetime'])
transactions_df['order_datetime'] = week_dates
transactions_df['week_of_year'] = week_dates.dt.isocalendar()['week'].astype(str)

### Write to disk

In [11]:
# Persist the weekly merchant-level table to the curated data folder.
transactions_df.to_parquet('../data/curated/weighted_transactions.parquet')