In [1]:
from pyspark.sql import SparkSession, functions as F
import pandas as pd
import numpy as np
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Interaction
from pyspark.sql.functions import col
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
import plotly.express as px
import datetime

In [2]:
# Session-level Spark settings, applied to the builder in the order listed:
# 8g of driver and executor memory, a UTC session time zone, the vectorized
# Parquet reader switched off, eager REPL evaluation switched on and Parquet
# metadata caching switched on.
spark_settings = [
    ("spark.driver.memory", '8g'),
    ("spark.executor.memory", '8g'),
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ("spark.sql.parquet.enableVectorizedReader", "false"),
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
]

session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for setting_name, setting_value in spark_settings:
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()


22/10/10 00:35:38 WARN Utils: Your hostname, DESKTOP-3NQ3PQI resolves to a loopback address: 127.0.1.1; using 172.17.27.14 instead (on interface eth0)
22/10/10 00:35:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/10 00:35:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the curated inputs into pandas. The paths are relative, so they resolve
# against the notebook's working directory.
transactions_df = pd.read_parquet('../data/curated/weighted_monthly_transactions.parquet')
# NOTE(review): merchant_df and segments_df are not referenced by any later cell
# visible in this notebook — confirm whether these two reads are still needed.
merchant_df = pd.read_parquet('../data/curated/merchants.parquet')
segments_df = pd.read_csv('../data/curated/segments.csv')

In [4]:

# Forecast horizon: October-December 2022 followed by January-September 2023.
# The constant `key` column exists only to cross-join the horizon with merchants.
forecast_months = pd.DataFrame(
    {'month' : [10,11,12,1,2,3,4,5,6,7,8,9], 'year' : [2022]*3 + [2023]*9}
).assign(key = 1)

# One row per merchant that appears in the transaction history.
merchants = transactions_df[['merchant_abn']].drop_duplicates().assign(key = 1)

# Every merchant paired with every forecast month.
prediction_input_df = (
    merchants
    .merge(forecast_months, on = 'key')
    .drop(columns = 'key')
)

# 0/1 indicator columns for November and December.
prediction_input_df['november'] = prediction_input_df['month'].eq(11).astype('int64')
prediction_input_df['december'] = prediction_input_df['month'].eq(12).astype('int64')

# Placeholder target so the future rows share the label column of the history.
prediction_input_df['weighted_dollar_value'] = 0

# History first, rows to be predicted appended underneath.
model_input_df = pd.concat([transactions_df, prediction_input_df])

In [5]:
def regression_model(df,indexCols,categoricalCols,continuousCols, interactionCols, labelCol, regParam=0.3):
    """
    Creates a linear regression model for the specified input column types, interaction columns and label column,
    trains it on the historical rows of `df` and returns predictions for the future rows.

    Steps:
      1. String-index and one-hot encode every column in `categoricalCols`.
      2. Build one interaction vector per entry of `interactionCols`, plus a
         merchant_abn x month interaction that is always added.
      3. Assemble the encoded, interaction and continuous columns into `features`.
      4. Train on rows with year 2021, or year 2022 and month <= 9; score rows
         with year 2022 and month > 9, or year 2023.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Historical rows and rows to be predicted, stacked together. Needs the
        columns named in the other arguments plus `merchant_abn`, `month` and `year`.
    indexCols : list of str
        Identifier columns carried through to the output. The train/predict
        split filters on `year` and `month` after these columns are selected,
        so include both here.
    categoricalCols : list of str
        Columns to string-index and one-hot encode. Must contain 'merchant_abn',
        because the merchant_abn x month interaction reads its encoded column.
    continuousCols : list of str
        Columns appended to the feature vector as they are.
    interactionCols : list of list of str
        Each inner list names columns from `categoricalCols` whose encoded
        vectors are combined into one interaction feature.
    labelCol : str
        Column holding the regression target; it is copied to `label`.
    regParam : float, optional
        Regularisation parameter handed to LinearRegression. Defaults to 0.3.

    Returns
    -------
    pyspark.sql.DataFrame
        The rows to be predicted, with columns `indexCols`, `features`, `label`,
        `indexedFeatures` and `prediction`.
    """

    indexers = [
        StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c), handleInvalid = "skip")
        for c in categoricalCols
    ]

    encoders = [
        OneHotEncoder(inputCol=indexer.getOutputCol(),
                      outputCol="{0}_encoded".format(indexer.getOutputCol()))
        for indexer in indexers
    ]

    # The loop variable is called `name` so it does not reuse the name of the
    # `col` function imported at the top of the file.
    interactions = [
        Interaction(
            inputCols = [f'{name}_indexed_encoded' for name in interactionCol],
            outputCol = '_'.join(interactionCol)
        )
        for interactionCol in interactionCols
    ]

    # Always-on interaction between the encoded merchant and the raw month value.
    interactions.append(
        Interaction(
            inputCols = ['merchant_abn_indexed_encoded', 'month'],
            outputCol = 'merchant_abn_month'
        )
    )

    assembler = VectorAssembler(
        inputCols=[encoder.getOutputCol() for encoder in encoders]
        + [interaction.getOutputCol() for interaction in interactions]
        + continuousCols,
        outputCol="features"
    )

    preprocessing = Pipeline(stages=indexers + encoders + interactions + [assembler])

    # The preprocessing stages are fitted on every row, history and future alike.
    data = preprocessing.fit(df).transform(df)
    data = data.withColumn('label',F.col(labelCol))

    tf_sdf = data.select(indexCols +['features','label'])

    # `indexedFeatures` is kept so the returned frame has the same columns as
    # before. LinearRegression reads its default featuresCol ("features"), so
    # this column does not take part in the fit.
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(tf_sdf)
    tf_sdf = featureIndexer.transform(tf_sdf)

    training_sdf = tf_sdf.where(
        (F.col('year') == 2021)|((F.col('year') == 2022)&(F.col('month') <= 9))
    )

    new_sdf = tf_sdf.where(
        ((F.col('year') == 2022)&(F.col('month') > 9))|(F.col('year') == 2023)
    )

    # The indexer has already been applied above, so the regression is fitted
    # directly. Wrapping the fitted indexer in another Pipeline would only
    # recompute `indexedFeatures` for the training and the scoring rows.
    lr = LinearRegression(regParam=regParam)

    return lr.fit(training_sdf).transform(new_sdf)

In [6]:
# Column roles handed to regression_model.
labelCol = 'weighted_dollar_value'
indexCols = ['merchant_abn', 'year', 'month']
continuousCols = ['month']
categoricalCols = ['merchant_abn', 'november', 'december']
interactionCols = [
    ['merchant_abn', 'november'],
    ['merchant_abn', 'december'],
]

# Convert the stacked pandas frame to Spark, then fit and score in one call.
model_input_sdf = spark.createDataFrame(model_input_df)
predictions = regression_model(
    df = model_input_sdf,
    indexCols = indexCols,
    categoricalCols = categoricalCols,
    continuousCols = continuousCols,
    interactionCols = interactionCols,
    labelCol = labelCol,
)

                                                                                

22/10/10 00:36:35 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB




22/10/10 00:37:34 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB


                                                                                

22/10/10 00:37:37 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB




22/10/10 00:37:39 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB


                                                                                

22/10/10 00:37:40 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB


[Stage 13:>                                                         (0 + 8) / 8]

22/10/10 00:37:42 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/10/10 00:37:42 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/10/10 00:37:42 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:43 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS


                                                                                

22/10/10 00:37:43 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
22/10/10 00:37:43 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:43 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:43 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:44 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:44 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:44 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:44 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:44 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:45 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:45 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:45 WARN DAGScheduler:

                                                                                

22/10/10 00:37:48 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:48 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:49 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:49 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:49 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB




22/10/10 00:37:50 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:50 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB


                                                                                

22/10/10 00:37:50 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:51 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:51 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:51 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:51 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:52 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:52 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:52 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:52 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:52 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:52 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:53 WARN DAGScheduler: Broadcasting larg

                                                                                

22/10/10 00:37:56 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB


[Stage 57:>                                                         (0 + 7) / 8]

22/10/10 00:37:57 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:57 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB


                                                                                

22/10/10 00:37:58 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:58 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:58 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:58 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:58 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:59 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:59 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:59 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:37:59 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB


                                                                                

22/10/10 00:38:00 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:00 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:01 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:01 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:01 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:01 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:01 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:02 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:02 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:02 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:02 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:02 WARN DAGScheduler: Broadcasting larg

                                                                                

22/10/10 00:38:04 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:04 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:04 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:04 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:05 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:05 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:05 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:05 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:05 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:05 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:06 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/10 00:38:06 WARN DAGScheduler: Broadcasting larg



22/10/10 00:38:10 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB


                                                                                

In [7]:
# Keep only the identifier, label and prediction columns before collecting the
# Spark result into pandas.
collected_columns = ['merchant_abn', 'year', 'month', 'label', 'prediction']
predictions_df = predictions.select(*collected_columns).toPandas()

                                                                                

In [10]:
# First-of-month timestamp for every predicted row. pd.to_datetime assembles
# the dates from the year/month/day columns in one vectorised call instead of
# constructing a datetime object per row with DataFrame.apply.
predictions_df['order_datetime'] = pd.to_datetime(
    predictions_df[['year', 'month']].assign(day = 1)
)

# Give the predicted values the same column name as the historical values so
# that the two frames line up when stacked.
predictions_df = predictions_df.rename(columns = {'prediction' : 'weighted_dollar_value'})
predictions_df['type'] = 'prediction'

# Tag the history, then stack history and predictions into one long frame.
transactions_df['type'] = 'training'
model_output_df = pd.concat([transactions_df, predictions_df])

In [14]:
# Visual spot check: history and forecast for the merchants at positions 20-29
# of the unique ABN list.
for abn in predictions_df['merchant_abn'].unique()[20:30]:
    # Sort chronologically so the line is drawn left to right; the order of
    # rows collected from Spark is not guaranteed.
    merchant_series = (
        model_output_df
        .query(f'merchant_abn == {abn}')
        .sort_values('order_datetime')
    )
    px.line(
        merchant_series,
        x = 'order_datetime',
        y = 'weighted_dollar_value',
        color = 'type',
        # Title identifies the merchant; without it the figures are indistinguishable.
        title = f'Merchant {abn}: weighted monthly dollar value',
    ).show()

In [15]:
# Persist the predictions frame to the curated data folder (relative path).
# NOTE(review): at this point predictions_df also holds the `label`, `type` and
# `order_datetime` columns added or kept by the cells above — confirm that
# downstream readers expect them.
predictions_df.to_parquet('../data/curated/lr_transaction_predictions.parquet')