In [2]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StringType
import pandas as pd
import numpy as np
import geopandas as gpd
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Interaction
from pyspark.sql.functions import col
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor, FMRegressor
from pyspark.ml.feature import VectorIndexer
import random


In [3]:
# Spark session shared by every cell below. Settings are listed once and
# applied in order; values are passed through exactly as written here.
_spark_settings = {
    # Memory for the driver and executor JVMs.
    "spark.driver.memory": '8g',
    "spark.executor.memory": '8g',
    # Session time zone used by Spark SQL.
    "spark.sql.session.timeZone": "Etc/UTC",
    # Vectorized Parquet reader switched off.
    "spark.sql.parquet.enableVectorizedReader": "false",
    # Eager evaluation so a bare DataFrame expression renders in the notebook.
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
}

_session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for _setting, _setting_value in _spark_settings.items():
    _session_builder = _session_builder.config(_setting, _setting_value)

spark = _session_builder.getOrCreate()


22/10/09 22:03:01 WARN Utils: Your hostname, DESKTOP-3NQ3PQI resolves to a loopback address: 127.0.1.1; using 172.17.27.14 instead (on interface eth0)
22/10/09 22:03:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/09 22:03:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Spark DataFrame over the curated weighted-transactions Parquet data.
transactions_sdf = spark.read.format('parquet').load('../data/curated/weighted_transactions.parquet')


In [7]:
# Pandas copies of the curated tables: the two Parquet files are read in the
# order they are unpacked, then the segments CSV.
transactions_df, merchant_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../data/curated/weighted_transactions.parquet',
        '../data/curated/merchants.parquet',
    )
)
segments_df = pd.read_csv('../data/curated/segments.csv')

0     ['antique shops - sales', 'repairs', 'restorat...
1                          ['art dealers', 'galleries']
2                      ['artist supply', 'craft shops']
3                  ['bicycle shops - sales', 'service']
4                ['books', 'periodicals', 'newspapers']
5     ['cable', 'satellite', 'other pay television',...
6     ['computer programming', 'data processing', 'i...
7     ['computers', 'computer peripheral equipment',...
8           ['digital goods: books', 'movies', 'music']
9     ['equipment', 'tool', 'furniture', 'appliance ...
10    ['florists supplies', 'nursery stock', 'flowers']
11    ['furniture', 'home furnishings', 'equipment s...
12        ['gift', 'card', 'novelty', 'souvenir shops']
13                            ['health', 'beauty spas']
14                       ['hobby', 'toy', 'game shops']
15    ['jewelry', 'watch', 'clock', 'silverware shops']
16    ['lawn', 'garden supply outlets', 'including n...
17              ['motor vehicle supplies', 'new 

In [None]:
# Preview the first five rows. The table renders because the session was built
# with spark.sql.repl.eagerEval.enabled set to True.
transactions_sdf.limit(5)

In [None]:
def regression_model(df, indexCols, categoricalCols, continuousCols, interactionCols, labelCol,
                     splitCol="datetime", splitDate="2019-06-01", regParam=0.3):
    """
    Fit a regularised linear regression and return its predictions on the
    hold-out period.

    Categorical columns are string-indexed and one-hot encoded, the requested
    interaction terms are built from the encoded columns, and everything is
    assembled with the continuous columns into a single ``features`` vector.
    Rows with ``splitCol < splitDate`` train the model; rows with
    ``splitCol >= splitDate`` are scored and returned.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input data. Must contain every column named by the other arguments,
        including ``splitCol``.
    indexCols : list of str
        Identifier columns carried through to the output unchanged.
    categoricalCols : list of str
        Columns to string-index and one-hot encode. Values the indexer cannot
        handle are skipped (``handleInvalid="skip"``), i.e. those rows are dropped.
    continuousCols : list of str
        Numeric columns appended to the feature vector as-is.
    interactionCols : list of list of str
        Each inner list names categorical columns whose encoded vectors are
        multiplied into one interaction term. Every name must also appear in
        ``categoricalCols``.
    labelCol : str
        Column copied to ``label`` and used as the regression target.
    splitCol : str, default "datetime"
        Column compared against ``splitDate`` to separate train from test.
    splitDate : str, default "2019-06-01"
        First value of ``splitCol`` that belongs to the test set.
    regParam : float, default 0.3
        Regularisation strength passed to ``LinearRegression``.

    Returns
    -------
    pyspark.sql.DataFrame
        The test rows with ``indexCols``, ``features``, ``label``,
        ``indexedFeatures`` and the column added by the fitted regression.

    Raises
    ------
    ValueError
        If an interaction references a column that is not in ``categoricalCols``.

    Notes
    -----
    The indexers, encoders and the vector indexer are fitted on the whole of
    ``df`` (train and test rows together), as in the original implementation.
    """
    # Fail early with a readable message: an interaction term is built from the
    # "<name>_indexed_encoded" columns, which only exist for categorical columns.
    encodable = set(categoricalCols)
    for group in interactionCols:
        unknown = [name for name in group if name not in encodable]
        if unknown:
            raise ValueError(
                "interaction columns must also be listed in categoricalCols; "
                "missing: {0}".format(", ".join(unknown))
            )

    indexers = [
        StringIndexer(inputCol=name, outputCol="{0}_indexed".format(name), handleInvalid="skip")
        for name in categoricalCols
    ]

    encoders = [
        OneHotEncoder(inputCol=indexer.getOutputCol(),
                      outputCol="{0}_encoded".format(indexer.getOutputCol()))
        for indexer in indexers
    ]

    interactions = [
        Interaction(
            inputCols=[f'{name}_indexed_encoded' for name in group],
            outputCol='_'.join(group),
        )
        for group in interactionCols
    ]

    assembler = VectorAssembler(
        inputCols=[encoder.getOutputCol() for encoder in encoders]
        + [interaction.getOutputCol() for interaction in interactions]
        + list(continuousCols),
        outputCol="features",
    )

    feature_pipeline = Pipeline(stages=indexers + encoders + interactions + [assembler])
    data = feature_pipeline.fit(df).transform(df).withColumn('label', F.col(labelCol))

    # Carry the split column through the projection explicitly so the date
    # filter below never refers to a column that `select` has dropped. It is
    # removed again afterwards so the output columns match the original.
    keep_cols = list(indexCols) + ['features', 'label']
    carry_split = splitCol not in keep_cols
    tf_sdf = data.select(keep_cols + ([splitCol] if carry_split else []))

    # Fit and apply the vector indexer exactly once. It is kept only so the
    # returned frame still exposes `indexedFeatures`.
    # NOTE(review): the regression reads `features`, so `indexedFeatures` does
    # not influence the predictions - confirm whether the column is still needed.
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures").fit(tf_sdf)
    tf_sdf = featureIndexer.transform(tf_sdf)

    training_sdf = tf_sdf.where(F.col(splitCol) < splitDate)
    testing_sdf = tf_sdf.where(F.col(splitCol) >= splitDate)

    if carry_split:
        training_sdf = training_sdf.drop(splitCol)
        testing_sdf = testing_sdf.drop(splitCol)

    lr = LinearRegression(featuresCol="features", labelCol="label", regParam=regParam)

    return lr.fit(training_sdf).transform(testing_sdf)

In [None]:
# Weekly weighted dollar value per merchant: merchant and week-of-year enter as
# one-hot main effects plus their interaction; there are no continuous features.
predictions = regression_model(
    df=transactions_sdf,
    indexCols=['merchant_abn', 'week_idx'],
    categoricalCols=['merchant_abn', 'week_of_year'],
    continuousCols=[],
    interactionCols=[['merchant_abn', 'week_of_year']],
    labelCol='weighted_dollar_value',
)