### File 07: GLM Regression with Tweedie

Our response data is very 0-heavy. We create a regression that accounts for this by using a Tweedie loss function. 

### Set up Spark session

We can specify more options in the SparkSession creator, but currently the options are at the default settings.

In [1]:
%%time
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import col
from pyspark.sql.functions import log
from pyspark.ml.stat import Correlation

import pandas as pd
import numpy as np
import copy

spark = SparkSession.builder \
        .appName("project") \
        .getOrCreate()

sc = spark.sparkContext

CPU times: user 407 ms, sys: 369 ms, total: 775 ms
Wall time: 4.66 s


### Read in dataframes for train and test sets

This data should have been previously generated: we can find it in the `processed_data` folder.

In [46]:
%%time
trainDF = spark.read.parquet("./processed_data/train.parquet")
testDF = spark.read.parquet("./processed_data/test.parquet")
trainDF.show(5)

+---------+-------------+------------------+------------+--------------+------------------+-----------------+----------------------------+---------------------------+----------------------------+----------------------------+------------------------+------------------------+-------------------------+------------------------+-----------+---------------+-----------+----------------------+------------------+------------------+-------------------------+---------------------+--------------------+
|  user_id|T_total_spend|       total_spend|total_events|total_sessions|avg_session_length|sd_session_length|avg_interactions_per_session|sd_interactions_per_session|max_interactions_per_session|purchase_pct_of_total_events|view_pct_of_total_events|cart_pct_of_total_events|avg_purchases_per_session|sd_purchases_per_session|cart_events|purchase_events|view_events|sessions_with_purchase|sessions_with_cart|sessions_with_view|pct_sessions_end_purchase|pct_sessions_end_cart|       pca_purchases|
+-------

In [47]:
trainDF = trainDF \
          .withColumn("total_spend_log", log(col("total_spend")+0.001)) \
          .withColumn("total_events_log", log(col("total_events")+0.001)) \
          .withColumn("purchase_events_log", log(col("purchase_events")+0.001)) \
          .withColumn("total_sessions_log", log(col("total_sessions")+0.001)) \
          .withColumn("T_total_spend_log", log(col("T_total_spend")+0.001)) \
          .withColumn("total_spend_pos", (col("total_spend")+0.001)) \
          .withColumn("total_events_pos", (col("total_events")+0.001)) \
          .withColumn("purchase_events_pos", (col("purchase_events")+0.001)) \
          .withColumn("total_sessions_pos", (col("total_sessions")+0.001)) \
          .withColumn("T_total_spend_pos", (col("T_total_spend")/10000+100))

testDF = testDF \
          .withColumn("total_spend_log", log(col("total_spend")+0.001)) \
          .withColumn("total_events_log", log(col("total_events")+0.001)) \
          .withColumn("purchase_events_log", log(col("purchase_events")+0.001)) \
          .withColumn("total_sessions_log", log(col("total_sessions")+0.001)) \
          .withColumn("T_total_spend_log", log(col("T_total_spend")+0.001)) \
          .withColumn("total_spend_pos", (col("total_spend")+0.001)) \
          .withColumn("total_events_pos", (col("total_events")+0.001)) \
          .withColumn("purchase_events_pos", (col("purchase_events")+0.001)) \
          .withColumn("total_sessions_pos", (col("total_sessions")+0.001)) \
          .withColumn("T_total_spend_pos", (col("T_total_spend")/10000+100))


### Set up Spark ML pipeline training for generalized linear regression

Here we decide which input columns should be used in order to create our training pipeline. To implement this step, we create the function `generatePipeline(inputCols, outputCol`). Then, we train the pipeline using this function.

In [4]:
%%time

inputCols = ["total_spend","total_events","purchase_events","total_sessions"]
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.regression.GeneralizedLinearRegression.html
# on choosing variable values: https://www.rdocumentation.org/packages/statmod/versions/1.4.36/topics/tweedie
# LinkPower: Supported variables:
# variancePower: Supported values 0 and [1, inf)

def generateGLRPipeline(inputCols, outputCol):
    # Select input columns for generalized linear regression
    vecAssembler = VectorAssembler(inputCols=inputCols, outputCol="features")
    
    # create glr instance & select output col
    glr = GeneralizedLinearRegression(featuresCol = "features", labelCol = outputCol, family = "tweedie")

    pipeline = Pipeline(stages=[vecAssembler, glr])
    return pipeline
    
pipeline = generateGLRPipeline(inputCols, "T_total_spend")
pipelineGLRModel = pipeline.fit(trainDF)

CPU times: user 11.3 ms, sys: 537 µs, total: 11.8 ms
Wall time: 1.41 s


In [5]:
def modelInfo(inputCols, pipelineGLRModel):
    # Create a zipped list containing the coefficients and the data
    modelCols = copy.deepcopy(inputCols)
    modelCoeffs = list(pipelineGLRModel.stages[-1].coefficients)
    modelCoeffs.insert(0,pipelineGLRModel.stages[-1].intercept)
    modelCols.insert(0,"intercept")
    modelZippedList = list(map(list, zip(modelCols, modelCoeffs)))

    # Create the pandas DataFrame
    modelDF = pd.DataFrame(modelZippedList, columns = ['Column name', 'Coefficient'])
    return modelDF

print("Model coefficients")
print(modelInfo(inputCols, pipelineGLRModel))


Model coefficients
       Column name  Coefficient
0        intercept  -947.418806
1      total_spend     3.052155
2     total_events   190.147287
3  purchase_events -1003.568934
4   total_sessions  -134.816314


In [6]:
def getEvaluationMetrics(pipelineModel,outputCol,testDF):
    predDF = pipelineModel.transform(testDF)
    predDF.select(outputCol, "prediction").show(10)

    regressionEvaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol=outputCol,
    metricName="rmse")
    rmse = regressionEvaluator.evaluate(predDF)

    regressionEvaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol=outputCol,
    metricName="r2")
    r2 = regressionEvaluator.evaluate(predDF)
    
    return rmse, r2

evaluationMetrics = getEvaluationMetrics(pipelineGLRModel,"T_total_spend",testDF)
print(f"RMSE is {evaluationMetrics[0]:.1f}")
print(f"R^2 is {evaluationMetrics[1]:.5f}")

+------------------+------------------+
|     T_total_spend|        prediction|
+------------------+------------------+
|               0.0|-253.2111381996458|
| 4938.120147705078| 5004.291378493605|
| 19596.92041015625| 4168.929223074163|
|1945.0799560546875|4024.0553435394086|
|               0.0| 3989.534344595465|
|               0.0| 640.4539007558453|
|               0.0|13551.776916856319|
|               0.0| 2203.059543324577|
|               0.0| 23894.47863125475|
|               0.0| 5520.036358128967|
+------------------+------------------+
only showing top 10 rows

RMSE is 99445.7
R^2 is 0.64077


##### Cross-evaluate tweedie variables (linkPower & varPower)

In [7]:
# For crossval to work, must define pipeline here
# Select input columns for generalized linear regression
vecAssembler = VectorAssembler(inputCols=inputCols, outputCol="features")

# create glr instance & select output col
glr = GeneralizedLinearRegression(featuresCol = "features", labelCol = "T_total_spend", family = "tweedie")

pipeline = Pipeline(stages=[vecAssembler, glr])

paramGrid = ParamGridBuilder() \
    .addGrid(glr.variancePower, [1]) \
    .addGrid(glr.linkPower, [2, 1.5]) \
    .build()

crossval = CrossValidator(estimator = pipeline,
                         estimatorParamMaps=paramGrid,
                         evaluator=RegressionEvaluator().setLabelCol("T_total_spend"),
                         numFolds=4)
# Run cross-validation, and choose best set of parameters.
cvModel = crossval.fit(trainDF)

Py4JJavaError: An error occurred while calling o356.fit.
: java.lang.AssertionError: assertion failed: Sum of weights cannot be zero.
	at scala.Predef$.assert(Predef.scala:223)
	at org.apache.spark.ml.optim.WeightedLeastSquares$Aggregator.validate(WeightedLeastSquares.scala:426)
	at org.apache.spark.ml.optim.WeightedLeastSquares.fit(WeightedLeastSquares.scala:108)
	at org.apache.spark.ml.optim.IterativelyReweightedLeastSquares.fit(IterativelyReweightedLeastSquares.scala:91)
	at org.apache.spark.ml.regression.GeneralizedLinearRegression.$anonfun$train$1(GeneralizedLinearRegression.scala:431)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.regression.GeneralizedLinearRegression.train(GeneralizedLinearRegression.scala:379)
	at org.apache.spark.ml.regression.GeneralizedLinearRegression.train(GeneralizedLinearRegression.scala:246)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:151)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:115)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [None]:
# Troubleshooting: https://stackoverflow.com/questions/49567921/pyspark-how-to-fit-a-glm-using-log-as-link-function-with-sum-of-weights-as-zero
# Sum of weights cannot be 0 issue
# Other suggestion is data is too long-tailed, try cutting it (treating more of spend as outlier)
#trainDF = trainDF.withColumn("T_total_spend_small", col("T_total_spend")/1000) # Per one person's advice, try making the response values smaller 

# code that produces "Sum of weights cannot be zero" is here: https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala


In [49]:
# Testing param values to identify errors
#def generateGLRPipelineTEST(inputCols, outputCol, variancePower):
    # Select input columns for generalized linear regression
#    vecAssembler = VectorAssembler(inputCols=inputCols, outputCol="features")
    
#    # create glr instance & select output col
#    glr = GeneralizedLinearRegression(featuresCol = "features", labelCol = outputCol, family = "tweedie", variancePower=variancePower)

#    pipeline = Pipeline(stages=[vecAssembler, glr])
#    return pipeline
#pipeline = generateGLRPipelineTEST(inputCols, "T_total_spend", 0)
#pipelineGLRModel = pipeline.fit(trainDF)


inputCols = ["total_spend_log", "total_events_log", "purchase_events_log", "total_sessions_log", "T_total_spend_pos"]


vecAssembler = VectorAssembler(inputCols=inputCols, outputCol="features")  
glr = GeneralizedLinearRegression(featuresCol = "features", labelCol = "T_total_spend_pos", family = "tweedie", linkPower=1.1)
pipeline = Pipeline(stages=[vecAssembler, glr])
pipelineGLRModel = pipeline.fit(trainDF)

print("Model coefficients")
print(modelInfo(inputCols, pipelineGLRModel))

evaluationMetrics = getEvaluationMetrics(pipelineGLRModel,"T_total_spend_pos",testDF)
print(f"RMSE is {evaluationMetrics[0]:.1f}")
print(f"R^2 is {evaluationMetrics[1]:.5f}")

Model coefficients
           Column name  Coefficient
0            intercept   -66.398428
1      total_spend_log    -0.897743
2     total_events_log    -2.622934
3  purchase_events_log    -0.107436
4   total_sessions_log     1.636337
5    T_total_spend_pos     2.354846
+------------------+------------------+
| T_total_spend_pos|        prediction|
+------------------+------------------+
|             100.0|100.34543772831304|
|100.49381201477051| 99.48523858797137|
|101.95969204101563|102.49486938178262|
|100.19450799560546|100.46987584117399|
|             100.0|100.23957936056897|
|             100.0|100.06726431152758|
|             100.0| 98.34860980706803|
|             100.0|100.66692895482925|
|             100.0|  98.6559712473686|
|             100.0| 99.32452642091772|
+------------------+------------------+
only showing top 10 rows

RMSE is 3.4
R^2 is 0.95899
