## Gradient Boosted Tree

In [1]:
%%time
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.mllib.util import MLUtils
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import col
from pyspark.sql.functions import log
from pyspark.ml.stat import Correlation


import pandas as pd
import numpy as np
import copy

spark = SparkSession.builder \
        .appName("project") \
        .getOrCreate()

sc = spark.sparkContext

CPU times: user 531 ms, sys: 407 ms, total: 939 ms
Wall time: 5.77 s


### Read in dataframes for train and test sets

This data should have been previously generated: we can find it in the `processed_data` folder.

In [2]:
%%time
trainDF = spark.read.parquet("./processed_data/train.parquet")
testDF = spark.read.parquet("./processed_data/test.parquet")
trainDF.show(5)

+---------+------------------+-----------------+------------+--------------+------------------+------------------+----------------------------+---------------------------+----------------------------+----------------------------+------------------------+------------------------+-------------------------+------------------------+-----------+---------------+-----------+----------------------+------------------+------------------+-------------------------+---------------------+--------------------+--------------------+--------------------+--------------------+
|  user_id|     T_total_spend|      total_spend|total_events|total_sessions|avg_session_length| sd_session_length|avg_interactions_per_session|sd_interactions_per_session|max_interactions_per_session|purchase_pct_of_total_events|view_pct_of_total_events|cart_pct_of_total_events|avg_purchases_per_session|sd_purchases_per_session|cart_events|purchase_events|view_events|sessions_with_purchase|sessions_with_cart|sessions_with_view|pct_s

In [3]:
## What the heck, we've used these columns in our previous models, might as well see how they do here. 

trainDF = trainDF \
          .withColumn("total_spend_log", log(col("total_spend")+0.001)) \
          .withColumn("total_events_log", log(col("total_events")+0.001)) \
          .withColumn("purchase_events_log", log(col("purchase_events")+0.001)) \
          .withColumn("total_sessions_log", log(col("total_sessions")+0.001)) \
          .withColumn("T_total_spend_log", log(col("T_total_spend")+0.001))

testDF = testDF \
          .withColumn("total_spend_log", log(col("total_spend")+0.001)) \
          .withColumn("total_events_log", log(col("total_events")+0.001)) \
          .withColumn("purchase_events_log", log(col("purchase_events")+0.001)) \
          .withColumn("total_sessions_log", log(col("total_sessions")+0.001)) \
          .withColumn("T_total_spend_log", log(col("T_total_spend")+0.001))

#trainDF.show(2)
#testDF.show(2)


### Set up Spark ML pipeline training for random forest

We create the function `generatePipeline(inputCols, outputCol)`, Then, we train the pipeline using this function. 

In [4]:
%%time

def generatePipeline(inputCols, outputCol):
    
    # Select input columns for random forest regression
    vecAssembler = VectorAssembler(inputCols=inputCols, outputCol="features")

    # Select output column for random forest regression
    gb = GBTRegressor(featuresCol="features", labelCol=outputCol, seed = 42)#, numTrees=5, maxDepth=5)
    
    pipeline = Pipeline(stages=[vecAssembler, gb])
    return pipeline



CPU times: user 2 µs, sys: 2 µs, total: 4 µs
Wall time: 7.15 µs


In [5]:
# Calculate adjusted r2 (https://towardsdatascience.com/machine-learning-linear-regression-using-pyspark-9d5d5c772b42)
def adj_r2(r2, inputCols, testDF):
    n = testDF.count()
    p = len(inputCols)
    adjusted_r2 = 1-(((1-r2)*(n-1))/(n-p-1))
    return adjusted_r2

In [6]:
def getEvaluationMetrics(pipelineMode,outputCol,testDF,inputCols):
    predDF = pipelineModel.transform(testDF)
    predDF.select(outputCol, "prediction").show(10)
    
    regressionEvaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol=outputCol,
    metricName="rmse")
    rmse = regressionEvaluator.evaluate(predDF)

    regressionEvaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol=outputCol,
    metricName="r2")
    r2 = regressionEvaluator.evaluate(predDF)
    
    # Manually calculate Adjusted r2
    adjusted_r2 = adj_r2(r2, inputCols, testDF)
    
    return rmse, r2, adjusted_r2



In [7]:
def modelInfo(inputCols, pipelineModel):
    modelCols = pipelineModel.stages[-2].getInputCols()
    
    feature_importance = pipelineModel.stages[-1].featureImportances
    
    return pd.DataFrame(list(zip(modelCols, feature_importance)), columns = ['Column name', 'Importance']).sort_values(by="Importance", ascending = False)

In [8]:
print("** All normal inputs, normal output **")
inputCols = ["total_spend","total_events","purchase_events", "total_sessions", "avg_session_length", "avg_interactions_per_session", "max_interactions_per_session",
             "purchase_pct_of_total_events", "view_pct_of_total_events", "cart_pct_of_total_events","avg_purchases_per_session", "cart_events", "purchase_events",
             "view_events", "sessions_with_purchase", "sessions_with_cart","sessions_with_view", "pct_sessions_end_purchase", "pct_sessions_end_cart", 'sd_session_length', 
             'sd_interactions_per_session', 'sd_purchases_per_session']

outputCol = "T_total_spend"

pipeline = generatePipeline(inputCols, outputCol)
pipelineModel = pipeline.fit(trainDF)

print(modelInfo(inputCols, pipelineModel))

evaluationMetrics = getEvaluationMetrics(pipelineModel,outputCol,testDF, inputCols)
print(f"RMSE is {evaluationMetrics[0]:.1f}")
print(f"R^2 is {evaluationMetrics[1]:.5f}")
print(f"Adjusted R^2 is {evaluationMetrics[2]:.5f}")

** All normal inputs, normal output **
                     Column name  Importance
1                   total_events    0.410232
0                    total_spend    0.165761
7   purchase_pct_of_total_events    0.095895
18         pct_sessions_end_cart    0.030591
8       view_pct_of_total_events    0.030071
14        sessions_with_purchase    0.028706
11                   cart_events    0.027229
4             avg_session_length    0.022188
2                purchase_events    0.021341
6   max_interactions_per_session    0.020568
13                   view_events    0.020451
15            sessions_with_cart    0.019382
5   avg_interactions_per_session    0.017602
19             sd_session_length    0.016792
21      sd_purchases_per_session    0.016539
10     avg_purchases_per_session    0.015044
9       cart_pct_of_total_events    0.014014
20   sd_interactions_per_session    0.012699
17     pct_sessions_end_purchase    0.006809
3                 total_sessions    0.004292
16            se

In [10]:
print("** All normal inputs + log inputs (25 total), normal output **")
inputCols = ["total_spend","total_events","purchase_events", "total_sessions", "avg_session_length", "avg_interactions_per_session", "max_interactions_per_session",
             "purchase_pct_of_total_events", "view_pct_of_total_events", "cart_pct_of_total_events","avg_purchases_per_session", "cart_events", 
             "view_events", "sessions_with_purchase", "sessions_with_cart","sessions_with_view", "pct_sessions_end_purchase", "pct_sessions_end_cart", 'sd_session_length', 
             'sd_interactions_per_session', 'sd_purchases_per_session', 'total_spend_log', 'total_events_log', 'purchase_events_log', 'total_sessions_log']

outputCol = "T_total_spend"

pipeline = generatePipeline(inputCols, outputCol)
pipelineModel = pipeline.fit(trainDF)

print(modelInfo(inputCols, pipelineModel))

evaluationMetrics = getEvaluationMetrics(pipelineModel,outputCol,testDF, inputCols)
print(f"RMSE is {evaluationMetrics[0]:.1f}")
print(f"R^2 is {evaluationMetrics[1]:.5f}")
print(f"Adjusted R^2 is {evaluationMetrics[2]:.5f}")

** All normal inputs + log inputs (26 total), normal output **
                     Column name  Importance
1                   total_events    0.398364
0                    total_spend    0.165765
7   purchase_pct_of_total_events    0.096189
8       view_pct_of_total_events    0.030199
13        sessions_with_purchase    0.028707
11                   cart_events    0.027228
17         pct_sessions_end_cart    0.023283
4             avg_session_length    0.022235
2                purchase_events    0.021341
9       cart_pct_of_total_events    0.021194
12                   view_events    0.020450
6   max_interactions_per_session    0.019676
14            sessions_with_cart    0.019614
5   avg_interactions_per_session    0.017555
18             sd_session_length    0.016560
20      sd_purchases_per_session    0.016539
10     avg_purchases_per_session    0.015037
19   sd_interactions_per_session    0.013591
22              total_events_log    0.011577
16     pct_sessions_end_purchase    0

#### How interesting! Three of the four log values wound up with importances of 0. How does a log output work for this one?

In [11]:
print("** All normal inputs + log inputs (25 total), log output **")
inputCols = ["total_spend","total_events","purchase_events", "total_sessions", "avg_session_length", "avg_interactions_per_session", "max_interactions_per_session",
             "purchase_pct_of_total_events", "view_pct_of_total_events", "cart_pct_of_total_events","avg_purchases_per_session", "cart_events", 
             "view_events", "sessions_with_purchase", "sessions_with_cart","sessions_with_view", "pct_sessions_end_purchase", "pct_sessions_end_cart", 'sd_session_length', 
             'sd_interactions_per_session', 'sd_purchases_per_session', 'total_spend_log', 'total_events_log', 'purchase_events_log', 'total_sessions_log']

outputCol = "T_total_spend_log"

pipeline = generatePipeline(inputCols, outputCol)
pipelineModel = pipeline.fit(trainDF)

print(modelInfo(inputCols, pipelineModel))

evaluationMetrics = getEvaluationMetrics(pipelineModel,outputCol,testDF, inputCols)
print(f"RMSE is {evaluationMetrics[0]:.1f}")
print(f"R^2 is {evaluationMetrics[1]:.5f}")
print(f"Adjusted R^2 is {evaluationMetrics[2]:.5f}")

** All normal inputs + log inputs (25 total), normal output **
                     Column name  Importance
1                   total_events    0.353249
12                   view_events    0.291121
0                    total_spend    0.161128
11                   cart_events    0.073254
22              total_events_log    0.030372
2                purchase_events    0.025939
8       view_pct_of_total_events    0.022107
7   purchase_pct_of_total_events    0.011118
4             avg_session_length    0.011114
9       cart_pct_of_total_events    0.004055
5   avg_interactions_per_session    0.003885
6   max_interactions_per_session    0.002780
10     avg_purchases_per_session    0.002257
15            sessions_with_view    0.001686
13        sessions_with_purchase    0.001406
20      sd_purchases_per_session    0.001217
3                 total_sessions    0.001124
14            sessions_with_cart    0.000892
18             sd_session_length    0.000713
19   sd_interactions_per_session    0

#### Dang! The log output actually worked significantly better this time. We'll use that for the remainder. Time for tuning...

In [72]:
print("** Smaller model, log output **")
inputCols = ["total_spend","total_events","cart_events", "view_events"]

outputCol = "T_total_spend_log"

pipeline = generatePipeline(inputCols, outputCol)
pipelineModel = pipeline.fit(trainDF)

print(modelInfo(inputCols, pipelineModel))

evaluationMetrics = getEvaluationMetrics(pipelineModel,outputCol,testDF, inputCols)
print(f"RMSE is {evaluationMetrics[0]:.1f}")
print(f"R^2 is {evaluationMetrics[1]:.5f}")
print(f"Adjusted R^2 is {evaluationMetrics[2]:.5f}")

** Smaller model, log output **
    Column name  Importance
1  total_events    0.431924
3   view_events    0.283415
0   total_spend    0.181242
2   cart_events    0.103418
+------------------+------------------+
| T_total_spend_log|        prediction|
+------------------+------------------+
|-6.907755278982137|-5.521857618476705|
|-6.907755278982137|-5.277740003874514|
|-6.907755278982137|-4.584288139916862|
|-6.907755278982137|-5.846964864405892|
|-6.907755278982137|-5.403275607729301|
|-6.907755278982137|-5.625976333857803|
|-6.907755278982137|-4.257682714735065|
|-6.907755278982137|-5.500642569113173|
|11.278695703691795|12.321049141058792|
|6.4970781070591626| 8.320510336325208|
+------------------+------------------+
only showing top 10 rows

RMSE is 4.5
R^2 is 0.60170
Adjusted R^2 is 0.60167


#### The champion model with the best adjusted r-square (.60167) has only 4 input columns, and uses the log output. But is it using the best hyperparameters? 

In [84]:
%%time
inputCols = ["total_spend","total_events","cart_events", "view_events"]

outputCol = "T_total_spend_log"

pipeline = generatePipeline(inputCols, outputCol)
pipelineModel = pipeline.fit(trainDF)

vecAssembler = VectorAssembler(inputCols=inputCols, outputCol="features")

gb = GBTRegressor(featuresCol="features", labelCol=outputCol, seed = 42)

pipeline = Pipeline(stages=[vecAssembler, gb])

paramGrid = ParamGridBuilder() \
    .addGrid(gb.lossType, ["squared", "absolute"]) \
    .addGrid(gb.maxIter, [10, 20, 30]) \
    .addGrid(gb.maxDepth, [2, 5, 10]) \
    .addGrid(gb.stepSize, [.01, .05, .1, .2, .5]) \
    .addGrid(gb.featureSubsetStrategy, ['sqrt', 'log2', 'onethird']) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator().setLabelCol("T_total_spend"),
                          numFolds=4)

CPU times: user 23 ms, sys: 9.24 ms, total: 32.3 ms
Wall time: 7.72 s


In [None]:
%%time
cvModel = crossval.fit(trainDF)

In [None]:
print()
print(cvModel.getEstimatorParamMaps()[np.argmax(cvModel.avgMetrics)])

evaluationMetrics = getEvaluationMetrics(cvModel.bestModel,"T_total_spend",testDF,inputCols)
print(f"RMSE is {evaluationMetrics[0]:.1f}")
print(f"R^2 is {evaluationMetrics[1]:.5f}")
print(f"Adjusted R^2 is {evaluationMetrics[2]:.5f}")