In [0]:
# Display the active SparkSession handle. `spark` is not created in any visible
# cell, so it is presumably injected by the notebook runtime — confirm.
spark

In [0]:
from pyspark.sql import *

In [0]:
# Explicit DDL schema passed to the CSV reader: two string columns followed by
# five double columns, in this order.
# Built with implicit string concatenation inside parentheses, so the source
# indentation is not embedded in the schema string and a stray space after a
# line-continuation backslash cannot break the literal.
schema = (
    "email string, "
    "avatar string, "
    "avg_session_length double, "
    "time_on_app double, "
    "time_on_website double, "
    "length_of_membership double, "
    "yearly_amount_spent double"
)

In [0]:
# Load the e-commerce CSV using the explicit `schema` (no type inference);
# the first line of the file is treated as a header row.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("/FileStore/tables/ecommerce/ecommerce.csv")
)
                    

In [0]:
# Number of rows loaded from the CSV.
df.count()

Out[10]: 500

In [0]:
# Print summary statistics (count, mean, stddev, min, max) for every column.
df.describe().show()

+-------+-----------------+-----------+------------------+------------------+------------------+--------------------+-------------------+
|summary|            email|     avatar|avg_session_length|       time_on_app|   time_on_website|length_of_membership|yearly_amount_spent|
+-------+-----------------+-----------+------------------+------------------+------------------+--------------------+-------------------+
|  count|              500|        500|               500|               500|               500|                 500|                500|
|   mean|             null|       null|    33.05319351824|12.052487936928012|37.060445421080004|  3.5334615559298004|  499.3140382608002|
| stddev|             null|       null|0.9925631111602911|0.9942156084624618|1.0104889068105993|  0.9992775024372845|  79.31478155115914|
|    min|aaron04@yahoo.com|  AliceBlue|       29.53242897|       8.508152176|       33.91384725|          0.26990109|        256.6705823|
|    max|zscott@wright.com|YellowG

In [0]:
# Names of the string columns that are index-encoded before modelling.
# NOTE(review): no later visible cell reads this list — the StringIndexer
# cell spells the column names out directly. Confirm whether it is still needed.
stringIndexerCol = ["email", "avatar"]

In [0]:
from pyspark.ml.feature import StringIndexer
# Encode the `email` string column as a numeric `emailIndex` column.
# The indexer is fitted on the full DataFrame, i.e. before the train/test split.
email_indexer = StringIndexer(inputCol="email", outputCol="emailIndex")
email_indexer_model = email_indexer.fit(df)
df1 = email_indexer_model.transform(df)

# Same treatment for `avatar`, applied on top of the email-indexed frame so
# that `df2` carries both index columns.
avatar_index = StringIndexer(inputCol="avatar", outputCol="avatarIndex")
avatar_index_model = avatar_index.fit(df1)
df2 = avatar_index_model.transform(df1)

# Preview the result with the two new index columns appended.
df2.show()

+--------------------+----------------+------------------+-----------+---------------+--------------------+-------------------+----------+-----------+
|               email|          avatar|avg_session_length|time_on_app|time_on_website|length_of_membership|yearly_amount_spent|emailIndex|avatarIndex|
+--------------------+----------------+------------------+-----------+---------------+--------------------+-------------------+----------+-----------+
|mstephenson@ferna...|          Violet|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|     342.0|       96.0|
|   hduke@hotmail.com|       DarkGreen|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|     190.0|       26.0|
|    pallen@yahoo.com|          Bisque|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|     355.0|        6.0|
|riverarebecca@gma...|     SaddleBrown|       34.30555663|13.71751367|    36.72128268|        

In [0]:
# Preview only the first two rows; long cell values are truncated in the output.
df2.show(2, truncate=True)

+--------------------+---------+------------------+-----------+---------------+--------------------+-------------------+----------+-----------+
|               email|   avatar|avg_session_length|time_on_app|time_on_website|length_of_membership|yearly_amount_spent|emailIndex|avatarIndex|
+--------------------+---------+------------------+-----------+---------------+--------------------+-------------------+----------+-----------+
|mstephenson@ferna...|   Violet|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|     342.0|       96.0|
|   hduke@hotmail.com|DarkGreen|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|     190.0|       26.0|
+--------------------+---------+------------------+-----------+---------------+--------------------+-------------------+----------+-----------+
only showing top 2 rows



In [0]:
# Randomly split the indexed data into two DataFrames weighted 0.7 / 0.3
# (training / testing). The split is random per row, so the resulting sizes
# are approximate rather than exact.
# A fixed seed keeps the split — and therefore every downstream metric —
# repeatable across re-runs, assuming the input data and its partitioning
# are unchanged.
SPLIT_SEED = 42
split_df = df2.randomSplit([0.7, 0.3], seed=SPLIT_SEED)


347
153


In [0]:
# Unpack the two splits: the first (weight 0.7) is the training set,
# the second (weight 0.3) is the testing set.
train_df, test_df = split_df

# Report how many rows ended up on each side of the split.
print(f"Training rows: {train_df.count()} and Testing rows: {test_df.count()}")

Training rows: 347 and Testing rows: 153


In [0]:
from pyspark.ml.feature import *
from pyspark.sql.functions import *

# Input columns packed into the single `features` vector column.
# NOTE(review): emailIndex and avatarIndex are StringIndexer ordinals of string
# columns; feeding them to a linear model as numeric predictors looks
# questionable — confirm they are intended to be features.
feature_columns = [
    "emailIndex",
    "avatarIndex",
    "avg_session_length",
    "time_on_app",
    "time_on_website",
    "length_of_membership",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Keep only what training needs: the feature vector and the target column,
# renamed from `yearly_amount_spent` to `label`.
training_df = (
    assembler
    .transform(train_df)
    .select("features", col("yearly_amount_spent").alias("label"))
)
training_df.show()

+--------------------+-----------+
|            features|      label|
+--------------------+-----------+
|[0.0,68.0,33.7051...|521.2407802|
|[1.0,12.0,32.4495...|503.9783791|
|[2.0,101.0,33.452...|576.4776072|
|[6.0,17.0,32.8487...|404.8245289|
|[7.0,65.0,32.6027...|482.1449969|
|[8.0,133.0,32.291...|494.5518611|
|[9.0,99.0,33.5030...|419.9387748|
|[11.0,71.0,32.693...|501.9282649|
|[12.0,36.0,32.836...|256.6705823|
|[14.0,48.0,32.887...| 684.163431|
|[15.0,105.0,33.63...|  497.81193|
|[17.0,49.0,33.871...|637.1024479|
|[18.0,24.0,34.188...| 583.977802|
|[19.0,48.0,32.063...|378.3309069|
|[20.0,0.0,32.0961...|375.3984554|
|[22.0,76.0,32.959...| 448.340425|
|[23.0,19.0,33.992...|492.6060127|
|[24.0,44.0,34.501...| 584.105885|
|[25.0,89.0,32.657...|516.8315567|
|[26.0,53.0,32.133...|443.4418601|
+--------------------+-----------+
only showing top 20 rows



In [0]:
from pyspark.ml.regression import *

# Configure a linear regression estimator that reads the `features` vector and
# the `label` target, with an iteration cap of 10 and regularization 0.3.
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.3,
)

# Fit on the assembled training data; the result is the trained model.
model = lr.fit(training_df)

print(model)

LinearRegressionModel: uid=LinearRegression_35d27579c3c6, numFeatures=6


In [0]:
# T-statistics from the training summary of the fitted model.
# NOTE(review): the recorded output lists 7 values for a 6-feature model; the
# extra trailing value is presumably the intercept — confirm against the
# Spark documentation.
model.summary.tValues

Out[67]: [0.6880965531459844,
 0.862715903741891,
 46.64447395044461,
 69.46781860660127,
 1.4463291907903089,
 112.6218629242398,
 -38.21732693028442]

In [0]:
# Build the evaluation frame with the same assembler used for training:
# the feature vector plus the actual target, renamed to `true_label`.
testing_df = (
    assembler
    .transform(test_df)
    .select("features", col("yearly_amount_spent").alias("true_label"))
)
testing_df.show()

+--------------------+-----------+
|            features| true_label|
+--------------------+-----------+
|[3.0,18.0,31.4474...|418.6027421|
|[4.0,21.0,32.4256...|420.7376732|
|[5.0,69.0,33.5477...|476.1914133|
|[10.0,1.0,32.1878...|452.3156755|
|[13.0,0.0,31.9673...|445.7498412|
|[16.0,72.0,31.954...|439.9978799|
|[21.0,2.0,33.9252...| 483.673308|
|[27.0,58.0,34.606...|402.1671222|
|[30.0,34.0,33.700...|492.5568337|
|[31.0,26.0,32.351...|532.9352188|
|[37.0,10.0,31.673...|475.7250679|
|[39.0,7.0,34.3307...|558.4272572|
|[41.0,22.0,33.811...|535.3216101|
|[42.0,115.0,32.04...|497.3895578|
|[46.0,100.0,33.35...|549.0082269|
|[50.0,25.0,33.471...|515.8288149|
|[56.0,24.0,31.936...|427.1993849|
|[62.0,63.0,33.900...|442.6672517|
|[63.0,28.0,34.695...|510.4013885|
|[65.0,27.0,32.175...|588.7126055|
+--------------------+-----------+
only showing top 20 rows



In [0]:
# Score the held-out rows with the fitted model; the result keeps `features`
# and `true_label` and gains a `prediction` column.
prediction = model.transform(testing_df)

In [0]:
# Print the first rows of predictions next to the true labels for a quick visual check.
prediction.show()

+--------------------+-----------+------------------+
|            features| true_label|        prediction|
+--------------------+-----------+------------------+
|[3.0,18.0,31.4474...|418.6027421| 424.6516713734743|
|[4.0,21.0,32.4256...|420.7376732| 401.3139222580028|
|[5.0,69.0,33.5477...|476.1914133|480.97626022798204|
|[10.0,1.0,32.1878...|452.3156755|457.32446522006694|
|[13.0,0.0,31.9673...|445.7498412| 449.5065740875873|
|[16.0,72.0,31.954...|439.9978799|431.23061812563196|
|[21.0,2.0,33.9252...| 483.673308| 492.6302668467058|
|[27.0,58.0,34.606...|402.1671222|424.11410711909457|
|[30.0,34.0,33.700...|492.5568337|  500.209598389077|
|[31.0,26.0,32.351...|532.9352188| 526.7961241912469|
|[37.0,10.0,31.673...|475.7250679|500.85814749702695|
|[39.0,7.0,34.3307...|558.4272572| 557.2574712595822|
|[41.0,22.0,33.811...|535.3216101| 530.7181993688987|
|[42.0,115.0,32.04...|497.3895578| 480.7376487669892|
|[46.0,100.0,33.35...|549.0082269| 549.9360657079917|
|[50.0,25.0,33.471...|515.82

In [0]:
# Register the predictions as the temp view `ecommerce_prediction` so they can
# be queried by name from SQL (any existing view of that name is replaced).
prediction.createOrReplaceTempView("ecommerce_prediction")

In [0]:
%sql
-- Predicted vs. actual yearly spend, read from the `ecommerce_prediction` temp view.
select prediction, true_label from ecommerce_prediction;

prediction,true_label
424.6516713734743,418.6027421
401.3139222580028,420.7376732
480.9762602279821,476.1914133
457.3244652200669,452.3156755
449.5065740875873,445.7498412
431.230618125632,439.9978799
492.6302668467058,483.673308
424.11410711909457,402.1671222
500.209598389077,492.5568337
526.7961241912469,532.9352188


Output can only be rendered in Databricks

In [0]:
from pyspark.ml.evaluation import *

# Compare the `prediction` column against `true_label` using the RMSE metric.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="true_label",
    metricName="rmse",
)

# Evaluate on the scored test rows and report the resulting error.
rmse = evaluator.evaluate(prediction)
print(f"Root Mean Square Error: {rmse}")

Root Mean Square Error: 9.910798714615945
