In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [2]:
# Obtain the Spark session for this notebook (an already-running session is
# reused instead of creating a second one).
spark = (
    SparkSession.builder
    .appName('linearr')
    .getOrCreate()
)

I will predict Yearly Amount Spent from Avg Session Length, Time on App, Time on Website, and Length of Membership using a Linear Regression model.

In [3]:
# Load the customer CSV: the first row supplies the column names and the
# column types are inferred from the values.
data = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Split the rows roughly 70% / 30% into training and test sets.
# A fixed seed makes the random split repeatable, so the fitted coefficients
# and evaluation metrics below do not change on every re-run of the notebook.
trainDF, testDF = data.randomSplit([0.7, 0.3], seed=42)

In [5]:
from pyspark.ml.feature import VectorAssembler


In [6]:
# Preview the first 3 rows to check the columns that were loaded.
# NOTE(review): the rendered output includes Email and Address values —
# confirm the dataset is synthetic before sharing the notebook with outputs.
data.show(3)

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
only showing top 3 rows



In [7]:
# Summary statistics (count, mean, stddev, min, max) for the four predictor
# columns used by the model.
data.describe(
    "Avg Session Length",
    "Time on App",
    "Time on Website",
    "Length of Membership",
).show(6)

+-------+------------------+------------------+------------------+--------------------+
|summary|Avg Session Length|       Time on App|   Time on Website|Length of Membership|
+-------+------------------+------------------+------------------+--------------------+
|  count|               500|               500|               500|                 500|
|   mean|    33.05319351824|12.052487936928012|37.060445421080004|  3.5334615559300007|
| stddev|0.9925631111602911|0.9942156084624618|1.0104889068105993|  0.9992775024367542|
|    min|       29.53242897|       8.508152176|       33.91384725|          0.26990109|
|    max|       36.13966249|       15.12699429|       40.00518164|         6.922689335|
+-------+------------------+------------------+------------------+--------------------+



In [8]:
# Combine the four numeric predictor columns into one vector column named
# "features", which is the input column the regression reads.
vec = VectorAssembler(
    inputCols=[
        "Avg Session Length",
        "Time on App",
        "Time on Website",
        "Length of Membership",
    ],
    outputCol="features",
)

# Add the "features" column to the training rows.
vectrain = vec.transform(trainDF)

In [9]:
# For more information about the parameters, see:
# https://spark.apache.org/docs/1.5.2/ml-linear-methods.html
# The estimator reads its inputs from "features" and learns to predict
# "Yearly Amount Spent".
lr = LinearRegression(
    featuresCol="features",
    labelCol="Yearly Amount Spent",
)

In [10]:
# Fit the linear regression on the assembled training rows.
lrmodel=lr.fit(vectrain)

In [11]:
# Inspect the fitted model parameters.
# "katsayı" is Turkish for "coefficient": katsayı_1 holds one weight per
# assembled input column, in the order given to the VectorAssembler.
katsayı_1=lrmodel.coefficients
# b is the intercept (bias) term of the fitted line.
b=lrmodel.intercept
# NOTE(review): despite the "Features=" label, this prints the coefficient
# vector followed by the intercept, not the feature values.
print("Features=",katsayı_1,b)

Features= [25.441099229567953,37.93266667507862,0.4630526022128153,61.30052161032559] -1032.861345404639


In [12]:
from pyspark.ml import Pipeline
# Chain feature assembly and regression into one pipeline so that the same
# two steps are applied to the training rows (fit) and the test rows
# (transform).
pipeline = Pipeline(stages=[vec, lr])
pipelinemodel = pipeline.fit(trainDF)

# Score the held-out rows; the result gains "features" and "prediction"
# columns next to the original ones.
testsonuc = pipelinemodel.transform(testDF)

# Compare actual and predicted spend for the first 10 test rows.
(
    testsonuc
    .select(
        "Time on App",
        "Time on Website",
        "features",
        "Yearly Amount Spent",
        "prediction",
    )
    .show(10)
)

+-----------+---------------+--------------------+-------------------+------------------+
|Time on App|Time on Website|            features|Yearly Amount Spent|        prediction|
+-----------+---------------+--------------------+-------------------+------------------+
|10.16317906|    37.76304108|[33.7051128,10.16...|        521.2407802| 520.5901086144186|
|13.45772494|    37.23880567|[32.44952156,13.4...|        503.9783791| 500.7310862684501|
|12.00591637|    36.53409567|[33.45229528,12.0...|        576.4776072| 579.3978264746054|
|11.44890154|    37.58019043|[32.42569728,11.4...|        420.7376732|402.35549443636455|
|10.73536292|    37.45837473|[33.54774794,10.7...|        476.1914133| 482.0263748363857|
|10.97316208|    36.60950715|[32.84879283,10.9...|        404.8245289| 412.0344747781169|
|10.25654903|    36.14390846|[32.83694077,10.2...|        256.6705823| 256.7392917606485|
|11.48158715|    39.24096484|[31.96732095,11.4...|        445.7498412| 450.6654329440057|
|12.387184

In [15]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root mean squared error of the test-set predictions, in the same units as
# "Yearly Amount Spent".
evaluator = RegressionEvaluator(
    labelCol="Yearly Amount Spent",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(testsonuc)
print("Root Mean Square Error (RMSE):", rmse)

# R squared on the same test-set predictions.
lr_evaluator = RegressionEvaluator(
    labelCol="Yearly Amount Spent",
    predictionCol="prediction",
    metricName="r2",
)
r2 = lr_evaluator.evaluate(testsonuc)
print("R Squared (R2) on test data = ", r2)
# The results look very good.

Root Mean Square Error (RMSE): 9.870628188825025
R Squared (R2) on test data =  0.986100570457219
