In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('lr_example').getOrCreate()

import warnings
warnings.filterwarnings("ignore")

In [2]:
from pyspark.ml.regression import LinearRegression

In [3]:
# Use Spark to read in the Ecommerce Customers csv file.
data = spark.read.csv("Ecommerce_Customers.csv",inferSchema=True,header=True)

In [4]:
# Print the Schema of the DataFrame
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [5]:
data.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [6]:
for item in data.head():
    print(item)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [8]:
# A few things we need to do before Spark can accept the data!
# It needs to be in the form of two columns
# ("label","features")

# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [9]:
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [11]:
assemblr = VectorAssembler(inputCols=['Avg Session Length','Time on App','Time on Website','Length of Membership'],
                          outputCol = 'features')

In [12]:
output = assemblr.transform(data)

In [14]:
output.select('features').show()

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



In [15]:
final_data = output.select('features','Yearly Amount Spent')

In [18]:
train, test = final_data.randomSplit([0.7,0.3])

In [17]:
lr = LinearRegression(featuresCol='features', labelCol='Yearly Amount Spent')

In [19]:
train.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                349|
|   mean|  500.5988918388564|
| stddev|  79.23225901257065|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [20]:
lrModel = lr.fit(train)

In [25]:
print("Cofficients:{} Intercept:{}".format(lrModel.coefficients, lrModel.intercept))

Cofficients:[25.379062979153808,38.58183344064345,0.19280857636798784,61.911960504925084] Intercept:-1030.7552220545654


In [27]:
modelEvaluation = lrModel.evaluate(test)
modelEvaluation.residuals.show()
print("RMSE:{}".format(modelEvaluation.rootMeanSquaredError))

+-------------------+
|          residuals|
+-------------------+
|-12.062296703108245|
| 10.024787934233132|
|  -16.7616938125185|
| -4.816255675246055|
| 5.7438208671003395|
| -7.938286285602771|
|-0.6383355482367961|
| -4.458508050960745|
| 3.5893069449049335|
|  3.309672074102423|
|0.38073032975057686|
| -8.224582332666046|
|-18.193915890708922|
|  17.85367194643254|
| 17.028160608990504|
| -4.006238820640704|
|-1.2473104180479595|
|-2.9041999444149837|
| 0.9947954644189849|
| 10.540998748085599|
+-------------------+
only showing top 20 rows

RMSE:9.603199660320447


In [28]:
unlabeled_data = test.select('features')
predictions = lrModel.transform(unlabeled_data)
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.3931845423455...|331.99116650630185|
|[30.7377203726281...|451.75595426199675|
|[30.8162006488763...| 282.8480347609875|
|[30.8364326747734...|472.31815610223566|
|[30.9716756438877...| 488.8947888897924|
|[31.1280900496166...| 565.1909730326574|
|[31.2606468698795...|421.96496680518817|
|[31.2681042107507...|427.92904122478467|
|[31.3584771924370...| 491.5866435045705|
|[31.3662121671876...| 427.2792104823825|
|[31.3895854806643...| 409.6888807302323|
|[31.4474464941278...|426.82732442789006|
|[31.5702008293202...| 564.1394080321138|
|[31.6005122003032...| 461.3191795446644|
|[31.6098395733896...|427.51738904211766|
|[31.6253601348306...| 380.3431395775649|
|[31.6610498227460...| 417.6056639979488|
|[31.8186165667690...|449.32287331455063|
|[31.8293464559211...|  384.157542523556|
|[31.9096268275227...| 552.9050369251536|
+--------------------+------------

In [29]:
print("MSE:{}".format(modelEvaluation.meanSquaredError))
print("RMSE:{}".format(modelEvaluation.rootMeanSquaredError))

MSE:92.22144371597876
RMSE:9.603199660320447


In [31]:
summary = lrModel.summary

In [32]:
summary.meanSquaredError

101.72869799302147