In [1]:
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('lr_ecomm')
    .getOrCreate()
)

In [4]:
from pyspark.ml.regression import LinearRegression

In [9]:
# Load the customer dataset: the first row supplies the column names and
# the column types are inferred from the values.
data = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [10]:
# Print the column names and the types Spark inferred while reading the CSV.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [11]:
# Goal: predict each customer's yearly amount spent on the service or the site
#  from their usage metrics (session length, time on app/website, length of membership)

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler



In [14]:
# Only the numeric columns are used as model inputs; the string columns
# (Email, Address, Avatar) are left out of the feature vector.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]

# The assembler packs the listed input columns into one vector column
# named 'features'.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# transform() keeps the original columns and appends the 'features' vector column.
output = assembler.transform(data)
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [15]:
# Preview the assembled feature vectors (show() prints the top 20 rows, truncated).
output.select('features').show()

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



In [16]:
# Keep just what the model needs: the feature vector and the label column.
final_data = output.select(
    'features',
    'Yearly Amount Spent',
)

In [17]:
# Preview the modelling frame: feature vector alongside the label.
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+--------------------+-------------------+

In [18]:
# Split into roughly 70% training and 30% test rows.
# A fixed seed keeps the partition (and therefore every metric computed from
# it) reproducible across kernel restarts; without it each run draws a
# different train/test split.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [19]:
# Summary statistics (count, mean, stddev, min, max) for the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                349|
|   mean| 497.46432462727466|
| stddev|  77.02310424923327|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [20]:
# Summary statistics (count, mean, stddev, min, max) for the test split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                151|
|   mean| 503.58920420116937|
| stddev|  84.48440276236391|
|    min|   266.086340948469|
|    max|  725.5848140556806|
+-------+-------------------+



In [22]:
# Configure the estimator: the column holding the feature vectors, the
# label column to predict, and the column predictions are written to.
lr_ship = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
    predictionCol='prediction',
)

# Fit the regression on the training split only.
trained_ship_model = lr_ship.fit(train_data)

In [23]:
# Evaluate the fitted model on the held-out test split and keep the resulting summary.
test_ship_results = trained_ship_model.evaluate(test_data)

In [25]:
# Residuals: the difference between the actual label and the predicted value
# for each test row.
# The evaluation summary is stored in `test_ship_results`; no `test_results`
# variable is defined above, so the old name fails on a fresh kernel.
test_ship_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|-13.965412546968878|
|-19.000592293491195|
| -5.775124470128901|
|  5.113060509931529|
| 3.5176016124036096|
| -7.576197033889287|
|-14.434246668145022|
| 21.012364136653787|
| -4.507949428547363|
| -6.769404598573715|
| -18.36732762203883|
|-27.214872664397376|
|-3.2523536634548122|
| -4.965314319066522|
|-12.326405808802292|
| -5.798713133040565|
| -7.298758806807825|
|-10.135611307428178|
| -5.751445427730573|
| -9.877158773540714|
+-------------------+
only showing top 20 rows



In [29]:
# Evaluation metrics for the linear regression model on the test split.

# RMSE is the square root of the mean squared difference between the actual
# and predicted values, expressed in the same units as 'Yearly Amount Spent'.
# It measures the typical prediction error; it is NOT the average amount spent.
# Uses `test_ship_results`, the summary returned by evaluate() above.
test_ship_results.rootMeanSquaredError

9.617817534960025

In [28]:
# R-squared on the test split: the share of the label's variance explained by the model.
# Uses `test_ship_results`, the summary returned by evaluate() above.
test_ship_results.r2

0.9869537505319181

#### The r-squared value tells us that the linear regression model explains about 98.7% of the variance in Yearly Amount Spent on the test data, which is a very good fit.

#### Compare the evaluation metrics with the final data

In [30]:
# Summary statistics of the label over the full dataset, to give the error metrics a scale.
final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



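#### Now, compare the root mean squared error to the scale of the value you're actually trying to predict, which in this case is Yearly Amount Spent: an RMSE of about 9.6 is small next to a mean of about 499 and a standard deviation of about 79.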

### Now, we will apply the trained model to unlabeled data that contains only the features.

In [31]:
# Keep only the feature vectors from the test split, dropping the label column.
unlabeled_data = test_data.select('features')

In [32]:
# Score the unlabeled rows with the fitted model. The fitted model is
# `trained_ship_model`; no `lr_model` variable is defined above, so the old
# name fails on a fresh kernel.
predictions = trained_ship_model.transform(unlabeled_data)

# predicts the Yearly Amount Spent from the features of each customer
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.3931845423455...| 333.8942823501625|
|[30.8162006488763...| 285.0869332419602|
|[30.8364326747734...| 473.2770248971185|
|[30.9716756438877...| 489.5255492469612|
|[31.0472221394875...| 388.9797975766178|
|[31.0613251567161...| 495.1316550917909|
|[31.0662181616375...| 463.3675398758194|
|[31.2834474760581...| 570.7687252890137|
|[31.4252268808548...| 535.2746680833093|
|[31.5257524169682...| 450.7350314084556|
|[31.5702008293202...| 564.3128197634437|
|[31.6739155032749...| 502.9399405742786|
|[31.7216523605090...|351.02928029532745|
|[31.7656188210424...|501.51939595467366|
|[31.8093003166791...| 549.0983051716435|
|[31.8124825597242...| 398.6090581168378|
|[31.8745516945853...| 399.5840030530753|
|[31.8854062999117...| 400.2388842799037|
|[31.9453957483445...| 662.7713693653825|
|[32.0085045178551...| 453.0743798022961|
+--------------------+------------------+

## We can conclude that the linear regression model fits this data very well, as the r2 value on the test data is about 0.987 (roughly 98.7%).