In [1]:
# findspark.init() is called before the pyspark imports below; per the
# findspark documentation it locates the Spark installation and adds
# pyspark to sys.path so those imports can resolve.
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [2]:
# Reuse the active SparkSession if one exists, otherwise start a new one
# registered under the application name "Linear regression".
spark = (
    SparkSession.builder
    .appName("Linear regression")
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

In [7]:
# Load the customer CSV: the first row supplies the column names and the
# column types are inferred from the data.
ecomm_df = spark.read.csv(
    './dataset/EcommerceCustomers.csv',
    header=True,
    inferSchema=True,
)

In [8]:
# Inspect the column names and the types produced by inferSchema.
ecomm_df.printSchema()

root
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [9]:
# Preview the first five rows of the raw data.
ecomm_df.show(5)

+------------------+-----------+---------------+--------------------+-------------------+
|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+------------------+-----------+---------------+--------------------+-------------------+
|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|       33.33067252|12.79518855|     37.5366533|         4.446308318|         599.406092|
+------------------+-----------+---------------+--------------------+-------------------+
only showing top 5 rows



In [12]:
# Every column except the ones listed in `ignore` (the regression target)
# is packed into a single vector column named 'features'.
ignore = ['Yearly Amount Spent']
feature_cols = [name for name in ecomm_df.columns if name not in ignore]

assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
assembler_df = assembler.transform(ecomm_df)

In [13]:
# Preview the first five rows, now including the assembled 'features' column.
assembler_df.show(5)

+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|            features|
+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|[34.49726773,12.6...|
|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|[31.92627203,11.1...|
|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|[33.00091476,11.3...|
|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|[34.30555663,13.7...|
|       33.33067252|12.79518855|     37.5366533|         4.446308318|         599.406092|[33.33067252,12.7...|
+------------------+-----------+---------------+--------------------+-------------------+--------------------+
only showing top 5 rows

In [14]:
# Check the schema after assembly: the original columns plus 'features'.
assembler_df.printSchema()

root
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [15]:
# Reduce the frame to what the regressor needs: the target renamed to
# 'label', with the raw input columns removed because they are already
# packed into 'features'.
raw_feature_cols = (
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
)
label_df = (
    assembler_df
    .withColumnRenamed('Yearly Amount Spent', 'label')
    .drop(*raw_feature_cols)
)

In [16]:
# Preview the first five rows of the (label, features) frame.
label_df.show(5)

+-----------+--------------------+
|      label|            features|
+-----------+--------------------+
| 587.951054|[34.49726773,12.6...|
|392.2049334|[31.92627203,11.1...|
|487.5475049|[33.00091476,11.3...|
| 581.852344|[34.30555663,13.7...|
| 599.406092|[33.33067252,12.7...|
+-----------+--------------------+
only showing top 5 rows



In [17]:
# Confirm that only the 'label' and 'features' columns remain.
label_df.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [18]:
# Configure a linear regression with elastic-net regularisation and fit it
# on the complete label/features frame (no hold-out split is made here).
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lr_model = lr.fit(label_df)

# Report the fitted parameters.
coefficients = lr_model.coefficients
intercept = lr_model.intercept
print('Coefficients: ' + str(coefficients))
print('Intercept: ' + str(intercept))

Coefficients: [25.474157403886057,38.45929679241899,0.19719156059291088,61.3017744304752]
Intercept: -1030.1338971065438


In [19]:
# Summary attached to the fitted model. The model was fitted on all of
# label_df, so these are in-sample metrics, not a held-out evaluation.
lr_model_summary = lr_model.summary
rmse = lr_model_summary.rootMeanSquaredError
r_squared = lr_model_summary.r2
print('RMSE: %f' % rmse)
print('R-squared: %f' % r_squared)

RMSE: 9.936872
R-squared: 0.984272


In [20]:
# Display the first rows of the residuals; the PySpark docs define them as
# label minus predicted value.
lr_model_summary.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -5.507591981505698|
| 11.122757414957675|
|-17.677299844969582|
| 12.001539220539257|
|  8.406736193229563|
|-1.7071851050116038|
|   4.40754316973505|
| -8.210234143973935|
| 11.630460959496531|
|-14.248859785107982|
|-15.728731620118253|
|  8.940784765675176|
|   9.46480495209073|
|  12.31025724421579|
|   9.69839341574982|
|  9.919111699675284|
|   19.2070402531877|
| -3.955725156662538|
| -3.961465670905909|
|  9.754952329374873|
+-------------------+
only showing top 20 rows

