In [1]:
import os
import sys
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# PYSPARK_PYTHON / PYSPARK_DRIVER_PYTHON tell PySpark which Python executable
# to launch for workers and the driver; point both at this kernel's interpreter.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})




In [2]:
# Reuse the Spark session already active in this process, or start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Enable Arrow-based columnar transfer for Spark <-> pandas conversions.
spark.conf.set(
    "spark.sql.execution.arrow.pyspark.enabled",
    "true",
)

In [4]:
# Load the customer CSV: the first row supplies the column names, and column
# types are inferred from the data rather than all being read as strings.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Ecommerce_Customers.csv")
)

In [5]:
# Print the inferred schema to confirm the numeric columns were read as doubles.
df.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [6]:
# Peek at the first record as a Row to see what the raw values look like.
df.head()

Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)

In [7]:
# Same first record rendered as a table; show() truncates long string values.
df.show(1)

+--------------------+--------------------+------+------------------+-----------------+-----------------+--------------------+-------------------+
|               Email|             Address|Avatar|Avg Session Length|      Time on App|  Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+------+------------------+-----------------+-----------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|Violet| 34.49726772511229|12.65565114916675|39.57766801952616|  4.0826206329529615|  587.9510539684005|
+--------------------+--------------------+------+------------------+-----------------+-----------------+--------------------+-------------------+
only showing top 1 row



In [8]:
# List the column names; the four numeric behaviour columns are used as
# model inputs and 'Yearly Amount Spent' as the target in the cells below.
df.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [9]:
# Pack the four numeric predictors into one vector column named 'features',
# the column the regression estimator below reads its inputs from.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)

In [10]:
# Keep only the numeric predictors plus the regression target; the string
# columns (Email, Address, Avatar) are not used by the model.
df1 = df.select([
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
    'Yearly Amount Spent',
])

In [11]:
# Append the assembled 'features' vector column to every row.
output = assembler.transform(dataset=df1)

In [12]:
# Reduce the frame to what the estimator needs: the input vector and the label.
final = output.select(['features', 'Yearly Amount Spent'])

In [13]:
# Hold out roughly 30% of the rows for testing. randomSplit is stochastic, so
# pin the seed: without it every Restart & Run All produces a different split
# and therefore different statistics, metrics and coefficients downstream.
# NOTE: the outputs recorded in this notebook came from an unseeded run, so
# the exact numbers will differ after the first re-run with the seed.
train_data, test_data = final.randomSplit([0.7, 0.3], seed=42)

In [14]:
# Summary statistics of the training split; only the numeric label column is
# summarised (the 'features' vector column does not appear in the output).
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                350|
|   mean| 496.16176891550765|
| stddev|  80.01198088367885|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [15]:
# Same summary for the held-out split, to check that its label distribution
# is comparable to the training split's.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                150|
|   mean|  506.6693333924511|
| stddev|   77.4277950390187|
|    min|  275.9184206503857|
|    max|  689.2356997616951|
+-------+-------------------+



In [16]:
# Linear regression estimator: inputs are read from the 'features' column
# (the estimator's default, spelled out here) and the target is yearly spend.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

In [17]:
# Fit the regression on the training split only.
model = lr.fit(dataset=train_data)

In [18]:
# Evaluate the fitted model on the data it was trained on (in-sample fit).
results = model.evaluate(dataset=train_data)

In [19]:
# In-sample mean squared error taken from the training evaluation summary.
print(
    'Mean Squared Error:',
    results.meanSquaredError,
)

Mean Squared Error: 99.28493836501569


In [20]:
# R-squared is a goodness-of-fit score (1.0 means a perfect fit), not an error
# measure, so it is labelled as such instead of "Rsquared Error".
print('R-squared:', results.r2)

Rsquared Error: 0.9844469365803752


In [21]:
# Drop the label so the model is given inputs only.
unlabeled_data = test_data.select(['features'])

In [22]:
# Add a 'prediction' column to each held-out row, then preview the first 20
# rows (20 is show()'s default row count, written out explicitly).
predictions = model.transform(dataset=unlabeled_data)
predictions.show(n=20)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.8364326747734...|  471.859078330456|
|[30.8794843441274...| 494.2230903106399|
|[31.0472221394875...|388.62736461935356|
|[31.0662181616375...| 461.7791617654698|
|[31.1280900496166...| 564.4130940499624|
|[31.2834474760581...| 568.9744486279051|
|[31.3091926408918...| 429.9374478655541|
|[31.3895854806643...| 409.8545990065745|
|[31.4252268808548...| 534.7385438968865|
|[31.5171218025062...| 281.6828402211472|
|[31.5316044825729...|  433.917595331563|
|[31.5741380228732...| 558.3592559337765|
|[31.6005122003032...| 461.2319177534339|
|[31.6548096756927...|469.15579963835376|
|[31.7656188210424...|501.38258117469627|
|[31.8093003166791...| 547.2201696104312|
|[31.8164283341993...| 519.2311609273252|
|[31.8209982016720...|417.29563626060644|
|[31.8279790554652...| 449.7741676485025|
|[31.9048571310136...|491.12065462318765|
+--------------------+------------------+

In [23]:
# True labels for the held-out rows, shown for a side-by-side look against
# the predictions above.
actual = test_data.select(['features', 'Yearly Amount Spent'])
actual.show(n=20)

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[30.8364326747734...|  467.5019004269896|
|[30.8794843441274...|  490.2065999848547|
|[31.0472221394875...|  392.4973991890214|
|[31.0662181616375...| 448.93329320767435|
|[31.1280900496166...|  557.2526867470547|
|[31.2834474760581...|  591.7810894256675|
|[31.3091926408918...|  432.7207178399336|
|[31.3895854806643...|  410.0696110599829|
|[31.4252268808548...|  530.7667186547619|
|[31.5171218025062...|  275.9184206503857|
|[31.5316044825729...| 436.51560572936256|
|[31.5741380228732...|  544.4092721605869|
|[31.6005122003032...| 479.17285149109694|
|[31.6548096756927...|  475.2634237275485|
|[31.7656188210424...| 496.55408163560713|
|[31.8093003166791...|  536.7718993628412|
|[31.8164283341993...| 501.12249150365636|
|[31.8209982016720...| 424.67528101321335|
|[31.8279790554652...|  440.0027475469415|
|[31.9048571310136...| 473.94985742281614|
+--------------------+-------------------+

In [24]:
# Evaluate the fitted model on the held-out split (out-of-sample performance).
test_results = model.evaluate(dataset=test_data)

In [25]:
# Out-of-sample mean squared error taken from the test evaluation summary.
print(
    'Mean Squared Error:',
    test_results.meanSquaredError,
)

Mean Squared Error: 97.50205785142307


In [26]:
# R-squared is a goodness-of-fit score (1.0 means a perfect fit), not an error
# measure, so it is labelled as such instead of "Rsquared Error".
print('R-squared:', test_results.r2)

Rsquared Error: 0.9836271233848314


In [27]:
# Report the fitted weights (one per entry of the assembler's inputCols) and
# the intercept. `!s` forces str(), matching the original str() concatenation.
print(f"Coefficients: {model.coefficients!s}")
print(f"Intercept: {model.intercept!s}")

Coefficients: [25.6160494322268,38.73862767335686,0.5979802991602282,61.10893365915443]
Intercept: -1052.425466136453
