In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook (an already-running one is reused).
spark = (
    SparkSession.builder
    .appName('Linear Regression')
    .getOrCreate()
)

In [3]:
from pyspark.ml.regression import LinearRegression

In [4]:
# Load the customer CSV: the first line supplies the column names and the
# column types are inferred from the values.
data = spark.read.csv(
    'Ecommerce_Customers-Copy1.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Show the first record as a plain dict (column name -> value).
data.head(1)[0].asDict()

{'Email': 'mstephenson@fernandez.com',
 'Address': '835 Frank TunnelWrightmouth, MI 82180-9605',
 'Avatar': 'Violet',
 'Avg Session Length': 34.49726772511229,
 'Time on App': 12.65565114916675,
 'Time on Website': 39.57766801952616,
 'Length of Membership': 4.0826206329529615,
 'Yearly Amount Spent': 587.9510539684005}

In [6]:
# Print each column's name, inferred type and nullability.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [7]:
# NOTE(review): `clear` is not defined in any visible cell; the recorded output
# is a terminal clear-screen escape sequence, so this is presumably resolved by
# IPython to the shell's `clear`. It does not touch the data or the model and
# looks like leftover scratch work - consider deleting this cell.
clear()

[H[2J

In [8]:
# Print the schema of `data` (same call as an earlier cell; `data` has not been
# reassigned in between, so the output is identical).
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [9]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler,StringIndexer

In [10]:
# List the column names, to pick the inputs for the feature vector.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [11]:
# Combine the four numeric columns into a single vector column named 'features'.
numeric_feature_cols = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]
assembler = VectorAssembler(inputCols=numeric_feature_cols, outputCol='features')

In [12]:
# Apply the assembler: `output` keeps every column of `data` and gains the
# 'features' vector column.
output = assembler.transform(data)

In [13]:
# Peek at the first assembled row (returned as a one-element list of Row).
output.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [14]:
# Keep only the label column and the assembled feature vector.
final = output.select('Yearly Amount Spent', 'features')

In [15]:
# Preview the label / feature-vector pairs.
final.show()

+-------------------+--------------------+
|Yearly Amount Spent|            features|
+-------------------+--------------------+
|  587.9510539684005|[34.4972677251122...|
|  392.2049334443264|[31.9262720263601...|
| 487.54750486747207|[33.0009147556426...|
|  581.8523440352177|[34.3055566297555...|
|  599.4060920457634|[33.3306725236463...|
|   637.102447915074|[33.8710378793419...|
|  521.5721747578274|[32.0215955013870...|
|  549.9041461052942|[32.7391429383803...|
|  570.2004089636196|[33.9877728956856...|
|  427.1993848953282|[31.9365486184489...|
|  492.6060127179966|[33.9925727749537...|
|  522.3374046069357|[33.8793608248049...|
|  408.6403510726275|[29.5324289670579...|
|  573.4158673313865|[33.1903340437226...|
|  470.4527333009554|[32.3879758531538...|
|  461.7807421962299|[30.7377203726281...|
| 457.84769594494855|[32.1253868972878...|
| 407.70454754954415|[32.3388993230671...|
|  452.3156754800354|[32.1878120459321...|
|   605.061038804892|[32.6178560628234...|
+----------

In [21]:
# Randomly split the assembled rows into roughly 70% training / 30% test.
# The weights are proportions, not exact row counts.
# A fixed seed makes the split - and therefore the fitted model and every
# metric computed from it - repeatable across "Restart & Run All"; without it
# each run produced different RMSE / R2 values.
# NOTE: outputs recorded before this change came from an unseeded split and
# will not match exactly.
train_data,test_data = output.randomSplit([0.7,0.3], seed=42)

In [22]:
# Configure the estimator: which column holds the inputs, which holds the
# target, and what the prediction column will be called.
regressor = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
    predictionCol='predicted YAS',
)

In [23]:
# Fit the linear regression on the training split only.
model = regressor.fit(train_data)

In [24]:
# Score the fitted model on the held-out split; the returned summary exposes
# the metrics read in the following cells (rootMeanSquaredError, r2).
evaluation = model.evaluate(test_data)

In [26]:
# Root-mean-squared error of the predictions on test_data.
evaluation.rootMeanSquaredError

10.389511365416872

In [27]:
# Coefficient of determination (R^2) on test_data.
evaluation.r2

0.9834206374411426

In [30]:
# Drop everything except the feature vectors, then let the fitted model add
# its 'predicted YAS' column to those label-free rows.
unlabeled_test = test_data.select('features')
prediction = model.transform(unlabeled_test)

In [31]:
# Preview the feature vectors alongside the model's 'predicted YAS' values.
prediction.show()

+--------------------+------------------+
|            features|     predicted YAS|
+--------------------+------------------+
|[33.7051127975019...| 519.6453983945473|
|[32.8487928288471...| 411.2233654507427|
|[33.5030872567197...| 422.2219829462108|
|[32.8369407670213...|255.27313397572448|
|[32.8871046456153...| 683.9116949960389|
|[31.9549038566348...| 432.0063989228629|
|[34.1881840610182...| 584.1818539610333|
|[32.0637746203136...| 390.5686666227307|
|[32.0961089938451...| 375.7065626270446|
|[32.9597643110742...| 461.3276341732701|
|[32.6572685947781...| 520.1228895057668|
|[33.8117334062172...| 531.4224568938998|
|[32.0542618511847...| 557.3430031858884|
|[33.3568743447574...| 549.4832016494186|
|[33.4716005269517...| 522.2555677822008|
|[34.3366772231470...|414.30871805191464|
|[33.9140151173223...| 497.8545087719806|
|[33.7081534080930...| 608.9544959084074|
|[33.5873733902455...|  412.419325768049|
|[32.1755012379493...| 576.8075380442797|
+--------------------+------------