In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

# Reuse the active Spark session when there is one, otherwise create it.
spark = SparkSession.builder.appName('lr_example').getOrCreate()

# Load the customers CSV: the first line is the header and column types are
# inferred from the data.
data = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("dbfs:/FileStore/shared_uploads/sejal@ibm.com/Ecommerce_Customers.csv")
)

In [0]:
# Print the schema (column names and types) of the loaded data.
data.printSchema()

In [0]:
# inspect a row of data
# Pair every column name with the matching value from the first row.
first_row = data.head(1)[0]
for col, item in zip(data.columns, first_row):
    print(col, ": ", item)


In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
# List the column names so the feature columns can be picked by name.
data.columns

In [0]:
# Pack the four numeric input columns into one vector column, 'features'.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)

In [0]:
# Run the assembler over `data`; the result carries the assembled 'features' column.
output = assembler.transform(data)

In [0]:
# Check the schema again after assembling the features.
output.printSchema()

In [0]:
# Preview only the assembled 'features' column.
output.select('features').show()

In [0]:
# what does our features vector look like?
# Look the column up by name instead of assuming it is the last field of the
# row, so this keeps working if more columns are added to `output` later.
output.head(1)[0]['features']

In [0]:
# Keep only the feature vector and the target column used for training.
final_data = output.select('features', 'Yearly Amount Spent')

In [0]:
# train test split
# 70% train / 30% test. A fixed seed keeps the split (and therefore the metrics
# computed from it) the same across re-runs on the same input data.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Summary statistics for the training split.
train_data.describe().show()

In [0]:
# Summary statistics for the test split, for comparison with the training split.
test_data.describe().show()

In [0]:
# Linear regression estimator: inputs come from the assembled 'features'
# column (the estimator's default), target is the yearly spend column.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

In [0]:
# Fit the linear regression on the training split.
lr_model = lr.fit(train_data)

In [0]:
# Evaluate the fitted model on the held-out test split; the returned summary
# is inspected in the following cells.
test_results = lr_model.evaluate(test_data)

In [0]:
# Show the residuals from the test-set evaluation.
test_results.residuals.show()

In [0]:
# Root mean squared error on the test split.
test_results.rootMeanSquaredError

In [0]:
# R-squared on the test split.
test_results.r2

In [0]:
# Drop the target column to mimic new, unlabeled rows: only 'features' is kept.
unlabeled_data = test_data.select('features')
unlabeled_data.show()

In [0]:
# Score the feature-only rows with the fitted model and display the result.
predictions = lr_model.transform(unlabeled_data)
predictions.show()