In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook under the app name 'lr_example'.
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

In [3]:
from pyspark.ml.regression import LinearRegression

In [4]:
# Read the customer CSV: the first line is treated as the header row and
# column types are inferred from the values.
data = spark.read.csv(
    '/FileStore/tables/Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [5]:
data.show(4)

In [6]:
data.printSchema()

In [7]:
for item in data.head(2)[1]:
  print(item)

In [8]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [9]:
# List the DataFrame's column names.
data.columns

In [10]:
# Create an assembler object and tell it which columns to combine into the
# single 'features' vector column.
# 'Yearly Amount Spent' is the regression label (see labelCol on the
# LinearRegression estimator), so it must NOT appear in inputCols: feeding the
# target in as a feature leaks the answer into the model and makes the
# evaluation metrics meaningless.
assembler = VectorAssembler(inputCols=['Avg Session Length',
                                       'Time on App',
                                       'Time on Website',
                                       'Length of Membership'],
                            outputCol='features')

In [11]:
type(assembler)

In [12]:
output = assembler.transform(data)

In [13]:
type(output)

In [14]:
output.select('features').show(4)

In [15]:
# Note that 'features' is now a DenseVector holding the values of the columns listed in inputCols when the assembler object was created.
output.head(1)

In [16]:
final_data = output.select('features','Yearly Amount Spent')

In [17]:
final_data.show(4)

In [18]:
# 70/30 train/test split. A fixed seed is passed so the split -- and therefore
# the fitted model and every metric derived from it -- is repeatable from one
# run of the notebook to the next instead of changing on each execution.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [19]:
train_data.describe().show(4)

In [20]:
test_data.describe().show(4)

In [21]:
lr = LinearRegression(labelCol='Yearly Amount Spent')

In [22]:
lr_model = lr.fit(train_data)

In [23]:
test_results = lr_model.evaluate(test_data)

In [24]:
test_results.residuals.show(10)

In [25]:
test_results.rootMeanSquaredError

In [26]:
test_results.r2

In [27]:
# Summary statistics for final_data (the full, unsplit data set).
final_data.describe().show()

In [28]:
# Display the fitted model object.
lr_model

In [29]:
# Coefficients of the fitted model.
lr_model.coefficients

In [30]:
# Intercept of the fitted model.
lr_model.intercept

In [31]:
unlabeled_data = test_data.select('features')

In [32]:
unlabeled_data.show(4)

In [33]:
predictions = lr_model.transform(unlabeled_data)

In [34]:
predictions.show(4)