In [1]:
The objective is to explore a dataset of e-commerce customer data from a company's website and mobile app, and then see whether we can build a regression model that predicts each customer's yearly spend on the company's products.

In [2]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists; otherwise start one named
# 'lr_example'.
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

In [3]:
from pyspark.ml.regression import LinearRegression

In [4]:
# Load the Ecommerce Customers CSV. The first line is treated as the header
# row and column types are inferred from the file contents.
data = spark.read.csv(
    "/FileStore/tables/Ecommerce_Customers.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Print the schema of the DataFrame to check the column names and the types
# that inferSchema assigned when the CSV was read.
data.printSchema()

In [6]:
# Preview the leading rows of the raw customer table.
data.show()

In [7]:
# Peek at a single record to see what one customer's data looks like.
data.head()

In [8]:
# Print each field of that first record on its own line for readability.
for field_value in data.head():
    print(field_value)

In [9]:
## Setting Up the DataFrame for Machine Learning

In [10]:
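# Spark ML cannot train on the raw columns directly.
# The data must be reduced to two columns: a single vector column holding all
# input features ("features") and a label column holding the target ("label";
# this notebook instead keeps the target's original name and passes it via labelCol).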

# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [11]:
# List the column names so the feature columns below can be copied exactly.
data.columns

In [12]:
# Combine the four numeric predictors into a single vector column named
# "features".
assembler = VectorAssembler(
    inputCols=[
        "Avg Session Length",
        "Time on App",
        "Time on Website",
        'Length of Membership',
    ],
    outputCol="features",
)

In [13]:
# Apply the assembler to the raw data; the result carries the assembled
# "features" column configured above.
output = assembler.transform(data)

In [14]:
# Inspect only the assembled feature vectors.
output.select("features").show()

In [15]:
# Inspect the full transformed DataFrame.
output.show()

In [16]:
# Keep only what the model needs: the feature vector and the target column.
final_data = output.select(
    "features",
    'Yearly Amount Spent',
)

In [17]:
# Hold out roughly 30% of the rows for testing. Without an explicit seed,
# randomSplit produces a different split on every run, which changes the
# fitted coefficients and every metric reported below. Pinning the seed keeps
# the split repeatable when the notebook is re-run on the same data.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [18]:
# Summary statistics of the training split, as a sanity check on its size and
# target distribution.
train_data.describe().show()

In [19]:
# Summary statistics of the test split, for comparison with the training split.
test_data.describe().show()

In [20]:
# Configure the linear regression estimator: inputs come from the assembled
# "features" vector column and the target is the yearly spend column.
lr = LinearRegression(
    featuresCol="features",
    labelCol='Yearly Amount Spent',
)

In [21]:
# Train on the training split only; the fitted model is kept as lrModel.
lrModel = lr.fit(train_data)

In [22]:
# Report the fitted coefficients and the intercept.
print(
    "Coefficients: {} Intercept: {}".format(
        lrModel.coefficients,
        lrModel.intercept,
    )
)

In [23]:
# Evaluate the fitted model on the held-out test split; the resulting summary
# object is used below for the residuals and the error metrics.
test_results = lrModel.evaluate(test_data)

In [24]:
# Show the per-row residuals on the test split. Per the Spark ML summary API
# a residual is the label minus the predicted value, so values near zero mean
# the prediction was close to the actual yearly spend.
test_results.residuals.show()

In [25]:
# Drop the target column to mimic new, unlabeled customers: only the feature
# vectors are kept.
unlabeled_data = test_data.select('features')

In [26]:
# Score the unlabeled feature vectors with the fitted model.
predictions = lrModel.transform(unlabeled_data)

In [27]:
# Display the model output for the unlabeled rows.
predictions.show()

In [28]:
# Report test-set error. RMSE is in the same units as the target column, so it
# can be compared directly with the spread of 'Yearly Amount Spent' shown by
# describe(); MSE is its square.
print("RMSE: {}".format(test_results.rootMeanSquaredError))
print("MSE: {}".format(test_results.meanSquaredError))