In [None]:
import findspark
# Runs before `import pyspark` on purpose — presumably this call is what makes the
# local Spark installation importable; confirm against the findspark docs before reordering.
findspark.init()
import pyspark

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create the notebook's Spark session, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Python Linear Regression example")
    .getOrCreate()
)

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.sql.functions import *

In [None]:
# Read the CSV with a header row and let Spark infer the column types.
data = (
    spark.read
    .format("csv")
    .option("sep", ",")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("linregdata1.csv")
)
data.printSchema()

In [None]:
# describe() only builds a (lazy) summary DataFrame; .show() is what actually
# computes and prints the statistics for each column.
data.describe().show()

In [None]:
# Names of the input columns used as predictors.
features = [
    "temperature",
    "exhaust_vacuum",
    "ambient_pressure",
    "relative_humidity",
]

In [None]:
# Keep only the target (renamed to "label") followed by the predictor columns.
lr_data = data.select(
    col("energy_output").alias("label"),
    *features
)
lr_data.printSchema()

In [None]:
# Preview the first 20 rows (the defaults of show(), spelled out).
lr_data.show(n=20, truncate=True)

In [None]:
# 70/30 train/test split. A fixed seed keeps the split (and therefore every metric
# computed below) reproducible across "Restart Kernel -> Run All" runs.
(training, test) = lr_data.randomSplit([.7, .3], seed=42)

VectorAssembler is a transformer that combines a given list of columns into a single vector column.

In [None]:
# Pack the predictor columns into one vector column for the scaler to consume.
vectorAssembler = VectorAssembler(
    inputCols=features,
    outputCol="unscaled_features",
)

StandardScaler transforms a dataset of Vector rows, normalizing each feature to have unit standard deviation and/or zero mean.
By default `withStd=True` and `withMean=False`, i.e. each feature is scaled to unit standard deviation but is not mean-centered.

In [None]:
# Scale the assembled vector; the output column is named "features".
standardScaler = StandardScaler(
    inputCol="unscaled_features",
    outputCol="features",
)

In [None]:
# Regularized linear regression; the column names are the estimator's defaults,
# written out so the link to the earlier stages is visible.
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.02,
)

In [None]:
# Stages run in this order: assemble -> scale -> regress.
stages = [
    vectorAssembler,  # predictor columns -> "unscaled_features"
    standardScaler,   # "unscaled_features" -> "features"
    lr,               # linear regression estimator
]
pipeline = Pipeline(stages=stages)

In [None]:
# Fit every pipeline stage on the training split only.
model = pipeline.fit(dataset=training)

In [None]:
# Apply the fitted pipeline to the held-out split; adds a "prediction" column.
prediction_df = model.transform(dataset=test)

In [None]:
# Show the first 20 rows without truncating the vector columns.
prediction_df.show(n=20, truncate=False)

In [None]:
# Actual vs. predicted values side by side.
prediction_df.select(["label", "prediction"]).show(n=20, truncate=False)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
eval = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")

In [None]:
rmse = eval.evaluate(prediction_df)
print("RMSE: %.3f" % rmse)

In [None]:
mse = eval.evaluate(prediction_df, {eval.metricName: "mse"})
print("MSE: %.3f" % mse)

In [None]:
mae = eval.evaluate(prediction_df, {eval.metricName: "mae"})
print("MAE: %.3f" % mae)

In [None]:
r2 = eval.evaluate(prediction_df, {eval.metricName: "r2"})
print("r2: %.3f" %r2)