In [1]:
import findspark
findspark.init()
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("Python Linear Regression example").getOrCreate()

In [4]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.sql.functions import *

In [5]:
data = spark.read.load("linregdata1.csv", format="csv", sep=",", inferSchema="true", header="true")
data.printSchema()

root
 |-- temperature: double (nullable = true)
 |-- exhaust_vacuum: double (nullable = true)
 |-- ambient_pressure: double (nullable = true)
 |-- relative_humidity: double (nullable = true)
 |-- energy_output: double (nullable = true)



In [6]:
data.describe()

DataFrame[summary: string, temperature: string, exhaust_vacuum: string, ambient_pressure: string, relative_humidity: string, energy_output: string]

In [7]:
features = ["temperature", "exhaust_vacuum", "ambient_pressure", "relative_humidity"]

In [8]:
lr_data = data.select(col("energy_output").alias("label"), *features)
lr_data.printSchema()

root
 |-- label: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- exhaust_vacuum: double (nullable = true)
 |-- ambient_pressure: double (nullable = true)
 |-- relative_humidity: double (nullable = true)



In [9]:
lr_data.show()

+------+-----------+--------------+----------------+-----------------+
| label|temperature|exhaust_vacuum|ambient_pressure|relative_humidity|
+------+-----------+--------------+----------------+-----------------+
|480.48|       8.34|         40.77|         1010.84|            90.01|
|445.75|      23.64|         58.49|          1011.4|             74.2|
|438.76|      29.74|          56.9|         1007.15|            41.91|
|453.09|      19.07|         49.69|         1007.22|            76.79|
|464.43|       11.8|         40.66|         1017.13|             97.2|
|470.96|      13.97|         39.16|         1016.05|             84.6|
|442.35|       22.1|         71.29|          1008.2|            75.38|
| 464.0|      14.47|         41.76|         1021.98|            78.41|
|428.77|      31.25|         69.51|         1010.25|            36.83|
|484.31|       6.77|         38.18|          1017.8|            81.13|
|435.29|      28.28|         68.67|         1006.36|             69.9|
|451.4

In [10]:
(training, test) = lr_data.randomSplit([.7, .3])

VectorAssembler is a transformer that combines a given list of columns into a single vector column.

In [11]:
vectorAssembler = VectorAssembler(inputCols=features, outputCol="unscaled_features")

StandardScaler transforms a dataset of Vector rows, normalizing each feature to have unit standard deviation or zero mean.
Uses 'withStd' by default i.e. scales the data to unit standard deviation.

In [12]:
standardScaler = StandardScaler(inputCol="unscaled_features", outputCol="features")

In [13]:
lr = LinearRegression(maxIter=10, regParam=.02)

In [14]:
stages = [vectorAssembler, standardScaler, lr]
pipeline = Pipeline(stages=stages)

In [15]:
model = pipeline.fit(training)

In [16]:
prediction_df = model.transform(test)

In [17]:
prediction_df.show(truncate=False)

+------+-----------+--------------+----------------+-----------------+---------------------------+-----------------------------------------------------------------------------+------------------+
|label |temperature|exhaust_vacuum|ambient_pressure|relative_humidity|unscaled_features          |features                                                                     |prediction        |
+------+-----------+--------------+----------------+-----------------+---------------------------+-----------------------------------------------------------------------------+------------------+
|420.26|24.27      |63.87         |1018.88         |53.96            |[24.27,63.87,1018.88,53.96]|[3.246374445067361,4.987386051747315,172.45562976078398,3.696612465594044]   |446.5038791109836 |
|421.57|23.0       |66.05         |1020.61         |80.29            |[23.0,66.05,1020.61,80.29] |[3.0764982380119203,5.1576146660076745,172.74844956241532,5.5003894526046295]|444.43181614247106|
|425.17|32.66      |

In [18]:
prediction_df.select("label","prediction").show(truncate=False)

+------+------------------+
|label |prediction        |
+------+------------------+
|420.26|446.5038791109836 |
|421.57|444.43181614247106|
|425.17|429.4921076419873 |
|425.18|429.26610560028695|
|425.3 |429.17969254269883|
|425.5 |428.9646132277076 |
|425.61|427.57765737291896|
|425.66|430.89325073138076|
|425.68|428.89428648875537|
|425.91|430.2764705565717 |
|426.13|433.64392232138164|
|426.2 |432.99472383903185|
|426.25|431.4454751087658 |
|426.31|430.0567557724375 |
|426.35|432.5214213802506 |
|426.5 |429.99500150786275|
|426.52|431.5839763658618 |
|426.53|430.83381739647854|
|426.64|432.71105699600463|
|426.76|428.40737120675226|
+------+------------------+
only showing top 20 rows



In [19]:
from pyspark.ml.evaluation import RegressionEvaluator
eval = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")

In [20]:
rmse = eval.evaluate(prediction_df)
print("RMSE: %.3f" % rmse)

RMSE: 4.519


In [21]:
mse = eval.evaluate(prediction_df, {eval.metricName: "mse"})
print("MSE: %.3f" % mse)

MSE: 20.424


In [22]:
mae = eval.evaluate(prediction_df, {eval.metricName: "mae"})
print("MAE: %.3f" % mae)

MAE: 3.623


In [23]:
r2 = eval.evaluate(prediction_df, {eval.metricName: "r2"})
print("r2: %.3f" %r2)

r2: 0.930
