In [None]:
# -*- coding: utf-8 -*-
# Indentation: Jupyter Notebook

'''
Regression using SparkML

Fits linear, decision-tree and gradient-boosted-tree regressors on the
CCPP power-plant CSV with Spark ML and reports the RMSE of each model.
'''

# Notebook metadata.
# NOTE(review): __version__ is a float here; version identifiers are
# conventionally strings (e.g. "1.0") so that values such as "1.10" survive.
__version__ = 1.0
__author__ = "Sourav Raj"
__author_email__ = "souravraj.iitbbs@gmail.com"


In [1]:
import findspark
# findspark.init() has to run before the pyspark imports below so that the
# pyspark package is importable; do not reorder these lines.
findspark.init()
from pyspark import SparkContext
from pyspark.sql.session import SparkSession

# getOrCreate() reuses a context/session that is already running in this
# kernel. Calling SparkContext() directly raises
# "ValueError: Cannot run multiple SparkContexts at once" when this cell is
# executed a second time, which breaks re-running the notebook.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [2]:
from pyspark.ml.regression import LinearRegression

In [5]:
# Load the power-plant CSV: the first row supplies the column names and the
# column types are inferred from the data.
pp_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../data/CCPP/power_plant.csv')
)

In [6]:
# Peek at the first row to check the parsed columns and their types.
pp_df.take(num=1)

[Row(AT=14.96, V=41.76, AP=1024.07, RH=73.17, PE=463.26)]

In [7]:
from pyspark.ml.feature import VectorAssembler

In [8]:
# Spark ML estimators read their inputs from a single vector column, so the
# four predictor columns are packed into one column named 'features'.
# 'PE' is left out because it is used as the label.
vect_assem = VectorAssembler(
    inputCols=['AT', 'V', 'AP', 'RH'],
    outputCol='features',
)

In [9]:
# Append the assembled 'features' column; the original columns are retained.
vpp_df = vect_assem.transform(pp_df)

In [10]:
# Confirm that the first row now carries the 'features' vector.
vpp_df.take(num=1)

[Row(AT=14.96, V=41.76, AP=1024.07, RH=73.17, PE=463.26, features=DenseVector([14.96, 41.76, 1024.07, 73.17]))]

In [11]:
# Ordinary linear regression predicting 'PE' from the assembled feature vector.
lr = LinearRegression(
    featuresCol='features',
    labelCol='PE',
)
# NOTE(review): the model is fitted on every row of vpp_df; no rows are held
# out at this point, so any metric read from it is a training-set metric.
lr_model = lr.fit(vpp_df)

In [12]:
# Fitted weights, one per assembled feature (inputCols order: AT, V, AP, RH).
lr_model.coefficients

DenseVector([-1.9775, -0.2339, 0.0621, -0.1581])

In [13]:
# Fitted intercept term of the linear model.
lr_model.intercept

454.6092744523414

In [15]:
# RMSE taken from the model's training summary, i.e. measured on the same rows
# that were passed to fit() (all of vpp_df), not on held-out data.
lr_model.summary.rootMeanSquaredError

4.557126016749488

In [16]:
# Persist the fitted model to disk.
# write().overwrite() replaces a model already stored at this path; a plain
# save() raises an error when the path exists, so the notebook could not be
# re-run end to end after its first execution.
lr_model.write().overwrite().save('../data/lr1.model')

# Decision Tree

In [17]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

In [22]:
# 70/30 random split of the assembled data; the fixed seed keeps the split
# reproducible across runs.
splits = vpp_df.randomSplit([0.7, 0.3], seed=1)
train_df, test_df = splits

In [23]:
# Number of rows in the training split (roughly 70% of the data).
train_df.count()

6707

In [25]:
# Number of rows in the held-out test split (roughly 30% of the data).
test_df.count()

2861

In [26]:
# Total row count; sanity check that it equals the train + test counts.
vpp_df.count()

9568

In [27]:
# Decision-tree regressor with library-default hyperparameters, predicting
# 'PE' from the assembled feature vector.
dt = DecisionTreeRegressor(
    featuresCol='features',
    labelCol='PE',
)

In [31]:
# Train on the training split only ...
dt_model = dt.fit(train_df)
# ... then score the held-out rows; the predictions are read from the
# 'prediction' column by the evaluator.
dt_pred = dt_model.transform(test_df)

In [32]:
# Evaluator that compares the 'prediction' column against the 'PE' label
# using root-mean-squared error.
dt_evaluator = RegressionEvaluator(
    labelCol='PE',
    predictionCol='prediction',
    metricName='rmse',
)

In [33]:
# RMSE of the decision tree on the held-out test predictions.
rmse = dt_evaluator.evaluate(dt_pred)

In [34]:
# Display the decision tree's test-set RMSE.
rmse

4.400189094399873

# Gradient Boosting

In [35]:
from pyspark.ml.regression import GBTRegressor

In [36]:
# Gradient-boosted-tree regressor with library-default hyperparameters,
# predicting 'PE' from the assembled feature vector.
gbt = GBTRegressor(
    featuresCol='features',
    labelCol='PE',
)

In [37]:
# Train on the training split only ...
gbt_model = gbt.fit(train_df)
# ... then score the held-out rows for evaluation.
gbt_pred = gbt_model.transform(test_df)

In [38]:
# Evaluator that compares the 'prediction' column against the 'PE' label
# using root-mean-squared error.
gbt_evaluator = RegressionEvaluator(
    labelCol='PE',
    predictionCol='prediction',
    metricName='rmse',
)

In [39]:
# RMSE of the gradient-boosted model on the held-out test predictions;
# the bare name on the last line displays the value as the cell output.
gbt_rmse = gbt_evaluator.evaluate(gbt_pred)
gbt_rmse

3.9760971901288906