In [1]:
import findspark
# Runs before any pyspark import in this notebook; presumably this locates the
# local Spark installation so the pyspark imports in the next cell resolve —
# TODO confirm against the findspark documentation for this environment.
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [3]:
# Create (or reuse) the Spark session for this notebook under the app name 'solar_radiation'.
spark = (
    SparkSession.builder
    .appName('solar_radiation')
    .getOrCreate()
)

More information about the data: https://earthdata.nasa.gov/learn/articles/sorce-mission-ends
Dataset: https://www.kaggle.com/dronio/SolarEnergy

In [4]:
# Load the CSV, using its first row as column names and letting Spark infer column types.
data = spark.read.csv(
    'SolarPrediction.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the first five rows of Radiation and the five weather columns used below.
preview_columns = [
    "Radiation",
    "Temperature",
    "Pressure",
    "Humidity",
    "WindDirection(Degrees)",
    "Speed",
]
data.select(*preview_columns).show(5)

+---------+-----------+--------+--------+----------------------+-----+
|Radiation|Temperature|Pressure|Humidity|WindDirection(Degrees)|Speed|
+---------+-----------+--------+--------+----------------------+-----+
|     1.21|         48|   30.46|      59|                177.39| 5.62|
|     1.21|         48|   30.46|      58|                176.78| 3.37|
|     1.23|         48|   30.46|      57|                158.75| 3.37|
|     1.21|         48|   30.46|      60|                137.71| 3.37|
|     1.17|         48|   30.46|      62|                104.95| 5.62|
+---------+-----------+--------+--------+----------------------+-----+
only showing top 5 rows



In [6]:
# Hold out roughly 20% of the rows for testing (the weights are sampling
# probabilities, not exact fractions). A fixed seed is passed so that
# Restart Kernel -> Run All reproduces the same train/test split, and with it
# comparable coefficients and metrics, instead of a new random partition on
# every run.
trainDF, testDF = data.randomSplit([0.8, 0.2], seed=42)

In [7]:
# Pack the five predictor columns into a single vector column named "features",
# then apply the assembler to the training split.
vektör = VectorAssembler(
    inputCols=[
        "Temperature",
        "Pressure",
        "Humidity",
        "WindDirection(Degrees)",
        "Speed",
    ],
    outputCol="features",
)
vektörtrain = vektör.transform(trainDF)

In [8]:
# Linear regression estimator: reads the "features" vector, predicts "Radiation".
# NOTE(review): elasticNetParam=1 together with regParam=0.5 should correspond to a
# pure L1 (lasso) penalty per the Spark ML documentation — confirm this is intended.
lr = LinearRegression(
    featuresCol="features",
    labelCol="Radiation",
    maxIter=1000,
    regParam=0.5,
    elasticNetParam=1,
)

In [9]:
# Fit the estimator on the assembled training rows.
lrmodel = lr.fit(vektörtrain)

In [10]:
# Pull the five fitted weights out by position and the intercept, then print the
# regression equation on a single line. The labels below follow the order of the
# assembler's inputCols.
c_1, c_2, c_3, c_4, c_5 = (lrmodel.coefficients[i] for i in range(5))
b = lrmodel.intercept

equation_parts = [
    "Radiation=",
    c_1, "*Temperature", "+",
    c_2, "*Pressure", "+",
    c_3, "*Humidity", "+",
    c_4, "*WindDirection(Degrees)", "+",
    c_5, "*Speed", "+",
    b,
]
# print() joins the parts with single spaces, exactly like passing them one by one.
print(*equation_parts)

Radiation= 38.19829874683544 *Temperature + -724.3487913572842 *Pressure + -0.26064215626232173 *Humidity + -0.264572540737894 *WindDirection(Degrees) + 7.893527870556228 *Speed + 20300.135639332224


In [11]:
from pyspark.ml import Pipeline

# Chain the assembler and the regressor so the same feature step is applied to
# both the training and the test split.
pipeline = Pipeline(stages=[vektör, lr])
pipelinemodel = pipeline.fit(trainDF)

# Score the held-out rows and show the inputs next to actual vs. predicted radiation.
sonuc = pipelinemodel.transform(testDF)
shown_columns = [
    "Temperature",
    "Pressure",
    "Humidity",
    "WindDirection(Degrees)",
    "Speed",
    "features",
    "Radiation",
    "prediction",
]
sonuc.select(*shown_columns).show(10)

+-----------+--------+--------+----------------------+-----+--------------------+---------+------------------+
|Temperature|Pressure|Humidity|WindDirection(Degrees)|Speed|            features|Radiation|        prediction|
+-----------+--------+--------+----------------------+-----+--------------------+---------+------------------+
|         51|   30.43|     103|                 77.27|11.25|[51.0,30.43,103.0...|     2.58|247.82768064459742|
|         51|   30.43|     103|                 67.85|  4.5|[51.0,30.43,103.0...|     2.15|197.03864085209352|
|         51|    30.4|     103|                 87.85|  4.5|[51.0,30.4,103.0,...|     1.94|213.47765377805626|
|         51|    30.4|     103|                 78.56|  9.0|[51.0,30.4,103.0,...|     2.27|251.45640809901306|
|         49|    30.4|     102|                145.36| 7.87|[49.0,30.4,102.0,...|     2.03|148.72732054658263|
|         49|   30.41|     102|                133.75|  9.0|[49.0,30.41,102.0...|     2.34|153.47520632470332|
|

In [12]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-square error of the "prediction" column against the actual
# "Radiation" values in the scored test rows (`sonuc`).
evaluator = RegressionEvaluator(
    labelCol="Radiation",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(sonuc)
print("Root Mean Square Error (RMSE):", rmse)

Root Mean Square Error (RMSE): 210.5354552193049


In [13]:
# R-squared of the "prediction" column against the actual "Radiation" values
# in the scored test rows (`sonuc`).
lr_evaluator = RegressionEvaluator(
    labelCol="Radiation",
    predictionCol="prediction",
    metricName="r2",
)
r2 = lr_evaluator.evaluate(sonuc)
print("R Squared (R2) on test data = ", r2)

R Squared (R2) on test data =  0.5648505042418364
