In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [2]:
# Obtain the SparkSession for this notebook (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName('radiation_prediction')
    .getOrCreate()
)


More info about Data: https://earthdata.nasa.gov/learn/articles/sorce-mission-ends 

Data: https://www.kaggle.com/dronio/SolarEnergy

I will predict Radiation from Temperature, Pressure, Humidity, WindDirection(Degrees), and Speed using a linear regression model.



In [3]:
# Load the solar dataset; the first row holds column names and column
# types are inferred from the file contents.
data = spark.read.csv(
    path='SolarPrediction.csv',
    header=True,
    inferSchema=True,
)


In [4]:
# Peek at the label (Radiation) and the five candidate predictors.
preview_columns = [
    "Radiation",
    "Temperature",
    "Pressure",
    "Humidity",
    "WindDirection(Degrees)",
    "Speed",
]
data.select(*preview_columns).show(5)


+---------+-----------+--------+--------+----------------------+-----+
|Radiation|Temperature|Pressure|Humidity|WindDirection(Degrees)|Speed|
+---------+-----------+--------+--------+----------------------+-----+
|     1.21|         48|   30.46|      59|                177.39| 5.62|
|     1.21|         48|   30.46|      58|                176.78| 3.37|
|     1.23|         48|   30.46|      57|                158.75| 3.37|
|     1.21|         48|   30.46|      60|                137.71| 3.37|
|     1.17|         48|   30.46|      62|                104.95| 5.62|
+---------+-----------+--------+--------+----------------------+-----+
only showing top 5 rows



In [5]:
# Summary statistics (count, mean, stddev, min, max) for the label and predictors.
summary_columns = [
    "Radiation",
    "Temperature",
    "Pressure",
    "Humidity",
    "WindDirection(Degrees)",
    "Speed",
]
data.describe(*summary_columns).show(5)
print("as we can see min radiation is 1.11")

+-------+------------------+-----------------+------------------+------------------+----------------------+------------------+
|summary|         Radiation|      Temperature|          Pressure|          Humidity|WindDirection(Degrees)|             Speed|
+-------+------------------+-----------------+------------------+------------------+----------------------+------------------+
|  count|             32686|            32686|             32686|             32686|                 32686|             32686|
|   mean|207.12469742399261|51.10325521630055| 30.42287890839873| 75.01630667564095|    143.48982071835067| 6.243869240652154|
| stddev|315.91638719511076|6.201157357040149|0.0546731550638987|25.990218508443192|     83.16749964761821|3.4904735500653965|
|    min|              1.11|               34|             30.19|                 8|                  0.09|               0.0|
|    max|           1601.26|               71|             30.56|               103|                359.95|    

In [6]:
# Divide the data into training (80%) and testing (20%) sets.
# A fixed seed keeps the split -- and therefore the fitted coefficients and
# the evaluation metrics below -- the same across Restart & Run All.
trainDF, testDF = data.randomSplit([0.8, 0.2], seed=42)

In [7]:
# Pack the five predictor columns into a single vector column named "features"
# (the label, Radiation, stays in its own column), then apply the assembler
# to the training split.
feature_columns = [
    "Temperature",
    "Pressure",
    "Humidity",
    "WindDirection(Degrees)",
    "Speed",
]
vec = VectorAssembler(inputCols=feature_columns, outputCol="features")
vectrain = vec.transform(trainDF)

# Linear Regression

In [8]:
# Linear regression estimator predicting Radiation from the "features" vector.
# Parameter reference: https://spark.apache.org/docs/latest/ml-classification-regression.html#linear-regression
# elasticNetParam=1 selects a pure L1 (lasso) penalty, scaled by regParam.
lr = LinearRegression(
    featuresCol="features",
    labelCol="Radiation",
    maxIter=100,
    regParam=0.3,
    elasticNetParam=1,
)


In [9]:
# Fit the estimator on the assembled training data.
lrmodel = lr.fit(dataset=vectrain)


In [10]:
# Pull out the five fitted coefficients (same order as the assembler's input
# columns) and the intercept, then print the regression equation.
c_1, c_2, c_3, c_4, c_5 = (lrmodel.coefficients[i] for i in range(5))
b = lrmodel.intercept
print(
    "Radiation=",
    c_1, "*Temperature", "+",
    c_2, "*Pressure", "+",
    c_3, "*Humidity", "+",
    c_4, "*WindDirection(Degrees)", "+",
    c_5, "*Speed", "+",
    b,
)

Radiation= 38.32914558204896 *Temperature + -732.0532065309934 *Pressure + -0.2722972083072913 *Humidity + -0.2660627588181636 *WindDirection(Degrees) + 7.907915119819306 *Speed + 20529.084373526865


In [11]:
from pyspark.ml import Pipeline

# Chain the assembler and the regressor, fit on the raw training split,
# and score the held-out test split.
pipeline = Pipeline(stages=[vec, lr])
pipelinemodel = pipeline.fit(trainDF)
sonuc = pipelinemodel.transform(testDF)  # "sonuc": the scored test DataFrame

# Show the inputs, the assembled vector, the true label, and the prediction.
result_columns = [
    "Temperature",
    "Pressure",
    "Humidity",
    "WindDirection(Degrees)",
    "Speed",
    "features",
    "Radiation",
    "prediction",
]
sonuc.select(*result_columns).show(10)

+-----------+--------+--------+----------------------+-----+--------------------+---------+------------------+
|Temperature|Pressure|Humidity|WindDirection(Degrees)|Speed|            features|Radiation|        prediction|
+-----------+--------+--------+----------------------+-----+--------------------+---------+------------------+
|         52|   30.41|     103|                270.48| 5.62|[52.0,30.41,103.0...|      3.9|204.89314869849477|
|         51|   30.41|     103|                170.97| 5.62|[51.0,30.41,103.0...|     3.13| 193.0399082464428|
|         51|   30.41|     103|                131.57|12.37|[51.0,30.41,103.0...|     2.57| 256.9012080026587|
|         51|    30.4|     103|                105.55| 5.62|[51.0,30.4,103.0,...|     2.52| 217.7662659936359|
|         51|    30.4|     103|                117.72|11.25|[51.0,30.4,103.0,...|     2.74| 259.0498443434044|
|         51|    30.4|     103|                119.72|12.37|[51.0,30.4,103.0,...|      2.3| 267.3745837599672|
|

In [12]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root mean square error of the predictions on the test split.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Radiation",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(sonuc)
print("Root Mean Square Error (RMSE):", rmse)

# Coefficient of determination on the same scored test data.
lr_evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="Radiation",
    predictionCol="prediction",
)
r2 = lr_evaluator.evaluate(sonuc)
print("R Squared (R2) on test data = ", r2)
# The result does not look very good.

Root Mean Square Error (RMSE): 206.46128044293442
R Squared (R2) on test data =  0.5681659836598085


In [13]:
from pyspark.sql.functions import corr


In [14]:
# Correlation between Temperature and Radiation over the full dataset.
temperature_radiation_corr = corr('Temperature', 'Radiation')
data.select(temperature_radiation_corr).show()


+----------------------------+
|corr(Temperature, Radiation)|
+----------------------------+
|           0.734954755435727|
+----------------------------+

