In [1]:
# Runs before the pyspark imports in the next cell.
# NOTE(review): findspark.init() is presumably what makes the local Spark
# installation importable in this environment — confirm it is still needed.
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [3]:
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('possible_Asteroid_impacts')
    .getOrCreate()
)

More information about the data: https://cneos.jpl.nasa.gov/sentry/intro.html

In [4]:
# Load the impacts CSV; the first row is the header and column types are inferred.
csv_reader = spark.read.options(header=True, inferSchema=True)
data = csv_reader.csv('impacts.csv')

In [5]:
# Peek at the two feature columns and the label column used by the model below.
(
    data
    .select(["Asteroid Magnitude", "Asteroid Diameter (km)", "Maximum Palermo Scale"])
    .show(n=5)
)

+------------------+----------------------+---------------------+
|Asteroid Magnitude|Asteroid Diameter (km)|Maximum Palermo Scale|
+------------------+----------------------+---------------------+
|              28.3|                 0.007|                -8.31|
|              31.4|                 0.002|                -6.96|
|              31.4|                 0.002|                -6.87|
|              26.7|                 0.016|                -6.95|
|              19.2|                 0.497|                 -4.3|
+------------------+----------------------+---------------------+
only showing top 5 rows



In [6]:
# 80/20 train/test split. A fixed seed keeps the partition stable, so the
# coefficients and metrics reported further down can be reproduced on re-run
# instead of changing every time the cell executes.
trainDF, testDF = data.randomSplit([0.8, 0.2], seed=42)

In [7]:
# Pack the two numeric predictors into the single "features" vector column
# that the regression stage reads.
vektör = VectorAssembler(
    outputCol="features",
    inputCols=[
        "Asteroid Magnitude",
        "Asteroid Diameter (km)",
    ],
)

# Assembled training frame, used for the stand-alone fit below.
vektörtrain = vektör.transform(trainDF)

In [8]:
# Regularised linear regression predicting "Maximum Palermo Scale" from "features".
# NOTE(review): with regParam=0.5 and elasticNetParam=1 the recorded output shows the
# "Asteroid Diameter (km)" coefficient at exactly 0.0 — confirm this much shrinkage is intended.
lr = LinearRegression(
    labelCol="Maximum Palermo Scale",
    featuresCol="features",
    regParam=0.5,
    elasticNetParam=1,
    maxIter=10,
)

In [9]:
# Fit the regression on the assembled training frame (vektörtrain); the fitted
# model's coefficients and intercept are inspected in the following cell.
lrmodel=lr.fit(vektörtrain)

In [11]:
# Show the fitted model as an equation. Coefficients follow the assembler's
# inputCols order: [0] Asteroid Magnitude, [1] Asteroid Diameter (km).
c_1, c_2 = lrmodel.coefficients[0], lrmodel.coefficients[1]
b = lrmodel.intercept

# The left-hand side is the predicted label, not the feature vector, so it is
# labelled with the label column's name rather than "Features".
print("Maximum Palermo Scale =", c_1, "*Asteroid Magnitude", "+", c_2, "*Asteroid Diameter (km)", "+", b)

Features= -0.051472315501410563 *Asteroid Magnitude + 0.0 *Asteroid Diameter (km) + -5.464962828682314


In [12]:
from pyspark.ml import Pipeline

# Chain the assembler and the regressor so the held-out rows go through the
# same feature assembly as the training rows before being scored.
pipeline = Pipeline(stages=[vektör, lr])
pipelinemodel = pipeline.fit(trainDF)
sonuc = pipelinemodel.transform(testDF)

# Side-by-side view of the actual label and the prediction for a few test rows.
sonuc.select(
    "Asteroid Velocity",
    "Asteroid Magnitude",
    "Asteroid Diameter (km)",
    "features",
    "Maximum Palermo Scale",
    "prediction",
).show(10)

+-----------------+------------------+----------------------+------------+---------------------+-------------------+
|Asteroid Velocity|Asteroid Magnitude|Asteroid Diameter (km)|    features|Maximum Palermo Scale|         prediction|
+-----------------+------------------+----------------------+------------+---------------------+-------------------+
|            14.87|              24.2|                 0.048|[24.2,0.048]|                -3.83|  -6.71059286381645|
|             8.09|              27.5|                 0.011|[27.5,0.011]|                -5.81| -6.880451504971104|
|            25.43|              25.5|                 0.028|[25.5,0.028]|                -4.75| -6.777506873968283|
|             6.19|              25.0|                 0.034|[25.0,0.034]|                -6.48| -6.751770716217578|
|             2.11|              29.0|                 0.005|[29.0,0.005]|                -5.86|  -6.95765997822322|
|            30.68|              18.5|                 0.679|[18

In [13]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-square error of the predictions on the scored test frame (sonuc).
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Maximum Palermo Scale",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(sonuc)
print("Root Mean Square Error (RMSE):", rmse)

Root Mean Square Error (RMSE): 1.4146404746524983


In [14]:
# Coefficient of determination (R^2) for the same scored test frame (sonuc).
lr_evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="Maximum Palermo Scale",
    predictionCol="prediction",
)
r2 = lr_evaluator.evaluate(sonuc)
print("R Squared (R2) on test data = ", r2)

R Squared (R2) on test data =  0.059129725171863434
