In [1]:
# Bootstrap step: must run before the pyspark imports in the next cell so that
# the locally installed Spark distribution can be found and imported.
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [3]:
# Create the Spark session for this notebook, or reuse one that is already
# running; every later cell reads data through this `spark` handle.
spark = (
    SparkSession.builder
    .appName('possible_Asteroid_impacts')
    .getOrCreate()
)

More information about the data set (CNEOS Sentry, NASA/JPL): https://cneos.jpl.nasa.gov/sentry/intro.html

In [4]:
# Load the impacts table from CSV. The first row is treated as the header and
# column types are inferred from the values instead of defaulting to strings.
data = spark.read.csv(
    'impacts.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the first five rows of the columns used for modelling: the three
# predictors and the regression target ("Maximum Palermo Scale").
data.select(
    "Asteroid Velocity",
    "Asteroid Magnitude",
    "Asteroid Diameter (km)",
    "Maximum Palermo Scale",
).show(5)

+-----------------+------------------+----------------------+---------------------+
|Asteroid Velocity|Asteroid Magnitude|Asteroid Diameter (km)|Maximum Palermo Scale|
+-----------------+------------------+----------------------+---------------------+
|            17.77|              28.3|                 0.007|                -8.31|
|             8.98|              31.4|                 0.002|                -6.96|
|            18.33|              31.4|                 0.002|                -6.87|
|             4.99|              26.7|                 0.016|                -6.95|
|            19.46|              19.2|                 0.497|                 -4.3|
+-----------------+------------------+----------------------+---------------------+
only showing top 5 rows



In [6]:
# Split the rows roughly 80/20 into a training and a held-out test set.
# A fixed seed keeps the split (and therefore the fitted coefficients and the
# reported RMSE / R2) the same on every "Restart & Run All"; without it each
# run draws a different random split.
trainDF, testDF = data.randomSplit([0.8, 0.2], seed=42)

In [7]:
# Assembler stage: packs the three predictor columns into the single vector
# column "features" that the regression estimator reads.
vektör = VectorAssembler(
    inputCols=[
        "Asteroid Velocity",
        "Asteroid Magnitude",
        "Asteroid Diameter (km)",
    ],
    outputCol="features",
)

# Training rows with the assembled "features" column, used for the
# stand-alone fit of the regression model.
vektörtrain = vektör.transform(trainDF)

In [8]:
# Linear regression estimator predicting "Maximum Palermo Scale" from the
# assembled "features" vector.
# elasticNetParam=1 selects a pure L1 (lasso) penalty; regParam=0.5 is its
# strength.
# NOTE(review): in the recorded run two of the three coefficients come out as
# exactly 0.0 and the test R2 is about 0.04 -- the penalty may be too strong
# for this data; consider tuning regParam.
lr = LinearRegression(
    featuresCol="features",
    labelCol="Maximum Palermo Scale",
    maxIter=10,
    regParam=0.5,
    elasticNetParam=1,
)

In [9]:
# Fit the estimator on the assembled training rows; the resulting model
# exposes the learned coefficients and intercept.
lrmodel = lr.fit(vektörtrain)

In [10]:
# Print the fitted linear equation.
# Coefficients are in the same order as the assembler's input columns:
# velocity, magnitude, diameter.
c_1 = lrmodel.coefficients[0]
c_2 = lrmodel.coefficients[1]
c_3 = lrmodel.coefficients[2]
b = lrmodel.intercept

# The left-hand side is the predicted label (not the features), and every
# term is joined with an explicit "+" so the output reads as one equation.
print(
    "Maximum Palermo Scale=",
    c_1, "*Asteroid Velocity", "+",
    c_2, "*Asteroid Magnitude", "+",
    c_3, "*Asteroid Diameter (km)", "+",
    b,
)

Features= 0.0 *Asteroid Velocity + -0.02945728734794335 *Asteroid Magnitude 0.0 *Asteroid Diameter (km) + -6.008399232005006


In [11]:
from pyspark.ml import Pipeline

# Chain the assembler and the regression estimator so the raw train / test
# frames can be fed in directly without assembling "features" by hand.
pipeline = Pipeline(stages=[vektör, lr])

# Fit both stages on the training split, then score the held-out test split;
# `sonuc` holds the test rows plus the "features" and "prediction" columns.
pipelinemodel = pipeline.fit(trainDF)
sonuc = pipelinemodel.transform(testDF)

# Show inputs, the actual label and the prediction side by side.
sonuc.select(
    "Asteroid Velocity",
    "Asteroid Magnitude",
    "Asteroid Diameter (km)",
    "features",
    "Maximum Palermo Scale",
    "prediction",
).show(10)

+-----------------+------------------+----------------------+------------------+---------------------+-------------------+
|Asteroid Velocity|Asteroid Magnitude|Asteroid Diameter (km)|          features|Maximum Palermo Scale|         prediction|
+-----------------+------------------+----------------------+------------------+---------------------+-------------------+
|             4.17|              27.1|                 0.013| [4.17,27.1,0.013]|                -7.75| -6.806691719134271|
|              4.9|              25.8|                 0.023|  [4.9,25.8,0.023]|                -5.38| -6.768397245581944|
|            30.68|              18.5|                 0.679|[30.68,18.5,0.679]|                -4.25| -6.553359047941958|
|             9.18|              24.9|                 0.036| [9.18,24.9,0.036]|                -7.59|-6.7418856869687955|
|            20.38|              21.4|                 0.179|[20.38,21.4,0.179]|                -4.92| -6.638785181250994|
|             8.

In [12]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-square error of the predictions on the held-out test rows,
# compared against the actual "Maximum Palermo Scale" values.
evaluator = RegressionEvaluator(
    labelCol="Maximum Palermo Scale",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(sonuc)
print("Root Mean Square Error (RMSE):", rmse)

Root Mean Square Error (RMSE): 1.409087028366552


In [13]:
# Coefficient of determination (R2) on the same scored test rows, using a
# second evaluator configured with the "r2" metric.
lr_evaluator = RegressionEvaluator(
    labelCol="Maximum Palermo Scale",
    predictionCol="prediction",
    metricName="r2",
)
r2 = lr_evaluator.evaluate(sonuc)
print("R Squared (R2) on test data = ", r2)

R Squared (R2) on test data =  0.0447396067322936
