In [1]:
# findspark.init() is run first, in its own cell, so that the pyspark
# imports in the following cell resolve.
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [3]:
# Start (or reuse) the Spark session that every later cell reads through.
spark = (
    SparkSession.builder
    .appName('rating_potential')
    .getOrCreate()
)

In [4]:
# Load the CSV using its first row as column names and letting Spark infer
# the column types, then preview the three columns the model works with.
data = spark.read.csv(
    'data.csv',
    header=True,
    inferSchema=True,
)
data.select("Age", "Overall", "Potential").show(5)

+---+-------+---------+
|Age|Overall|Potential|
+---+-------+---------+
| 31|     94|       94|
| 33|     94|       94|
| 26|     92|       93|
| 27|     91|       93|
| 27|     91|       92|
+---+-------+---------+
only showing top 5 rows



In [5]:
# 80/20 train/test split. The fixed seed keeps the split -- and therefore the
# fitted coefficients and the evaluation metrics computed from it -- stable
# across Restart & Run All; without a seed every run samples different rows.
trainDF, testDF = data.randomSplit([0.8, 0.2], seed=42)

In [6]:
# Pack the two numeric predictors into the single "features" vector column
# that the regression estimator reads.
vektör = VectorAssembler(
    outputCol="features",
    inputCols=["Age", "Overall"],
)

# Assembled training frame, used for the standalone (non-pipeline) fit.
vektörtrain = vektör.transform(trainDF)

In [7]:
# Linear model predicting Potential from the assembled feature vector.
# elasticNetParam=1 selects a pure L1 (lasso) penalty and regParam=0.5 is
# the strength of that penalty; the optimizer is capped at 10 iterations.
lr = LinearRegression(
    featuresCol="features",
    labelCol="Potential",
    maxIter=10,
    regParam=0.5,
    elasticNetParam=1,
)

In [8]:
# Fit the regression on the assembled training rows; the resulting model
# exposes the learned coefficients and intercept read in the next cell.
lrmodel=lr.fit(vektörtrain)

In [9]:
# Pull the fitted parameters out of the model. The coefficients follow the
# order of the assembler's inputCols, i.e. [Age, Overall].
c_1 = lrmodel.coefficients[0]
c_2 = lrmodel.coefficients[1]
b = lrmodel.intercept

# The right-hand side is the model's estimate of the label column, so the
# equation is labelled with the target ("Potential"), not the feature vector.
print("Potential=", c_1, "*Age", "+", c_2, "*Overall", "+", b)

Features= -0.712136075614766 *Age + 0.7328735874848807 *Overall + 40.639620188224946


In [10]:
from pyspark.ml import Pipeline

# Chain feature assembly and regression so the same steps are applied to the
# training and the test rows. Fitting the pipeline trains the estimator on
# trainDF again, independently of the standalone model fitted earlier.
pipeline = Pipeline(stages=[vektör, lr])
pipelinemodel = pipeline.fit(trainDF)

# Score the held-out rows and show the actual Potential next to the prediction.
sonuc = pipelinemodel.transform(testDF)
sonuc.select(
    "Age", "Overall", "features", "Potential", "prediction"
).show(10)

+---+-------+-----------+---------+-----------------+
|Age|Overall|   features|Potential|       prediction|
+---+-------+-----------+---------+-----------------+
| 33|     94|[33.0,94.0]|       94|86.02924691651646|
| 27|     91|[27.0,91.0]|       91|88.10344260775041|
| 32|     91|[32.0,91.0]|       91|84.54276222967658|
| 27|     89|[27.0,89.0]|       90|86.63769543278065|
| 24|     89|[24.0,89.0]|       94|88.77410365962496|
| 33|     89|[33.0,89.0]|       89|82.36487897909205|
| 19|     88|[19.0,88.0]|       95| 91.6019104502139|
| 26|     88|[26.0,88.0]|       90|86.61695792091054|
| 26|     88|[26.0,88.0]|       91|86.61695792091054|
| 27|     87|[27.0,87.0]|       90| 85.1719482578109|
+---+-------+-----------+---------+-----------------+
only showing top 10 rows



In [11]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-square error of the test-set predictions against the true Potential.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Potential",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(sonuc)
print("Root Mean Square Error (RMSE):", rmse)

Root Mean Square Error (RMSE): 2.793463044656607


In [12]:
# Coefficient of determination (R2) of the same test-set predictions.
lr_evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="Potential",
    predictionCol="prediction",
)
r2 = lr_evaluator.evaluate(sonuc)
print("R Squared (R2) on test data = ", r2)

R Squared (R2) on test data =  0.7995829036765745
