In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

We will look at FIFA 19 data about football players' potential.
We will predict a player's potential rating from the player's age and overall rating.


In [2]:
# Start (or reuse) the Spark session that every DataFrame in this notebook is built on.
spark = (
    SparkSession.builder
    .appName('rating_potential')
    .getOrCreate()
)

In [3]:
# Read the player table; the first CSV row supplies the column names and
# column types are inferred from the data.
data = spark.read.csv(
    'data.csv',
    header=True,
    inferSchema=True,
)

# Preview only the columns this notebook works with.
data.select("Name", "Age", "Overall", "Potential").show(5)

+-----------------+---+-------+---------+
|             Name|Age|Overall|Potential|
+-----------------+---+-------+---------+
|         L. Messi| 31|     94|       94|
|Cristiano Ronaldo| 33|     94|       94|
|        Neymar Jr| 26|     92|       93|
|           De Gea| 27|     91|       93|
|     K. De Bruyne| 27|     91|       92|
+-----------------+---+-------+---------+
only showing top 5 rows



In [4]:
# Summary statistics (count, mean, stddev, min, max) for the two predictors and the label.
data.describe(
    "Age",
    "Overall",
    "Potential",
).show(5)

+-------+------------------+-----------------+------------------+
|summary|               Age|          Overall|         Potential|
+-------+------------------+-----------------+------------------+
|  count|             18207|            18207|             18207|
|   mean|25.122205745043114|66.23869940132916| 71.30729939034437|
| stddev| 4.669942713414315|6.908929611883186|6.1364955834991575|
|    min|                16|               46|                48|
|    max|                45|               94|                95|
+-------+------------------+-----------------+------------------+



In [5]:
# Divide the data into training (~80%) and testing (~20%) subsets.
# A fixed seed is passed so that re-running the notebook draws the same split;
# without it every run trains and evaluates on different rows, and the
# coefficients and metrics reported below cannot be reproduced.
trainDF, testDF = data.randomSplit([0.8, 0.2], seed=42)

In [6]:
# Combine the two predictor columns (Age, Overall) into a single vector column
# named "features"; the label column (Potential) is left as it is.
vec = VectorAssembler(inputCols=["Age", "Overall"], outputCol="features")

# Training rows with the assembled "features" column appended.
vectrain = vec.transform(trainDF)

In [7]:
# Linear regression estimator configured as a Lasso model (elasticNetParam=1
# with a non-zero regParam).
# Parameter reference: https://spark.apache.org/docs/1.5.2/ml-linear-methods.html
lr = LinearRegression(
    featuresCol="features",
    labelCol="Potential",
    maxIter=10,
    regParam=0.5,
    elasticNetParam=1,
)

In [8]:
# Fit the regression on the assembled training rows.
lrmodel = lr.fit(vectrain)

In [9]:
# Show the fitted linear equation. Coefficient order follows the assembler's
# inputCols, so index 0 belongs to Age and index 1 to Overall.
# In the recorded run the Age coefficient is negative.
# NOTE(review): the printed label reads "Features=", but the expression that
# follows is the model's formula for the predicted Potential.
c_1, c_2 = lrmodel.coefficients[0], lrmodel.coefficients[1]
b = lrmodel.intercept
print(
    "Features=",
    c_1, "*Age",
    "+",
    c_2, "*Overall",
    "+",
    b,
)

Features= -0.7192412886566479 *Age + 0.7325602019075078 *Overall + 40.88598316664343


In [10]:
from pyspark.ml import Pipeline

# Chain feature assembly and the regression so both are applied in one step.
pipeline = Pipeline(stages=[vec, lr])

# Fit both stages on the raw training rows, then score the held-out test rows.
pipelinemodel = pipeline.fit(trainDF)
sonuc = pipelinemodel.transform(testDF)

# Compare the true Potential with the model's prediction for a few test players.
sonuc.select(
    "Name",
    "Age",
    "Overall",
    "features",
    "Potential",
    "prediction",
).show(10)

+-------------+---+-------+-----------+---------+-----------------+
|         Name|Age|Overall|   features|Potential|       prediction|
+-------------+---+-------+-----------+---------+-----------------+
| Sergio Ramos| 32|     91|[32.0,91.0]|       91|84.53324030321392|
|     D. Godín| 32|     90|[32.0,90.0]|       90| 83.8006801013064|
|  David Silva| 32|     90|[32.0,90.0]|       90| 83.8006801013064|
|      H. Kane| 24|     89|[24.0,89.0]|       91|88.82205020865207|
|    K. Mbappé| 19|     88|[19.0,88.0]|       95| 91.6856964500278|
|   L. Insigne| 27|     88|[27.0,88.0]|       88|85.93176614077463|
|   M. Hummels| 29|     88|[29.0,88.0]|       88|84.49328356346133|
|S. Handanovič| 33|     88|[33.0,88.0]|       88|81.61631840883473|
|    G. Buffon| 40|     88|[40.0,88.0]|       88| 76.5816293882382|
| K. Koulibaly| 27|     87|[27.0,87.0]|       90|85.19920593886711|
+-------------+---+-------+-----------+---------+-----------------+
only showing top 10 rows



In [11]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root mean squared error between the true Potential and the prediction on the test rows.
evaluator = RegressionEvaluator(
    labelCol="Potential",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(sonuc)
print("Root Mean Square Error (RMSE):", rmse)

Root Mean Square Error (RMSE): 2.7168777514246507


In [12]:
# R squared of the test-set predictions against the true Potential.
# In the recorded run the value is roughly 0.80.
lr_evaluator = RegressionEvaluator(
    labelCol="Potential",
    predictionCol="prediction",
    metricName="r2",
)
r2 = lr_evaluator.evaluate(sonuc)
print("R Squared (R2) on test data = ", r2)

R Squared (R2) on test data =  0.7983457417423532


In [13]:
from pyspark.sql.functions import corr


In [14]:
# Correlation between Potential and Overall across the full data set.
data.select(
    corr('potential', 'overall')
).show()


+------------------------+
|corr(potential, overall)|
+------------------------+
|      0.6609385409937859|
+------------------------+



In [15]:
# Correlation between Potential and Age across the full data set.
data.select(
    corr('potential', 'age')
).show()


+--------------------+
|corr(potential, age)|
+--------------------+
| -0.2533121074109523|
+--------------------+

