In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles

In [3]:
# Obtain the notebook's Spark session: reuse an existing one if present,
# otherwise create one registered under the application name 'ML_example'.
spark = (
    SparkSession.builder
    .appName('ML_example')
    .getOrCreate()
)

In [9]:
# Load the raw CSV: the first line supplies the column names and Spark
# infers each column's type from the data instead of reading all strings.
training = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("test_ML.csv")
)

In [10]:
# Print the loaded rows as a text table to sanity-check the CSV parse.
training.show()

+------+----+-----------+-------+
|Nombre|Edad|Experiencia|Salario|
+------+----+-----------+-------+
|  Pepe|  48|         10|  50000|
| Angel|  28|          8|  42000|
| Elisa|  12|          6|  30000|
|  Jose|  30|          5|  23000|
| Mario|  23|          3|  15300|
| Jesus|  12|          1|  12000|
+------+----+-----------+-------+



In [11]:
# Print column names and inferred types; with inferSchema enabled the numeric
# columns should come back as integers rather than strings.
training.printSchema()

root
 |-- Nombre: string (nullable = true)
 |-- Edad: integer (nullable = true)
 |-- Experiencia: integer (nullable = true)
 |-- Salario: integer (nullable = true)



In [15]:
from pyspark.ml.feature import VectorAssembler

# Pack the two numeric predictors (Edad, Experiencia) into a single vector
# column named "feature_vector", which the regressor later reads as features.
feature_assembler = VectorAssembler(
    inputCols=["Edad", "Experiencia"],
    outputCol="feature_vector",
)

In [16]:
# Apply the assembler to the loaded data; the result (bound to a new name,
# `output`) carries the original columns plus the assembled feature_vector.
output = feature_assembler.transform(training)

In [17]:
# Display the assembled frame to verify each feature_vector holds
# [Edad, Experiencia] for its row.
output.show()

+------+----+-----------+-------+--------------+
|Nombre|Edad|Experiencia|Salario|feature_vector|
+------+----+-----------+-------+--------------+
|  Pepe|  48|         10|  50000|   [48.0,10.0]|
| Angel|  28|          8|  42000|    [28.0,8.0]|
| Elisa|  12|          6|  30000|    [12.0,6.0]|
|  Jose|  30|          5|  23000|    [30.0,5.0]|
| Mario|  23|          3|  15300|    [23.0,3.0]|
| Jesus|  12|          1|  12000|    [12.0,1.0]|
+------+----+-----------+-------+--------------+



In [19]:
# List the column names to confirm feature_vector was appended.
output.columns

['Nombre', 'Edad', 'Experiencia', 'Salario', 'feature_vector']

In [20]:
# Keep only the two columns the model uses: the assembled features and the
# Salario column that serves as the regression label.
dataset = output.select("feature_vector","Salario")

In [21]:
# Display the modelling dataset (features + label) before splitting it.
dataset.show()

+--------------+-------+
|feature_vector|Salario|
+--------------+-------+
|   [48.0,10.0]|  50000|
|    [28.0,8.0]|  42000|
|    [12.0,6.0]|  30000|
|    [30.0,5.0]|  23000|
|    [23.0,3.0]|  15300|
|    [12.0,1.0]|  12000|
+--------------+-------+



In [30]:
from pyspark.ml.regression import LinearRegression

# Split the modelling dataset into train/test partitions with weights
# 0.65 / 0.35. The weights are sampling probabilities, not exact proportions,
# so partition sizes can deviate from 65/35 on a dataset this small.
# A fixed seed makes the split (and therefore the fitted model and metrics
# computed downstream) reproducible under Restart Kernel -> Run All; without
# it every run draws a different partition. Outputs recorded from an earlier
# unseeded run may not match this split.
train_data, test_data = dataset.randomSplit([0.65, 0.35], seed=42)

In [31]:
# Display the rows that landed in the training partition.
train_data.show()

+--------------+-------+
|feature_vector|Salario|
+--------------+-------+
|    [12.0,1.0]|  12000|
|    [23.0,3.0]|  15300|
|    [30.0,5.0]|  23000|
+--------------+-------+



In [32]:
# Display the rows held out for evaluation.
test_data.show()

+--------------+-------+
|feature_vector|Salario|
+--------------+-------+
|    [12.0,6.0]|  30000|
|    [28.0,8.0]|  42000|
|   [48.0,10.0]|  50000|
+--------------+-------+



In [38]:
# Configure a linear regression that reads features from "feature_vector" and
# the label from "Salario", then fit it on the training partition.
# `regressor` ends up bound to the fitted model, not the unfitted estimator.
regressor = LinearRegression(
    featuresCol="feature_vector",
    labelCol="Salario",
).fit(train_data)

In [40]:
# Evaluate the fitted model on the held-out partition; the returned summary
# exposes the per-row predictions and the error metrics read in later cells.
pred_results = regressor.evaluate(test_data)

In [41]:
# Show the actual Salario next to the model's prediction for each test row.
pred_results.predictions.show()



+--------------+-------+------------------+
|feature_vector|Salario|        prediction|
+--------------+-------+------------------+
|    [12.0,6.0]|  30000|50499.999999996115|
|    [28.0,8.0]|  42000| 48299.99999999729|
|   [48.0,10.0]|  50000|41699.999999999156|
+--------------+-------+------------------+



In [42]:
# Report the test-set error as a (mean absolute error, mean squared error) pair.
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(11699.999999998085, 176276666.66660684)