# PySpark Tutorial Part VI: MLlib

In [11]:
from pyspark.sql import SparkSession

In [14]:
# Start a Spark session for this notebook under the application name 'ML'
# (getOrCreate hands back an existing session if one is already active).
spark = (
    SparkSession.builder
    .appName('ML')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/19 16:21:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/19 16:21:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [15]:
# Load the wine measurements. The first row of the file supplies the column
# names and Spark infers each column's type from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
training = csv_reader.csv('wines.csv')

In [17]:
# Print the loaded rows as a text table to sanity-check the import.
# The unnamed first CSV column shows up as `_c0`.
training.show()

+----+-------------+----------------+-----------+--------------+
| _c0|fixed acidity|volatile acidity|citric acid|residual sugar|
+----+-------------+----------------+-----------+--------------+
|1665|          6.4|            0.26|       0.24|           6.4|
|1749|          7.9|            0.22|       0.38|           8.0|
|1774|          6.9|            0.23|        0.4|           7.5|
|1791|          6.8|            0.28|       0.36|           8.0|
|1802|          6.8|            0.26|       0.34|          13.9|
|1910|          5.0|            0.55|       0.14|           8.3|
|1917|          5.9|             0.3|       0.47|          7.85|
|1950|          7.3|            0.33|        0.4|          6.85|
|1952|          7.3|            0.33|        0.4|          6.85|
+----+-------------+----------------+-----------+--------------+



23/04/19 16:21:45 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , fixed acidity, volatile acidity, citric acid, residual sugar
 Schema: _c0, fixed acidity, volatile acidity, citric acid, residual sugar
Expected: _c0 but found: 
CSV file: file:///Users/robert/Desktop/DataProjects/PySpark%20Tutorial/wines.csv


In [18]:
# Confirm the inferred column types: the feature columns must be numeric
# before they can be assembled into a feature vector.
training.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)



In [37]:
# Combine ['fixed acidity', 'residual sugar'] into a single vector column holding the independent features.

In [38]:
from pyspark.ml.feature import VectorAssembler

In [39]:
# Transformer that packs the two predictor columns into one vector column
# named "independentfeatures", which the regression below uses as its input.
featureassembler = VectorAssembler(
    inputCols=['fixed acidity', 'residual sugar'],
    outputCol="independentfeatures",
)

In [40]:
# Apply the assembler: `output` is `training` plus the new
# 'independentfeatures' vector column.
output = featureassembler.transform(training)

In [41]:
# Inspect the assembled frame; each row's vector should hold that row's
# 'fixed acidity' and 'residual sugar' values.
output.show()

+----+-------------+----------------+-----------+--------------+-------------------+
| _c0|fixed acidity|volatile acidity|citric acid|residual sugar|independentfeatures|
+----+-------------+----------------+-----------+--------------+-------------------+
|1665|          6.4|            0.26|       0.24|           6.4|          [6.4,6.4]|
|1749|          7.9|            0.22|       0.38|           8.0|          [7.9,8.0]|
|1774|          6.9|            0.23|        0.4|           7.5|          [6.9,7.5]|
|1791|          6.8|            0.28|       0.36|           8.0|          [6.8,8.0]|
|1802|          6.8|            0.26|       0.34|          13.9|         [6.8,13.9]|
|1910|          5.0|            0.55|       0.14|           8.3|          [5.0,8.3]|
|1917|          5.9|             0.3|       0.47|          7.85|         [5.9,7.85]|
|1950|          7.3|            0.33|        0.4|          6.85|         [7.3,6.85]|
|1952|          7.3|            0.33|        0.4|          6.85|         [7.3,6.85]|
+----+-------------+----------------+-----------+--------------+-------------------+

23/04/19 16:33:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , fixed acidity, volatile acidity, citric acid, residual sugar
 Schema: _c0, fixed acidity, volatile acidity, citric acid, residual sugar
Expected: _c0 but found: 
CSV file: file:///Users/robert/Desktop/DataProjects/PySpark%20Tutorial/wines.csv


In [42]:
# List the column names available after assembly.
output.columns

['_c0',
 'fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'independentfeatures']

In [43]:
# Keep only what the model needs: the feature vector and the regression
# target ('citric acid').
model_columns = ['independentfeatures', 'citric acid']
finaldata = output.select(model_columns)

In [44]:
# Preview the modelling frame: feature vector alongside the label column.
finaldata.show()

+-------------------+-----------+
|independentfeatures|citric acid|
+-------------------+-----------+
|          [6.4,6.4]|       0.24|
|          [7.9,8.0]|       0.38|
|          [6.9,7.5]|        0.4|
|          [6.8,8.0]|       0.36|
|         [6.8,13.9]|       0.34|
|          [5.0,8.3]|       0.14|
|         [5.9,7.85]|       0.47|
|         [7.3,6.85]|        0.4|
|         [7.3,6.85]|        0.4|
+-------------------+-----------+



In [46]:
from pyspark.ml.regression import LinearRegression

# Train/test split (roughly 75% / 25%). The fixed seed makes the split, and
# therefore the coefficients and error metrics computed below, repeatable
# across kernel restarts; without it every run samples different rows.
# NOTE: the split is random per row, so with a very small dataset one side
# can come out empty -- choose a different seed if that happens.
train_data, test_data = finaldata.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so that `regressor` only
# ever refers to the fitted model (the later cells read `.coefficients`,
# `.intercept` and call `.evaluate` on it).
lr_estimator = LinearRegression(featuresCol='independentfeatures', labelCol='citric acid')
regressor = lr_estimator.fit(train_data)

23/04/19 16:36:38 WARN Instrumentation: [2ac35279] regParam is zero, which might cause numerical instability and overfitting.
23/04/19 16:36:38 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/04/19 16:36:38 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [47]:
# Fitted coefficients of the linear model, one per entry of the
# 'independentfeatures' vector (assembled from 'fixed acidity' and
# 'residual sugar').
regressor.coefficients

DenseVector([0.0669, -0.0004])

In [48]:
# Fitted intercept (a single scalar) of the linear model.
regressor.intercept

-0.10210146923059968

In [50]:
# Evaluate the fitted model on the held-out test split. The returned summary
# object exposes the per-row predictions and the error metrics read below.
pred_results = regressor.evaluate(test_data)

In [51]:
# Show the test rows with the model's 'prediction' column next to the
# actual 'citric acid' label.
pred_results.predictions.show()

+-------------------+-----------+-------------------+
|independentfeatures|citric acid|         prediction|
+-------------------+-----------+-------------------+
|          [6.9,7.5]|        0.4|0.35664259037787693|
+-------------------+-----------+-------------------+



In [52]:
# Test-set error metrics, displayed as a (mean absolute error, mean squared error) pair.
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(0.04335740962212309, 0.0018798649691405718)