<a href="https://colab.research.google.com/github/ralsouza/apache_spark_real_time_analytics/blob/master/notebooks/08_pyspark_mllib_decision_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# Refresh the apt package index before the Java package is installed in the next cell
!apt-get update

In [4]:
# Install the dependencies
# 1) Headless OpenJDK 8 (apt output is discarded)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# 2) Spark 2.4.4 built for Hadoop 2.7, downloaded from the Apache archive ...
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
#    ... and unpacked into the current working directory
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
# 3) findspark, used further down to make pyspark importable
!pip install -q findspark

In [5]:
# Environment variables
import os
# Java installation to use (presumably where the openjdk-8 package above installs — verify on the target image)
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Unpacked Spark distribution (assumes the archive was extracted with /content as working directory — TODO confirm)
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

In [6]:
# Make pyspark "importable"
import findspark
# With no argument findspark locates Spark through the SPARK_HOME environment
# variable set above, so the absolute path is defined in a single place and the
# call no longer depends on the current working directory.
findspark.init()

In [7]:
# Libraries and Context Setup
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [8]:
# Spark configuration: local master, application name and UI port.
# Everything is set on one SparkConf so that it is applied when the context is
# created (options passed to the builder after a context already exists are ignored).
conf = (SparkConf()
        .setMaster('local')
        .setAppName('spark_ml_lib')
        .set("spark.ui.port", "4050"))

# Create the Spark session from the configuration.
# getOrCreate() reuses an existing session, so re-running this cell does not fail
# with "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Spark context owned by the session
sc = spark.sparkContext

# Create the SQL Context
sqlContext = pyspark.SQLContext(sc)

# Libraries


In [9]:
from pyspark.sql               import Row
from pyspark.ml.feature        import StringIndexer
from pyspark.ml.linalg         import Vectors
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation     import MulticlassClassificationEvaluator

# Load data

In [10]:
# Load the iris CSV as an RDD of text lines
# NOTE(review): the path points into Google Drive; this notebook does not show the
# Drive mount step, so it is assumed to be mounted at /content/drive beforehand — confirm.
rdd_iris = sc.textFile('/content/drive/My Drive/Colab Notebooks/08-apache-spark/data/mllib/iris.csv')

In [11]:
# Cache the raw-lines RDD, since several of the following cells run actions on it
rdd_iris.cache()

/content/drive/My Drive/Colab Notebooks/08-apache-spark/data/mllib/iris.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [11]:
# Count the lines read from the file (the header line is still included here)
rdd_iris.count()

151

In [12]:
# Show the first five lines
rdd_iris.take(5)

['Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species',
 '5.1,3.5,1.4,0.2,setosa',
 '4.9,3,1.4,0.2,setosa',
 '4.7,3.2,1.3,0.2,setosa',
 '4.6,3.1,1.5,0.2,setosa']

In [13]:
# Remove the header
# The header is the first line of the file. Drop exactly that line instead of
# every line that happens to contain the text 'Sepal'.
header = rdd_iris.first()
rdd_iris2 = rdd_iris.filter(lambda line: line != header)
# Number of data lines left after dropping the header
rdd_iris2.count()

150

# Data Cleansing

In [14]:
# Split each line on commas, turning every record into a list of string fields
rdd_iris3 = rdd_iris2.map(lambda x: x.split(','))

In [15]:
# Inspect the first five split records
rdd_iris3.take(5)

[['5.1', '3.5', '1.4', '0.2', 'setosa'],
 ['4.9', '3', '1.4', '0.2', 'setosa'],
 ['4.7', '3.2', '1.3', '0.2', 'setosa'],
 ['4.6', '3.1', '1.5', '0.2', 'setosa'],
 ['5', '3.6', '1.4', '0.2', 'setosa']]

In [16]:
# Column mapping and converting data types
def _to_iris_row(fields):
  """Turn one split CSV record into a Row.

  The first four fields are converted to float (sepal length, sepal width,
  petal length, petal width); the fifth is kept as the species string.
  """
  sepal_length = float(fields[0])
  sepal_width = float(fields[1])
  petal_length = float(fields[2])
  petal_width = float(fields[3])
  species = fields[4]
  return Row(SEPAL_LENGTH = sepal_length, SEPAL_WIDTH = sepal_width,
             PETAL_LENGTH = petal_length, PETAL_WIDTH = petal_width,
             SPECIES = species)

rdd_iris4 = rdd_iris3.map(_to_iris_row)

In [17]:
# Inspect the first five typed Row records
rdd_iris4.take(5)

[Row(PETAL_LENGTH=1.4, PETAL_WIDTH=0.2, SEPAL_LENGTH=5.1, SEPAL_WIDTH=3.5, SPECIES='setosa'),
 Row(PETAL_LENGTH=1.4, PETAL_WIDTH=0.2, SEPAL_LENGTH=4.9, SEPAL_WIDTH=3.0, SPECIES='setosa'),
 Row(PETAL_LENGTH=1.3, PETAL_WIDTH=0.2, SEPAL_LENGTH=4.7, SEPAL_WIDTH=3.2, SPECIES='setosa'),
 Row(PETAL_LENGTH=1.5, PETAL_WIDTH=0.2, SEPAL_LENGTH=4.6, SEPAL_WIDTH=3.1, SPECIES='setosa'),
 Row(PETAL_LENGTH=1.4, PETAL_WIDTH=0.2, SEPAL_LENGTH=5.0, SEPAL_WIDTH=3.6, SPECIES='setosa')]

In [18]:
# Create a DataFrame from the RDD of Row objects
df_iris = spark.createDataFrame(rdd_iris4)

In [19]:
# Print the first rows of the DataFrame as a text table
df_iris.show()

+------------+-----------+------------+-----------+-------+
|PETAL_LENGTH|PETAL_WIDTH|SEPAL_LENGTH|SEPAL_WIDTH|SPECIES|
+------------+-----------+------------+-----------+-------+
|         1.4|        0.2|         5.1|        3.5| setosa|
|         1.4|        0.2|         4.9|        3.0| setosa|
|         1.3|        0.2|         4.7|        3.2| setosa|
|         1.5|        0.2|         4.6|        3.1| setosa|
|         1.4|        0.2|         5.0|        3.6| setosa|
|         1.7|        0.4|         5.4|        3.9| setosa|
|         1.4|        0.3|         4.6|        3.4| setosa|
|         1.5|        0.2|         5.0|        3.4| setosa|
|         1.4|        0.2|         4.4|        2.9| setosa|
|         1.5|        0.1|         4.9|        3.1| setosa|
|         1.5|        0.2|         5.4|        3.7| setosa|
|         1.6|        0.2|         4.8|        3.4| setosa|
|         1.4|        0.1|         4.8|        3.0| setosa|
|         1.1|        0.1|         4.3| 

In [20]:
# Cache the DataFrame, which is reused by the indexing and analysis cells below
df_iris.cache()

DataFrame[PETAL_LENGTH: double, PETAL_WIDTH: double, SEPAL_LENGTH: double, SEPAL_WIDTH: double, SPECIES: string]

In [21]:
# Fetch the first five rows as Row objects
df_iris.take(5)

[Row(PETAL_LENGTH=1.4, PETAL_WIDTH=0.2, SEPAL_LENGTH=5.1, SEPAL_WIDTH=3.5, SPECIES='setosa'),
 Row(PETAL_LENGTH=1.4, PETAL_WIDTH=0.2, SEPAL_LENGTH=4.9, SEPAL_WIDTH=3.0, SPECIES='setosa'),
 Row(PETAL_LENGTH=1.3, PETAL_WIDTH=0.2, SEPAL_LENGTH=4.7, SEPAL_WIDTH=3.2, SPECIES='setosa'),
 Row(PETAL_LENGTH=1.5, PETAL_WIDTH=0.2, SEPAL_LENGTH=4.6, SEPAL_WIDTH=3.1, SPECIES='setosa'),
 Row(PETAL_LENGTH=1.4, PETAL_WIDTH=0.2, SEPAL_LENGTH=5.0, SEPAL_WIDTH=3.6, SPECIES='setosa')]

In [23]:
# Create a numeric index for the target label column
# SPECIES (string) is mapped to a numeric code stored in the new IDX_SPECIES column
string_indexer = StringIndexer(inputCol='SPECIES',outputCol='IDX_SPECIES')
# Fit the indexer on the data, then add the index column to a new DataFrame
si_model = string_indexer.fit(df_iris)
df_iris_norm = si_model.transform(df_iris)

In [24]:
# Check which numeric index was assigned to each species
df_iris_norm.select('SPECIES','IDX_SPECIES').distinct().collect()

[Row(SPECIES='versicolor', IDX_SPECIES=0.0),
 Row(SPECIES='setosa', IDX_SPECIES=2.0),
 Row(SPECIES='virginica', IDX_SPECIES=1.0)]

# Exploratory Data Analysis

In [25]:
# Descriptive statistics (count, mean, stddev, min, max) for every column
df_iris_norm.describe().show()

+-------+------------------+------------------+------------------+------------------+---------+------------------+
|summary|      PETAL_LENGTH|       PETAL_WIDTH|      SEPAL_LENGTH|       SEPAL_WIDTH|  SPECIES|       IDX_SPECIES|
+-------+------------------+------------------+------------------+------------------+---------+------------------+
|  count|               150|               150|               150|               150|      150|               150|
|   mean| 3.758000000000001|1.1993333333333331| 5.843333333333332|3.0573333333333337|     null|               1.0|
| stddev|1.7652982332594662|0.7622376689603467|0.8280661279778634|0.4358662849366978|     null|0.8192319205190404|
|    min|               1.0|               0.1|               4.3|               2.0|   setosa|               0.0|
|    max|               6.9|               2.5|               7.9|               4.4|virginica|               2.0|
+-------+------------------+------------------+------------------+--------------

In [28]:
# Show the correlation
# Correlate every numeric feature column with the indexed target (IDX_SPECIES).
# Column types are read from the DataFrame schema (dtypes), so no Spark job is
# launched just to inspect a sample value, and a null in the first row cannot
# make a string column look numeric.
for i, col_type in df_iris_norm.dtypes:
  # Skip string columns and the target itself
  if col_type != 'string' and i != 'IDX_SPECIES':
    print('Correlation between IDX_SPECIES with ', i, df_iris_norm.stat.corr('IDX_SPECIES',i))

Correlation between IDX_SPECIES with  PETAL_LENGTH -0.649241830764174
Correlation between IDX_SPECIES with  PETAL_WIDTH -0.5803770334306263
Correlation between IDX_SPECIES with  SEPAL_LENGTH -0.46003915650023686
Correlation between IDX_SPECIES with  SEPAL_WIDTH 0.6183715308237434


# Data preprocessing

In [29]:
# Build (species, label, feature vector) records for model training
def transform_var(row):
  """Convert one row into a (SPECIES, IDX_SPECIES, features) tuple.

  The features are a dense vector holding, in this order, SEPAL_LENGTH,
  SEPAL_WIDTH, PETAL_LENGTH and PETAL_WIDTH; all four measurements are kept.
  """
  feature_columns = ('SEPAL_LENGTH', 'SEPAL_WIDTH', 'PETAL_LENGTH', 'PETAL_WIDTH')
  features = Vectors.dense([row[name] for name in feature_columns])
  return (row['SPECIES'], row['IDX_SPECIES'], features)

In [30]:
# Apply transform_var to every row of the indexed DataFrame
rdd_iris5 = df_iris_norm.rdd.map(transform_var)

In [31]:
# Inspect the first five (species, label, features) tuples
rdd_iris5.take(5)

[('setosa', 2.0, DenseVector([5.1, 3.5, 1.4, 0.2])),
 ('setosa', 2.0, DenseVector([4.9, 3.0, 1.4, 0.2])),
 ('setosa', 2.0, DenseVector([4.7, 3.2, 1.3, 0.2])),
 ('setosa', 2.0, DenseVector([4.6, 3.1, 1.5, 0.2])),
 ('setosa', 2.0, DenseVector([5.0, 3.6, 1.4, 0.2]))]

In [34]:
# Build the modelling DataFrame with named columns for the tuple fields
df_iris2 = spark.createDataFrame(rdd_iris5,['SPECIES','LABEL','FEATURES'])
# Preview the first ten rows
df_iris2.select('SPECIES','LABEL','FEATURES').show(10)
# Cache it for the split/training cells below (as the last expression, its repr is displayed)
df_iris2.cache()

+-------+-----+-----------------+
|SPECIES|LABEL|         FEATURES|
+-------+-----+-----------------+
| setosa|  2.0|[5.1,3.5,1.4,0.2]|
| setosa|  2.0|[4.9,3.0,1.4,0.2]|
| setosa|  2.0|[4.7,3.2,1.3,0.2]|
| setosa|  2.0|[4.6,3.1,1.5,0.2]|
| setosa|  2.0|[5.0,3.6,1.4,0.2]|
| setosa|  2.0|[5.4,3.9,1.7,0.4]|
| setosa|  2.0|[4.6,3.4,1.4,0.3]|
| setosa|  2.0|[5.0,3.4,1.5,0.2]|
| setosa|  2.0|[4.4,2.9,1.4,0.2]|
| setosa|  2.0|[4.9,3.1,1.5,0.1]|
+-------+-----+-----------------+
only showing top 10 rows



DataFrame[SPECIES: string, LABEL: double, FEATURES: vector]

# Machine Learning

In [35]:
# Train and test data
# Roughly 70% of the rows go to training and 30% to testing. The fixed seed
# makes the split, and therefore every metric computed below, reproducible.
(train_data, test_data) = df_iris2.randomSplit([0.7,0.3], seed=42)

In [36]:
# Number of rows in the training split
train_data.count()

99

In [37]:
# Number of rows in the test split
test_data.count()

51

In [38]:
# Create and train the model
# maxDepth limits how deep the tree may grow (it is not the number of nodes)
dt_classifer = DecisionTreeClassifier(maxDepth=2,labelCol='LABEL',featuresCol='FEATURES')
# Fit the classifier on the training split
model = dt_classifer.fit(train_data)

In [40]:
# Depth of the fitted tree
model.depth

2

In [41]:
# Number of nodes in the fitted tree
model.numNodes

5

In [43]:
# Present unseen data (test_data) to the model to obtain predictions
predictions = model.transform(test_data)
# Collect predicted vs. actual values for every test row
# NOTE(review): 'species' and 'label' are written in lower case while the columns were
# created as SPECIES/LABEL; the recorded output shows they resolve — confirm this is intended.
predictions.select('prediction','species','label').collect()

[Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='s

In [48]:
# Accuracy evaluation
# Compare the 'prediction' column against the true 'LABEL' column using the accuracy metric
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='LABEL',metricName='accuracy')
# Accuracy on the test-set predictions
evaluator.evaluate(predictions)

0.9215686274509803

In [49]:
# Row counts per (true LABEL, prediction) pair, i.e. a confusion matrix in long form
predictions.groupby('LABEL','prediction').count().show()

+-----+----------+-----+
|LABEL|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|   12|
|  0.0|       1.0|    2|
|  2.0|       2.0|   21|
|  1.0|       0.0|    2|
|  0.0|       0.0|   14|
+-----+----------+-----+

