# DECISION TREE CLASSIFIER

### LOAD THE DATASET

#### Using the Bank Note Authentication Dataset - http://archive.ics.uci.edu/ml/datasets/banknote+authentication#

In [1]:
# Load the UCI banknote-authentication CSV. No header option is passed to the
# reader, so the five columns are named explicitly via toDF(); inferSchema lets
# Spark pick the column types.
banknoteDataset = (
    spark.read
    .csv("data_banknote_authentication.csv", sep=',', inferSchema=True)
    .toDF('variance', 'skewness', 'curtosis', 'entropy', 'class')
)
print(banknoteDataset.head())
# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
banknoteDataset.printSchema()

Row(variance=3.6216, skewness=8.6661, curtosis=-2.8073, entropy=-0.44699, class=0)
root
 |-- variance: double (nullable = true)
 |-- skewness: double (nullable = true)
 |-- curtosis: double (nullable = true)
 |-- entropy: double (nullable = true)
 |-- class: integer (nullable = true)

None


In [2]:
# Preview the first five rows as a formatted table.
banknoteDataset.show(n=5)

+--------+--------+--------+--------+-----+
|variance|skewness|curtosis| entropy|class|
+--------+--------+--------+--------+-----+
|  3.6216|  8.6661| -2.8073|-0.44699|    0|
|  4.5459|  8.1674| -2.4586| -1.4621|    0|
|   3.866| -2.6383|  1.9242| 0.10645|    0|
|  3.4566|  9.5228| -4.0112| -3.5944|    0|
| 0.32924| -4.4552|  4.5718| -0.9888|    0|
+--------+--------+--------+--------+-----+
only showing top 5 rows



In [3]:
# Summary statistics (count, mean, stddev, min, max) for every column.
(
    banknoteDataset
    .describe()
    .show()
)

+-------+------------------+------------------+------------------+------------------+------------------+
|summary|          variance|          skewness|          curtosis|           entropy|             class|
+-------+------------------+------------------+------------------+------------------+------------------+
|  count|              1372|              1372|              1372|              1372|              1372|
|   mean|0.4337352570699707|1.9223531206393603|1.3976271172667651|-1.191656520043731|0.4446064139941691|
| stddev|2.8427625862785577| 5.869046743695513| 4.310030090106595| 2.101013137359609|0.4971032701256608|
|    min|           -7.0421|          -13.7731|           -5.2861|           -8.5482|                 0|
|    max|            6.8248|           12.9516|           17.9274|            2.4495|                 1|
+-------+------------------+------------------+------------------+------------------+------------------+



In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

### ASSEMBLE THE 4 FEATURE COLUMNS INTO A SINGLE FEATURES VECTOR COLUMN

In [5]:
# Assembler that packs the four measurement columns into one vector column
# called "features". Line continuations are unnecessary inside the parentheses.
vector_assembler = VectorAssembler(
    inputCols=['variance', 'skewness', 'curtosis', 'entropy'],
    outputCol="features",
)

In [6]:
# Run the assembler over the raw dataset; the result keeps the original
# columns and gains the "features" vector column.
df_temp = vector_assembler.transform(dataset=banknoteDataset)
df_temp.show(n=3)

+--------+--------+--------+--------+-----+--------------------+
|variance|skewness|curtosis| entropy|class|            features|
+--------+--------+--------+--------+-----+--------------------+
|  3.6216|  8.6661| -2.8073|-0.44699|    0|[3.6216,8.6661,-2...|
|  4.5459|  8.1674| -2.4586| -1.4621|    0|[4.5459,8.1674,-2...|
|   3.866| -2.6383|  1.9242| 0.10645|    0|[3.866,-2.6383,1....|
+--------+--------+--------+--------+-----+--------------------+
only showing top 3 rows



In [7]:
# The individual measurement columns are now redundant with "features";
# keep only the label and the assembled vector.
raw_feature_columns = ('variance', 'skewness', 'curtosis', 'entropy')
df = df_temp.drop(*raw_feature_columns)
df.show(n=3)

+-----+--------------------+
|class|            features|
+-----+--------------------+
|    0|[3.6216,8.6661,-2...|
|    0|[4.5459,8.1674,-2...|
|    0|[3.866,-2.6383,1....|
+-----+--------------------+
only showing top 3 rows



### INDEX THE CLASS LABELS INTO NUMERIC INDEXES

In [8]:
from pyspark.ml.feature import StringIndexer
# Indexer that maps the "class" label column to a numeric "classIndex" column.
# NOTE(review): StringIndexer orders labels by frequency by default, so
# classIndex is not guaranteed to equal the original class value — confirm the
# mapping on the data actually used for fitting.
l_indexer = StringIndexer(
    inputCol="class",
    outputCol="classIndex",
)

In [9]:
# Fit the label indexer on the assembled frame and append "classIndex".
#
# The cell rebinds `df`, so on a second execution `df` already carries the
# "classIndex" column and StringIndexer rejects it because its output column
# exists. Dropping the indexer's output column from the input first makes the
# cell safe to re-run; drop() is a no-op when the column is absent, so the
# first execution behaves exactly as before.
indexer_input = df.drop(l_indexer.getOutputCol())
df = l_indexer.fit(indexer_input).transform(indexer_input)
df.show(5)

+-----+--------------------+----------+
|class|            features|classIndex|
+-----+--------------------+----------+
|    0|[3.6216,8.6661,-2...|       0.0|
|    0|[4.5459,8.1674,-2...|       0.0|
|    0|[3.866,-2.6383,1....|       0.0|
|    0|[3.4566,9.5228,-4...|       0.0|
|    0|[0.32924,-4.4552,...|       0.0|
+-----+--------------------+----------+
only showing top 5 rows



### SPLIT THE DATA INTO TRAINING AND TEST SETS [ HERE 70% IS TRAINING ]

In [10]:
# Randomly split the dataset into roughly 70% training / 30% test rows.
# The raw frame is split (not the pre-assembled `df`) because the Pipeline
# built later applies the vector assembler and label indexer itself.
#
# A fixed seed makes the split — and therefore the row counts, the fitted tree
# and the reported test error — reproducible across kernel restarts. Without
# it every run produced a different partition.
(trainingData, testData) = banknoteDataset.randomSplit([0.7, 0.3], seed=42)
print(trainingData.count())
print(testData.count())
testData.show()

965
407
+--------+--------+--------+--------+-----+
|variance|skewness|curtosis| entropy|class|
+--------+--------+--------+--------+-----+
| -7.0421|     9.2| 0.25933| -4.6832|    1|
| -6.3979|  6.4479|  1.0836| -6.6176|    1|
| -6.3364|  9.2848|0.014275| -6.7844|    1|
| -6.1536|  7.9295| 0.61663| -3.2646|    1|
| -5.8818|  7.6584|  0.5558| -2.9155|    1|
|  -5.637|  8.1261| 0.13081| -5.0142|    1|
| -5.4808|  8.1819| 0.27818| -5.0323|    1|
| -5.3857|  9.1214|-0.41929| -5.9181|    1|
| -5.3012|  7.3915|0.029699| -7.3987|    1|
| -5.2049|   7.259|0.070827| -7.3004|    1|
| -5.1661|  8.0433|0.044265| -4.4983|    1|
| -4.9462|  3.5716| 0.82742| -1.4957|    1|
| -4.8861|  7.0542|-0.17252|  -6.959|    1|
| -4.8392|  6.6755|-0.24278| -6.5775|    1|
| -4.7462|  3.1205|   1.075| -1.2966|    1|
| -4.6765| -5.6636|  10.969|-0.33449|    1|
| -4.5531|-12.5854| 15.4417| -1.4983|    1|
| -4.4996|  3.4288| 0.56265| -1.1672|    1|
| -4.4775|-13.0303| 17.0834| -3.0345|    1|
| -4.4018|-12.9371| 15.6

In [11]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

### DEFINE THE DECISION TREE CLASSIFIER

In [12]:
# Decision-tree estimator reading the indexed label and the assembled feature
# vector; nothing is trained until the pipeline is fitted.
dt = DecisionTreeClassifier(
    labelCol="classIndex",
    featuresCol="features",
)

In [13]:
from pyspark.ml import Pipeline

### CHAIN THE ASSEMBLER, INDEXER AND TREE IN A PIPELINE

In [14]:
# Stage order matters: assemble features, index the label, then the tree.
pipeline_stages = [vector_assembler, l_indexer, dt]
pipeline = Pipeline(stages=pipeline_stages)

### TRAIN THE MODEL

In [15]:
# Fit every pipeline stage on the training split only.
pipelinemodel = pipeline.fit(dataset=trainingData)

### MAKE PREDICTIONS

In [16]:
# Score the held-out test split with the fitted pipeline.
predictions = pipelinemodel.transform(dataset=testData)

In [17]:
# Show predicted vs. actual label index next to the feature vector.
(
    predictions
    .select("prediction", "classIndex", "features")
    .show(n=5)
)

+----------+----------+--------------------+
|prediction|classIndex|            features|
+----------+----------+--------------------+
|       1.0|       1.0|[-7.0421,9.2,0.25...|
|       1.0|       1.0|[-6.3979,6.4479,1...|
|       1.0|       1.0|[-6.3364,9.2848,0...|
|       1.0|       1.0|[-6.1536,7.9295,0...|
|       1.0|       1.0|[-5.8818,7.6584,0...|
+----------+----------+--------------------+
only showing top 5 rows



### COMPUTE TEST ERROR

In [18]:
# Measure accuracy on the test predictions and report its complement as the
# test error.
evaluator = MulticlassClassificationEvaluator(
    labelCol="classIndex",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

Test Error = 0.02457 


In [19]:
# Stage index 2 is the decision tree (after the assembler and the indexer);
# printing it gives a one-line summary of the fitted model.
fitted_tree = pipelinemodel.stages[2]
print(fitted_tree)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_435ebafcae3d) of depth 5 with 29 nodes


In [20]:
# Confusion-matrix style tally: row counts per (actual, predicted) pair.
(
    predictions
    .groupBy("classIndex", "prediction")
    .count()
    .show()
)

+----------+----------+-----+
|classIndex|prediction|count|
+----------+----------+-----+
|       1.0|       1.0|  182|
|       0.0|       1.0|    2|
|       1.0|       0.0|    8|
|       0.0|       0.0|  215|
+----------+----------+-----+

