In [1]:
# findspark.init() has to run before the pyspark imports in the next cell.
# NOTE(review): presumably this locates the local Spark installation and puts
# pyspark on sys.path — confirm against the findspark documentation.
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
# Directory holding the input CSV. The trailing slash is required because the
# file path is built by plain string concatenation when the data is loaded.
data_path = "./"

In [5]:
# Create (or reuse) a local Spark session for this notebook.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Assignment")
    .getOrCreate()
)

In [6]:
# Load the raw dataset: comma-separated, with a header row, column types inferred.
csv_path = data_path + "diabetes-dataset.csv"
csv_reader = spark.read.format("csv").options(sep=',', header='true', inferschema='true')
orig_data = csv_reader.load(csv_path)

In [7]:
# Encode the "Outcome" column as a numeric "label" column.
# NOTE(review): StringIndexer orders labels by frequency by default, so label
# 0.0 is the most frequent Outcome value rather than necessarily Outcome 0 —
# confirm the mapping against the rows shown below.
outcome_indexer = StringIndexer(inputCol="Outcome", outputCol="label")
indexer = outcome_indexer.fit(orig_data)
label_data = indexer.transform(orig_data)

# Keep a transformer that maps the numeric "label" back to its original value,
# for use when reporting results. No explicit labels are set here.
# NOTE(review): presumably it relies on the "label" column metadata — confirm.
labelReverse = IndexToString(inputCol="label")

# Preview the dataframe with the numeric label column appended.
print("Dataframe with numeric lable")
label_data.show(5)

Dataframe with numeric lable
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|label|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----+
|          2|    138|           62|           35|      0|33.6|                   0.127| 47|      1|  1.0|
|          0|     84|           82|           31|    125|38.2|                   0.233| 23|      0|  0.0|
|          0|    145|            0|            0|      0|44.2|                    0.63| 31|      1|  1.0|
|          0|    135|           68|           42|    250|42.3|                   0.365| 24|      1|  1.0|
|          1|    139|           62|           41|    480|40.7|                   0.536| 21|      0|  0.0|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-----+
only showing top 5 rows

In [8]:
# Drop the original "Outcome" column: its information now lives in the numeric
# "label" column, and it must not leak into the feature vector.
label_data = label_data.drop("Outcome")

# Spark ML estimators take two columns as input:
#   - "label":    the target value the algorithm is trained to predict
#   - "features": all predictor columns packed into a single vector column

# Columns that are not part of the feature vector.
ignore = ['label']
# Every remaining column is a predictor. The variable is named feature_cols
# rather than `list` so the built-in list type is not shadowed for the rest of
# the kernel session.
feature_cols = [x for x in label_data.columns if x not in ignore]

# VectorAssembler builds the "features" vector column from the predictor columns.
assembler = VectorAssembler(
            inputCols=feature_cols,
            outputCol='features')

# Final dataframe: just the label and the assembled feature vector.
data = (assembler.transform(label_data).select("label","features"))

print("Final Dataframe suitable to classifier input format")
data.show(5)

Final Dataframe suitable to classifier input format
+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|[2.0,138.0,62.0,3...|
|  0.0|[0.0,84.0,82.0,31...|
|  1.0|(8,[1,5,6,7],[145...|
|  1.0|[0.0,135.0,68.0,4...|
|  0.0|[1.0,139.0,62.0,4...|
+-----+--------------------+
only showing top 5 rows



In [9]:
# 70/30 train/test split with a fixed seed so the partition is repeatable.
train_sample = 0.7
test_sample = 0.3
split_weights = [train_sample, test_sample]
train, test = data.randomSplit(split_weights, seed=1234)

In [10]:
# Decision tree hyper-parameters, spelled out explicitly in one place.
dt_params = dict(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
    probabilityCol='probability',
    rawPredictionCol='rawPrediction',
    maxDepth=5,
    maxBins=32,
    minInstancesPerNode=1,
    minInfoGain=0.0,
    maxMemoryInMB=256,
    cacheNodeIds=False,
    checkpointInterval=10,
    impurity='gini',
    seed=None,
)
trainer = DecisionTreeClassifier(**dt_params)

# Fit on the training split, then score the held-out test split.
model = trainer.fit(train)
result_dt = model.transform(test)

In [11]:
# Compare the "prediction" column against "label" using the accuracy metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

# evaluate() returns a fraction; scale it to a percentage for reporting.
accuracy_fraction = evaluator.evaluate(result_dt)
accuracy_dt = accuracy_fraction * 100
print("Decision Tree: accuracy = %3.1f %%" % accuracy_dt)

Decision Tree: accuracy = 79.2 %


In [12]:
# Preview the first rows of the scored test set (the output of model.transform).
print("Decision Tree Final Result")
result_dt.show(5)

Decision Tree Final Result
+-----+--------------------+-------------+--------------------+----------+
|label|            features|rawPrediction|         probability|prediction|
+-----+--------------------+-------------+--------------------+----------+
|  0.0|(8,[0,1,6,7],[2.0...|  [283.0,1.0]|[0.99647887323943...|       0.0|
|  0.0|(8,[0,1,6,7],[2.0...|  [283.0,1.0]|[0.99647887323943...|       0.0|
|  0.0|(8,[0,1,6,7],[3.0...|  [283.0,1.0]|[0.99647887323943...|       0.0|
|  0.0|(8,[1,6,7],[94.0,...|  [283.0,1.0]|[0.99647887323943...|       0.0|
|  0.0|[0.0,57.0,60.0,0....| [220.0,71.0]|[0.75601374570446...|       0.0|
+-----+--------------------+-------------+--------------------+----------+
only showing top 5 rows

