### Tutorial on K-Means clustering algorithm

#### For more information please visit:
##### https://spark.apache.org/docs/latest/ml-classification-regression.html#decision-trees

##### http://spark.apache.org/docs/2.0.0/api/python/_modules/pyspark/ml/evaluation.html


In [3]:
# Call all the imports
from pyspark.sql import SparkSession
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [4]:
# Create a spark session
spark = SparkSession.builder.appName('trees').getOrCreate()

In [5]:
# work on libsvm formatted data
data = spark.read.format("libsvm").load("sample_libsvm_data.txt")

In [6]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [9]:
data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [10]:
# Split the data into train and test.
# The ML models are trained on the training data and evaluated on the test data for performance
train_data, test_data = data.randomSplit([0.7,0.3])

### This notebook describes 3 types of Tree based classifiers
1. Decision Tree
2. Random Forest
3. Gradient Boosted Trees
<br>

The Random Forest and Gradient Boosted Trees are variants of Decision Trees. These 2 algorithms treat the basic Decision Tree algorithm as a weak learner, combines multiple trees and builds a classifier.
<br>

Please refer the documentation for more details regarding the arguments.

In [18]:
# Create the 3 types of classifiers

dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(numTrees=75)
gbt = GBTClassifier()

In [13]:
# Build a model on the training data
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

In [14]:
# Find out the prediction
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

In [15]:
dtc_preds.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[98,99,100,1...|   [25.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[121,122,123...|   [25.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[122,123,124...|   [25.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [25.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [25.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [25.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [25.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [25.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [25.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [25.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[128,129,130...|   [25.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[151,152,153...|   [25.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(69

In [16]:
gbt_preds.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[98,99,100,1...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[121,122,123...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[122,123,124...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[123,124,125...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126

### Evaluators
Now that the model has predicted on the test data, let us perform some evaluations. 
1. Pyspark supports BinaryClassificationEvaluator and MulticlassClassificationEvaluator for 
performing evaluations for classification algorithms
2. One can measure metrics such as the Area under the curve, accuracy, precision and recall on the predictions and the input labels


In [19]:
binary_eval = BinaryClassificationEvaluator(metricName='areaUnderROC')

In [20]:
dtc_eval = binary_eval.evaluate(dtc_preds)
print("DTC AUC: ", dtc_eval)

DTC AUC:  0.9444444444444444


In [21]:
rfc_eval = binary_eval.evaluate(rfc_preds)
print("RFC AUC: ", rfc_eval)

RFC AUC:  1.0


In [22]:
gbt_eval = binary_eval.evaluate(gbt_preds)
print("GBT AUC: ", gbt_eval)

GBT AUC:  0.9444444444444444


In [23]:
# Perform evaluation on the predictions
multiclass_eval = MulticlassClassificationEvaluator(metricName='accuracy')

In [24]:
dtc_eval = multiclass_eval.evaluate(dtc_preds)
print("DTC accuracy: ", dtc_eval)

DTC accuracy:  0.9444444444444444


In [25]:
rfc_eval = multiclass_eval.evaluate(rfc_preds)
print("RFC accuracy: ", rfc_eval)

RFC accuracy:  1.0


In [26]:
gbt_eval = multiclass_eval.evaluate(gbt_preds)
print("GBT accuracy: ", gbt_eval)

GBT accuracy:  0.9444444444444444


### We see that for the above data, Random Forest does a really good job as a classifier

### Working on College dataset
#### The dataset contains statistics on colleges and the aim of the experiment is to determine whether the college is private or not 

In [27]:
data = spark.read.csv("College.csv", inferSchema=True, header=True)

In [28]:
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [29]:
data.head(1)

[Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)]

In [30]:
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [31]:
# Create a features column using assember
assembler=VectorAssembler(inputCols=['Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate'], outputCol='features')

features_data = assembler.transform(data)
features_data.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|            features|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|[1660.0,1232.0,72...|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|[2186.0,1924

In [32]:
print("Number of rows: ", features_data.count())

Number of rows:  777


In [33]:
# Convert the predictions column from string to index
private_idx = StringIndexer(inputCol='Private', outputCol='label')
final_data = private_idx.fit(features_data).transform(features_data)
final_data.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+-----+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|            features|label|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+-----+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|[1660.0,1232.0,72...|  0.0|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 1052

In [34]:
final_data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: double (nullable = true)



In [35]:
final_data_truncated = final_data.select(['features','label'])

In [36]:
final_data_truncated.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1660.0,1232.0,72...|  0.0|
|[2186.0,1924.0,51...|  0.0|
|[1428.0,1097.0,33...|  0.0|
|[417.0,349.0,137....|  0.0|
|[193.0,146.0,55.0...|  0.0|
|[587.0,479.0,158....|  0.0|
|[353.0,340.0,103....|  0.0|
|[1899.0,1720.0,48...|  0.0|
|[1038.0,839.0,227...|  0.0|
|[582.0,498.0,172....|  0.0|
|[1732.0,1425.0,47...|  0.0|
|[2652.0,1900.0,48...|  0.0|
|[1179.0,780.0,290...|  0.0|
|[1267.0,1080.0,38...|  0.0|
|[494.0,313.0,157....|  0.0|
|[1420.0,1093.0,22...|  0.0|
|[4302.0,992.0,418...|  0.0|
|[1216.0,908.0,423...|  0.0|
|[1130.0,704.0,322...|  0.0|
|[3540.0,2001.0,10...|  1.0|
+--------------------+-----+
only showing top 20 rows



In [37]:
# Split the data into train and test
train_data, test_data = final_data_truncated.randomSplit([0.7,0.3])
print('Training set size: ', train_data.count())
print('Test set size: ', test_data.count())

Training set size:  532
Test set size:  245


In [38]:
# Create the 3 types of classifiers
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(numTrees=75)
gbt = GBTClassifier()

In [39]:
# Build a model on the training data
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

In [40]:
# Find out the prediction
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

In [41]:
dtc_preds.show()

+--------------------+-----+-------------+--------------------+----------+
|            features|label|rawPrediction|         probability|prediction|
+--------------------+-----+-------------+--------------------+----------+
|[222.0,185.0,91.0...|  0.0|  [284.0,0.0]|           [1.0,0.0]|       0.0|
|[232.0,216.0,106....|  0.0|   [20.0,2.0]|[0.90909090909090...|       0.0|
|[233.0,233.0,153....|  1.0|   [10.0,0.0]|           [1.0,0.0]|       0.0|
|[235.0,217.0,121....|  0.0|  [284.0,0.0]|           [1.0,0.0]|       0.0|
|[247.0,189.0,100....|  0.0|  [284.0,0.0]|           [1.0,0.0]|       0.0|
|[261.0,192.0,111....|  0.0|  [284.0,0.0]|           [1.0,0.0]|       0.0|
|[285.0,280.0,208....|  1.0|    [0.0,5.0]|           [0.0,1.0]|       1.0|
|[291.0,245.0,126....|  0.0|  [284.0,0.0]|           [1.0,0.0]|       0.0|
|[313.0,228.0,137....|  0.0|  [284.0,0.0]|           [1.0,0.0]|       0.0|
|[335.0,284.0,132....|  0.0|  [284.0,0.0]|           [1.0,0.0]|       0.0|
|[342.0,254.0,126....|  0

In [42]:
rfc_preds.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[222.0,185.0,91.0...|  0.0|[74.75950205586,0...|[0.9967933607448,...|       0.0|
|[232.0,216.0,106....|  0.0|[67.8327047952005...|[0.90443606393600...|       0.0|
|[233.0,233.0,153....|  1.0|[42.3218385249059...|[0.56429118033207...|       0.0|
|[235.0,217.0,121....|  0.0|[74.745621865495,...|[0.99660829153993...|       0.0|
|[247.0,189.0,100....|  0.0|[74.6160273273125...|[0.99488036436416...|       0.0|
|[261.0,192.0,111....|  0.0|[74.6399986957796...|[0.99519998261039...|       0.0|
|[285.0,280.0,208....|  1.0|[31.7145458302982...|[0.42286061107064...|       1.0|
|[291.0,245.0,126....|  0.0|[71.6173496968159...|[0.95489799595754...|       0.0|
|[313.0,228.0,137....|  0.0|[74.7065801067180...|[0.99608773475624...|       0.0|
|[335.0,284.0,13

In [43]:
gbt_preds.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[222.0,185.0,91.0...|  0.0|[1.53962092213656...|[0.95602832414333...|       0.0|
|[232.0,216.0,106....|  0.0|[1.22810719081675...|[0.92101471046073...|       0.0|
|[233.0,233.0,153....|  1.0|[-0.1081136393004...|[0.44615281571216...|       1.0|
|[235.0,217.0,121....|  0.0|[1.53962092213656...|[0.95602832414333...|       0.0|
|[247.0,189.0,100....|  0.0|[1.53962092213656...|[0.95602832414333...|       0.0|
|[261.0,192.0,111....|  0.0|[1.53962092213656...|[0.95602832414333...|       0.0|
|[285.0,280.0,208....|  1.0|[-1.2135084256339...|[0.08113559101401...|       1.0|
|[291.0,245.0,126....|  0.0|[1.54007676506976...|[0.95606663381649...|       0.0|
|[313.0,228.0,137....|  0.0|[1.53962092213656...|[0.95602832414333...|       0.0|
|[335.0,284.0,13

In [44]:
# Perform evaluation on the predictions
multiclass_eval = MulticlassClassificationEvaluator(metricName='accuracy')

In [45]:
dtc_eval = multiclass_eval.evaluate(dtc_preds)
print("DTC accuracy: ", dtc_eval)

DTC accuracy:  0.9183673469387755


In [46]:
rfc_eval = multiclass_eval.evaluate(rfc_preds)
print("RFC accuracy: ", rfc_eval)

RFC accuracy:  0.9346938775510204


In [47]:
GBT_eval = multiclass_eval.evaluate(gbt_preds)
print("GBT accuracy: ", gbt_eval)

GBT accuracy:  0.9444444444444444


### Working with Dog Food dataset
#### Based on the chemical's present in the meal, determine whether this food is spoilt or not

In [48]:
data = spark.read.csv("dog_food.csv", inferSchema=True, header=True)

In [49]:
data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [50]:
data.show()

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows



In [51]:
data.head(2)

[Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0),
 Row(A=5, B=6, C=12.0, D=7, Spoiled=1.0)]

In [52]:
data.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [53]:
# Create a features column using assember
assembler=VectorAssembler(inputCols=['A', 'B', 'C', 'D'], outputCol='features')

features_data = assembler.transform(data)
features_data.show()

+---+---+----+---+-------+-------------------+
|  A|  B|   C|  D|Spoiled|           features|
+---+---+----+---+-------+-------------------+
|  4|  2|12.0|  3|    1.0| [4.0,2.0,12.0,3.0]|
|  5|  6|12.0|  7|    1.0| [5.0,6.0,12.0,7.0]|
|  6|  2|13.0|  6|    1.0| [6.0,2.0,13.0,6.0]|
|  4|  2|12.0|  1|    1.0| [4.0,2.0,12.0,1.0]|
|  4|  2|12.0|  3|    1.0| [4.0,2.0,12.0,3.0]|
| 10|  3|13.0|  9|    1.0|[10.0,3.0,13.0,9.0]|
|  8|  5|14.0|  5|    1.0| [8.0,5.0,14.0,5.0]|
|  5|  8|12.0|  8|    1.0| [5.0,8.0,12.0,8.0]|
|  6|  5|12.0|  9|    1.0| [6.0,5.0,12.0,9.0]|
|  3|  3|12.0|  1|    1.0| [3.0,3.0,12.0,1.0]|
|  9|  8|11.0|  3|    1.0| [9.0,8.0,11.0,3.0]|
|  1| 10|12.0|  3|    1.0|[1.0,10.0,12.0,3.0]|
|  1|  5|13.0| 10|    1.0|[1.0,5.0,13.0,10.0]|
|  2| 10|12.0|  6|    1.0|[2.0,10.0,12.0,6.0]|
|  1| 10|11.0|  4|    1.0|[1.0,10.0,11.0,4.0]|
|  5|  3|12.0|  2|    1.0| [5.0,3.0,12.0,2.0]|
|  4|  9|11.0|  8|    1.0| [4.0,9.0,11.0,8.0]|
|  5|  1|11.0|  1|    1.0| [5.0,1.0,11.0,1.0]|
|  4|  9|12.0

In [54]:
final_data = features_data.select(['features','Spoiled'])
final_data.show()

+-------------------+-------+
|           features|Spoiled|
+-------------------+-------+
| [4.0,2.0,12.0,3.0]|    1.0|
| [5.0,6.0,12.0,7.0]|    1.0|
| [6.0,2.0,13.0,6.0]|    1.0|
| [4.0,2.0,12.0,1.0]|    1.0|
| [4.0,2.0,12.0,3.0]|    1.0|
|[10.0,3.0,13.0,9.0]|    1.0|
| [8.0,5.0,14.0,5.0]|    1.0|
| [5.0,8.0,12.0,8.0]|    1.0|
| [6.0,5.0,12.0,9.0]|    1.0|
| [3.0,3.0,12.0,1.0]|    1.0|
| [9.0,8.0,11.0,3.0]|    1.0|
|[1.0,10.0,12.0,3.0]|    1.0|
|[1.0,5.0,13.0,10.0]|    1.0|
|[2.0,10.0,12.0,6.0]|    1.0|
|[1.0,10.0,11.0,4.0]|    1.0|
| [5.0,3.0,12.0,2.0]|    1.0|
| [4.0,9.0,11.0,8.0]|    1.0|
| [5.0,1.0,11.0,1.0]|    1.0|
|[4.0,9.0,12.0,10.0]|    1.0|
| [5.0,8.0,10.0,9.0]|    1.0|
+-------------------+-------+
only showing top 20 rows



In [55]:
# Split the data into train and test
train_data, test_data = final_data.randomSplit([0.7,0.3])
print('Training set size: ', train_data.count())
print('Test set size: ', test_data.count())

Training set size:  360
Test set size:  130


In [56]:
# Create the 3 types of classifiers
dtc = DecisionTreeClassifier(labelCol='Spoiled')
rfc = RandomForestClassifier(labelCol='Spoiled')
gbt = GBTClassifier(labelCol='Spoiled')

In [57]:
# Build a model on the training data
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

In [58]:
# Find out the prediction
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

In [59]:
dtc_preds.show()

+-------------------+-------+-------------+-----------+----------+
|           features|Spoiled|rawPrediction|probability|prediction|
+-------------------+-------+-------------+-----------+----------+
|  [1.0,5.0,8.0,3.0]|    0.0|  [201.0,0.0]|  [1.0,0.0]|       0.0|
|[1.0,6.0,11.0,10.0]|    1.0|   [0.0,10.0]|  [0.0,1.0]|       1.0|
|  [1.0,7.0,7.0,6.0]|    0.0|  [201.0,0.0]|  [1.0,0.0]|       0.0|
|  [1.0,8.0,6.0,6.0]|    0.0|  [201.0,0.0]|  [1.0,0.0]|       0.0|
|  [1.0,8.0,8.0,7.0]|    0.0|  [201.0,0.0]|  [1.0,0.0]|       0.0|
|  [1.0,9.0,7.0,4.0]|    0.0|  [201.0,0.0]|  [1.0,0.0]|       0.0|
|[1.0,10.0,11.0,4.0]|    1.0|   [0.0,78.0]|  [0.0,1.0]|       1.0|
|  [2.0,1.0,7.0,9.0]|    0.0|  [201.0,0.0]|  [1.0,0.0]|       0.0|
|  [2.0,1.0,9.0,1.0]|    0.0|    [6.0,0.0]|  [1.0,0.0]|       0.0|
| [2.0,2.0,6.0,10.0]|    0.0|  [201.0,0.0]|  [1.0,0.0]|       0.0|
|  [2.0,2.0,8.0,1.0]|    0.0|    [6.0,0.0]|  [1.0,0.0]|       0.0|
|  [2.0,2.0,9.0,8.0]|    0.0|  [201.0,0.0]|  [1.0,0.0]|       

In [60]:
rfc_preds.show()

+-------------------+-------+--------------------+--------------------+----------+
|           features|Spoiled|       rawPrediction|         probability|prediction|
+-------------------+-------+--------------------+--------------------+----------+
|  [1.0,5.0,8.0,3.0]|    0.0|[19.6785714285714...|[0.98392857142857...|       0.0|
|[1.0,6.0,11.0,10.0]|    1.0|          [0.5,19.5]|       [0.025,0.975]|       1.0|
|  [1.0,7.0,7.0,6.0]|    0.0|[19.9545454545454...|[0.99772727272727...|       0.0|
|  [1.0,8.0,6.0,6.0]|    0.0|[19.9545454545454...|[0.99772727272727...|       0.0|
|  [1.0,8.0,8.0,7.0]|    0.0|[19.9545454545454...|[0.99772727272727...|       0.0|
|  [1.0,9.0,7.0,4.0]|    0.0|          [20.0,0.0]|           [1.0,0.0]|       0.0|
|[1.0,10.0,11.0,4.0]|    1.0|          [1.0,19.0]|         [0.05,0.95]|       1.0|
|  [2.0,1.0,7.0,9.0]|    0.0|          [20.0,0.0]|           [1.0,0.0]|       0.0|
|  [2.0,1.0,9.0,1.0]|    0.0|[19.0857142857142...|[0.95428571428571...|       0.0|
| [2

In [61]:
gbt_preds.show()

+-------------------+-------+--------------------+--------------------+----------+
|           features|Spoiled|       rawPrediction|         probability|prediction|
+-------------------+-------+--------------------+--------------------+----------+
|  [1.0,5.0,8.0,3.0]|    0.0|[1.54353023554322...|[0.95635583547248...|       0.0|
|[1.0,6.0,11.0,10.0]|    1.0|[-1.5435020027249...|[0.04364652142729...|       1.0|
|  [1.0,7.0,7.0,6.0]|    0.0|[1.54353023554322...|[0.95635583547248...|       0.0|
|  [1.0,8.0,6.0,6.0]|    0.0|[1.54353023554322...|[0.95635583547248...|       0.0|
|  [1.0,8.0,8.0,7.0]|    0.0|[1.54353023554322...|[0.95635583547248...|       0.0|
|  [1.0,9.0,7.0,4.0]|    0.0|[1.54353023554322...|[0.95635583547248...|       0.0|
|[1.0,10.0,11.0,4.0]|    1.0|[-1.5435020027249...|[0.04364652142729...|       1.0|
|  [2.0,1.0,7.0,9.0]|    0.0|[1.54052534090612...|[0.95610430166578...|       0.0|
|  [2.0,1.0,9.0,1.0]|    0.0|[1.23210623028690...|[0.92159458790110...|       0.0|
| [2

In [62]:
# Perform evaluation on the predictions
multiclass_eval = MulticlassClassificationEvaluator(labelCol='Spoiled', metricName='accuracy')

In [63]:
dtc_eval = multiclass_eval.evaluate(dtc_preds)
print("DTC accuracy: ", dtc_eval)

DTC accuracy:  0.9769230769230769


In [64]:
rfc_eval = multiclass_eval.evaluate(rfc_preds)
print("RFC accuracy: ", rfc_eval)

RFC accuracy:  0.9769230769230769


In [65]:
gbt_eval = multiclass_eval.evaluate(gbt_preds)
print("GBT accuracy: ", gbt_eval)

GBT accuracy:  0.9692307692307692


In [66]:
# Feature importances
dtc_model.featureImportances

SparseVector(4, {0: 0.0063, 1: 0.0088, 2: 0.9499, 3: 0.035})

In [67]:
# Feature importances
rfc_model.featureImportances

SparseVector(4, {0: 0.0181, 1: 0.0174, 2: 0.9389, 3: 0.0256})

In [68]:
# Feature importances
gbt_model.featureImportances

SparseVector(4, {0: 0.0517, 1: 0.0321, 2: 0.8696, 3: 0.0466})

#### Another feature of PySpark's MLLib is that the user can see the feature importance of the models for every features as shown above