In [2]:
import findspark

In [3]:
import os

# Point findspark at the Spark distribution. Prefer the SPARK_HOME environment
# variable so the notebook is not tied to one user's home directory; fall back
# to the original local install path when the variable is not set.
spark_home = os.environ.get('SPARK_HOME', '/home/pushya/spark-2.1.0-bin-hadoop2.7')
findspark.init(spark_home)

In [4]:
import pyspark

In [5]:
from pyspark.sql import SparkSession

In [6]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('mytree')
    .getOrCreate()
)

In [7]:
from pyspark.ml import pipeline

In [8]:
from pyspark.ml.classification import (DecisionTreeClassifier,RandomForestClassifier,GBTClassifier)

In [9]:
# Load the sample file using the 'libsvm' data source.
data = (
    spark.read
    .format('libsvm')
    .load('sample_libsvm_data.txt')
)

In [10]:
# Preview the first 20 rows, with long cell values truncated.
data.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [11]:
# 70/30 train/test split. The seed is fixed so that a fresh
# "Restart & Run All" produces the same partitions, and therefore
# accuracy figures that can be compared between runs.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [15]:
# Three tree-based classifiers to compare on the same split.
# Each estimator gets an explicit seed so repeated runs of the notebook
# train comparable models instead of depending on an implicit default.
dtc = DecisionTreeClassifier(seed=42)
rfc = RandomForestClassifier(numTrees=20, seed=42)
gbt = GBTClassifier(seed=42)

In [16]:
# Fit every estimator on the training partition.
dtc_model = dtc.fit(dataset=train_data)  # decision tree
rfc_model = rfc.fit(dataset=train_data)  # random forest
gbt_model = gbt.fit(dataset=train_data)  # gradient-boosted trees

In [17]:
# Apply each fitted model to the held-out partition.
dtc_predt = dtc_model.transform(dataset=test_data)
rfc_predt = rfc_model.transform(dataset=test_data)
gbt_predt = gbt_model.transform(dataset=test_data)

In [18]:
# Inspect the first 20 decision-tree predictions, with long cell values truncated.
dtc_predt.show(n=20, truncate=True)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[95,96,97,12...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[98,99,100,1...|    [0.0,1.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[123,124,125...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[125,126,127...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[150,151,152...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[152,153,154...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[153,154,155...|   [27.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(69

In [19]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [20]:
# Accuracy evaluator shared by all three models. The column names are
# spelled out; they are the evaluator's defaults and match the columns
# shown in the prediction table.
acc_eval = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='accuracy',
)

In [22]:
# Accuracy of the decision tree on the test partition.
print('DTC Accuracy:')
acc_eval.evaluate(dataset=dtc_predt)

DTC Accuracy:


0.9655172413793104

In [23]:
# Accuracy of the gradient-boosted trees on the test partition.
print('GBT Accuracy:')
acc_eval.evaluate(dataset=gbt_predt)

GBT Accuracy:


0.9655172413793104

In [24]:
# Accuracy of the random forest on the test partition.
print('RFC Accuracy:')
acc_eval.evaluate(dataset=rfc_predt)

RFC Accuracy:


1.0

In [26]:
# Display the fitted random forest's per-feature importance vector
# (left as the cell's last expression so the notebook renders it).
rfc_model.featureImportances

SparseVector(692, {100: 0.0047, 159: 0.0027, 258: 0.0033, 263: 0.0085, 272: 0.0368, 295: 0.036, 302: 0.0427, 317: 0.0469, 324: 0.0026, 350: 0.0473, 351: 0.0441, 353: 0.0053, 357: 0.0371, 358: 0.0081, 370: 0.0063, 374: 0.0027, 384: 0.0035, 385: 0.0379, 405: 0.0864, 410: 0.0036, 411: 0.0076, 416: 0.0031, 432: 0.0025, 434: 0.0528, 463: 0.0355, 469: 0.0435, 483: 0.05, 489: 0.0503, 496: 0.0469, 498: 0.0059, 516: 0.0094, 517: 0.0629, 523: 0.0037, 524: 0.0056, 525: 0.0235, 539: 0.0465, 545: 0.0177, 550: 0.0063, 568: 0.0484, 569: 0.0005, 571: 0.0029, 573: 0.0031, 632: 0.0051})