In [2]:
# findspark.init() makes the local Spark installation importable, so it has
# to run before the pyspark import that follows.
import findspark
findspark.init()
from pyspark.sql import SparkSession

# Build the session, or reuse the one already running in this kernel.
# NOTE(review): app name "A" and config "B" -> "C" look like placeholders -- confirm.
spark = (
    SparkSession.builder
    .appName("A")
    .config("B", "C")
    .getOrCreate()
)

# SparkContext handle, needed by the RDD-based MLlib loaders used later on.
sc = spark.sparkContext

In [3]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Load and parse the LibSVM-format data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, './sample_libsvm_data.txt')

# Fixed seed for the random split, so that the train/test partition -- and with
# it the test error and the learned tree -- is repeatable across
# Restart & Run All. Outputs recorded from earlier, unseeded runs may differ.
SPLIT_SEED = 42

# Split the data into training and test sets (30% held out for testing).
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [4]:
# Fit a decision-tree classifier on the training split.
#
# Parameter notes:
#   trainingData            - the data used for training.
#   numClasses              - how many distinct values the response (label)
#                             can take.
#   categoricalFeaturesInfo - declares which features are categorical
#                             (qualitative); an empty dict means every feature
#                             is treated as continuous.
#   impurity                - measure of how "impure" a leaf node is. Ideally
#                             each leaf holds only 0s or only 1s; the purer
#                             the better. Two formulas are available:
#                               'entropy' -> entropy / information gain
#                               'gini'    -> Gini index
#   maxDepth                - depth limit of the tree. Finer splits lower the
#                             impurity and fit the data better, but risk
#                             overfitting; an oversized tree should be pruned.
#   maxBins                 - continuous features are discretized into
#                             intervals (bins) before splitting; this is the
#                             maximum number of bins.
model = DecisionTree.trainClassifier(
    trainingData,
    numClasses=2,
    categoricalFeaturesInfo={},
    impurity='gini',
    maxDepth=5,
    maxBins=32,
)


In [5]:
# Evaluate the model on the held-out test instances and compute the test error.

# Predict from the feature vectors only (the explanatory variables).
predictions = model.predict(testData.map(lambda x: x.features))
# Pair every true label with its prediction: lp[0] is the label,
# lp[1] is the prediction.
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

# Count the test set once and guard the division: with an empty test split
# the error rate is undefined, and dividing would raise ZeroDivisionError.
numTest = testData.count()
if numTest == 0:
    testErr = float('nan')
else:
    # Keep only the pairs where label and prediction disagree, count them,
    # and divide by the size of the test set.
    numWrong = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count()
    testErr = numWrong / float(numTest)

print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
# toDebugString() returns a text rendering of the fitted tree's structure.
print(model.toDebugString())

Test Error = 0.037037037037037035
Learned classification tree model:
DecisionTreeModel classifier of depth 2 with 5 nodes
  If (feature 406 <= 22.0)
   If (feature 100 <= 193.5)
    Predict: 0.0
   Else (feature 100 > 193.5)
    Predict: 1.0
  Else (feature 406 > 22.0)
   Predict: 1.0



Read the examples on slides 4–20 of the PPT, which explain how a decision tree is expanded (grown).