[SPARK-13787][ML][PYSPARK] Pyspark feature importances for decision tree and random forest

## What changes were proposed in this pull request?

This patch adds a `featureImportances` property to the PySpark API for `DecisionTreeRegressionModel`, `DecisionTreeClassificationModel`, `RandomForestRegressionModel` and `RandomForestClassificationModel`.

## How was this patch tested?

Python doc tests for the affected classes were updated to check feature importances.
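
As additional context (not part of the patch), a minimal usage sketch of the new property. The local Spark setup and toy DataFrame below are illustrative, and the vector import path shown is the pre-2.0 `pyspark.mllib.linalg` one used by the existing doctests:

```python
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.linalg import Vectors  # pyspark.ml.linalg from Spark 2.0 onward
from pyspark.ml.classification import DecisionTreeClassifier

sc = SparkContext("local[2]", "feature-importances-sketch")
sqlContext = SQLContext(sc)

# Toy data mirroring the doctests: one feature that perfectly separates the labels.
df = sqlContext.createDataFrame(
    [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], []))],
    ["label", "features"])

model = DecisionTreeClassifier(maxDepth=2).fit(df)
# All of the importance mass lands on the single feature: SparseVector(1, {0: 1.0})
print(model.featureImportances)

sc.stop()
```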

Author: sethah <seth.hendrickson16@gmail.com>

Closes apache#11622 from sethah/SPARK-13787.
sethah authored and roygao94 committed Mar 22, 2016
1 parent 9a40f72 commit 7f79d40
Showing 2 changed files with 88 additions and 0 deletions.
44 changes: 44 additions & 0 deletions python/pyspark/ml/classification.py
@@ -285,6 +285,8 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
3
>>> model.depth
1
>>> model.featureImportances
SparseVector(1, {0: 1.0})
>>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
>>> result = model.transform(test0).head()
>>> result.prediction
@@ -352,6 +354,27 @@ class DecisionTreeClassificationModel(DecisionTreeModel):
.. versionadded:: 1.4.0
"""

@property
@since("2.0.0")
def featureImportances(self):
"""
Estimate of the importance of each feature.
This generalizes the idea of "Gini" importance to other losses,
following the explanation of Gini importance from "Random Forests" documentation
by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn.
This feature importance is calculated as follows:
- importance(feature j) = sum (over nodes which split on feature j) of the gain,
where gain is scaled by the number of instances passing through node
- Normalize importances for tree to sum to 1.
Note: Feature importance for single decision trees can have high variance due to
correlated predictor variables. Consider using a :class:`RandomForestClassifier`
to determine feature importance instead.
"""
return self._call_java("featureImportances")


@inherit_doc
class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed,
@@ -375,6 +398,8 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
>>> td = si_model.transform(df)
>>> rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed", seed=42)
>>> model = rf.fit(td)
>>> model.featureImportances
SparseVector(1, {0: 1.0})
>>> allclose(model.treeWeights, [1.0, 1.0, 1.0])
True
>>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
@@ -443,6 +468,25 @@ class RandomForestClassificationModel(TreeEnsembleModels):
.. versionadded:: 1.4.0
"""

@property
@since("2.0.0")
def featureImportances(self):
"""
Estimate of the importance of each feature.
This generalizes the idea of "Gini" importance to other losses,
following the explanation of Gini importance from "Random Forests" documentation
by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn.
This feature importance is calculated as follows:
- Average over trees:
- importance(feature j) = sum (over nodes which split on feature j) of the gain,
where gain is scaled by the number of instances passing through node
- Normalize importances for tree to sum to 1.
- Normalize feature importance vector to sum to 1.
"""
return self._call_java("featureImportances")


@inherit_doc
class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,
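
The docstrings in the hunks above describe the single-tree calculation only in prose. The following plain-Python sketch illustrates that formula; `split_nodes` is a hypothetical list of (feature index, gain, instance count) tuples standing in for a fitted tree's internal split nodes, and this is not Spark's actual implementation, which runs on the JVM side and is reached through `_call_java`:

```python
from collections import defaultdict

def single_tree_importances(split_nodes, num_features):
    """Sketch of the documented per-tree formula (illustrative only)."""
    importances = defaultdict(float)
    for feature_index, gain, instance_count in split_nodes:
        # importance(feature j) accumulates the gain, scaled by the number
        # of instances passing through the node
        importances[feature_index] += gain * instance_count
    total = sum(importances.values())
    # normalize the tree's importances to sum to 1
    return [importances[j] / total if total > 0 else 0.0
            for j in range(num_features)]

# Two splits on feature 0 and one on feature 1:
print(single_tree_importances([(0, 0.5, 100), (0, 0.2, 40), (1, 0.1, 60)], 2))
# -> [0.90625, 0.09375]
```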
44 changes: 44 additions & 0 deletions python/pyspark/ml/regression.py
@@ -401,6 +401,8 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
1
>>> model.numNodes
3
>>> model.featureImportances
SparseVector(1, {0: 1.0})
>>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
>>> model.transform(test0).head().prediction
0.0
@@ -499,6 +501,27 @@ class DecisionTreeRegressionModel(DecisionTreeModel):
.. versionadded:: 1.4.0
"""

@property
@since("2.0.0")
def featureImportances(self):
"""
Estimate of the importance of each feature.
This generalizes the idea of "Gini" importance to other losses,
following the explanation of Gini importance from "Random Forests" documentation
by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn.
This feature importance is calculated as follows:
- importance(feature j) = sum (over nodes which split on feature j) of the gain,
where gain is scaled by the number of instances passing through node
- Normalize importances for tree to sum to 1.
Note: Feature importance for single decision trees can have high variance due to
correlated predictor variables. Consider using a :class:`RandomForestRegressor`
to determine feature importance instead.
"""
return self._call_java("featureImportances")


@inherit_doc
class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed,
@@ -515,6 +538,8 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
... (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
>>> rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42)
>>> model = rf.fit(df)
>>> model.featureImportances
SparseVector(1, {0: 1.0})
>>> allclose(model.treeWeights, [1.0, 1.0])
True
>>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
@@ -579,6 +604,25 @@ class RandomForestRegressionModel(TreeEnsembleModels):
.. versionadded:: 1.4.0
"""

@property
@since("2.0.0")
def featureImportances(self):
"""
Estimate of the importance of each feature.
This generalizes the idea of "Gini" importance to other losses,
following the explanation of Gini importance from "Random Forests" documentation
by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn.
This feature importance is calculated as follows:
- Average over trees:
- importance(feature j) = sum (over nodes which split on feature j) of the gain,
where gain is scaled by the number of instances passing through node
- Normalize importances for tree to sum to 1.
- Normalize feature importance vector to sum to 1.
"""
return self._call_java("featureImportances")


@inherit_doc
class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,
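
For the ensemble case, the random-forest docstrings describe averaging the per-tree vectors and renormalizing. A matching sketch, again only an illustration of the documented formula rather than the JVM-side implementation:

```python
def forest_importances(per_tree_importances):
    """Average per-tree (already normalized) importance vectors, then
    renormalize so the ensemble vector sums to 1 (illustrative only)."""
    num_trees = len(per_tree_importances)
    num_features = len(per_tree_importances[0])
    averaged = [sum(tree[j] for tree in per_tree_importances) / num_trees
                for j in range(num_features)]
    total = sum(averaged)
    return [v / total if total > 0 else 0.0 for v in averaged]

# Two trees that disagree about which feature matters most:
print(forest_importances([[0.9, 0.1], [0.6, 0.4]]))
# -> [0.75, 0.25]
```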
