[MRG] Sparse input support for decision tree and forest #3173

Merged 24 commits on Nov 21, 2014
Commits (24)
c03c01a
ENH Bring sparse input support to tree-based methods
arjoly Apr 3, 2014
0bc8a98
FIX+ENH add min_weight_fraction_split support for sparse splitter
arjoly Sep 30, 2014
7cb9a5c
Re-organize code dense splitter then sparse splitter
arjoly Sep 30, 2014
0e86dfd
Simplify call to extract_nnz making it a method
arjoly Sep 30, 2014
9dd87ad
ENH while -> for loop
arjoly Sep 30, 2014
62893e3
ENH reduce number of parameters
arjoly Sep 30, 2014
70226d0
FIX min_weight_fraction_split with random splitter
arjoly Sep 30, 2014
6cd9333
FIX min_weight_leaf in best sparse splitter
arjoly Sep 30, 2014
306924b
ENH remove spurious code
arjoly Sep 30, 2014
cb9b741
cosmit
arjoly Sep 30, 2014
c6af5c6
ENH adaboost should accept c and fortran array
arjoly Sep 30, 2014
cb11511
COSMIT simplify function call
arjoly Sep 30, 2014
1c26cec
ENH expand ternary operator
arjoly Sep 30, 2014
3047cd6
Revert previous version
arjoly Sep 30, 2014
38183c8
ENH move utils near its use
arjoly Sep 30, 2014
06701fc
ENH add a benchmark script for sparse input data
arjoly Sep 30, 2014
9f3f5bb
Extract non zero value extraction constant
arjoly Sep 30, 2014
24281b1
Lower number of trees
arjoly Sep 30, 2014
c31d565
wip benchmark
arjoly Oct 1, 2014
bf98916
Temporarily allows to set algorithm switching through an environment …
arjoly Oct 6, 2014
2838a14
Benchmark: Add more estimators + uncomment text
arjoly Oct 20, 2014
8b3b071
FIX duplicate type coercion + DOC fix inversion between csc and csr
arjoly Oct 20, 2014
4f423d6
Remove constant print
arjoly Nov 6, 2014
ab57964
COSMIT add Base prefix to DenseSplitter and SparseSplitter
arjoly Nov 17, 2014
97 changes: 97 additions & 0 deletions benchmarks/bench_20newsgroups.py
@@ -0,0 +1,97 @@
from __future__ import print_function, division
from time import time
import argparse
import numpy as np

from sklearn.dummy import DummyClassifier

from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.metrics import accuracy_score
from sklearn.utils.validation import check_array

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

ESTIMATORS = {
    "dummy": DummyClassifier(),
Review comment (Member): I would add a linear model such as LogisticRegression and MultinomialNB as common baselines for text classification here.

"random_forest": RandomForestClassifier(n_estimators=100,
max_features="sqrt",
min_samples_split=10),
"extra_trees": ExtraTreesClassifier(n_estimators=100,
max_features="sqrt",
min_samples_split=10),
"logistic_regression": LogisticRegression(),
"naive_bayes": MultinomialNB(),
"adaboost": AdaBoostClassifier(n_estimators=10),
}


###############################################################################
# Data

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--estimators', nargs="+", required=True,
                        choices=ESTIMATORS)
    args = vars(parser.parse_args())

    data_train = fetch_20newsgroups_vectorized(subset="train")
    data_test = fetch_20newsgroups_vectorized(subset="test")
    X_train = check_array(data_train.data, dtype=np.float32,
                          accept_sparse="csc")
    X_test = check_array(data_test.data, dtype=np.float32, accept_sparse="csr")
    y_train = data_train.target
    y_test = data_test.target

    print("20 newsgroups")
    print("=============")
    print("X_train.shape = {0}".format(X_train.shape))
    print("X_train.format = {0}".format(X_train.format))
    print("X_train.dtype = {0}".format(X_train.dtype))
    print("X_train density = {0}"
          "".format(X_train.nnz / np.product(X_train.shape)))
    print("y_train {0}".format(y_train.shape))
    print("X_test {0}".format(X_test.shape))
    print("X_test.format = {0}".format(X_test.format))
    print("X_test.dtype = {0}".format(X_test.dtype))
    print("y_test {0}".format(y_test.shape))
    print()

    print("Classifier Training")
    print("===================")
    accuracy, train_time, test_time = {}, {}, {}
    for name in sorted(args["estimators"]):
        clf = ESTIMATORS[name]
        try:
            clf.set_params(random_state=0)
        except (TypeError, ValueError):
            pass

        print("Training %s ... " % name, end="")
        t0 = time()
        clf.fit(X_train, y_train)
        train_time[name] = time() - t0
        t0 = time()
        y_pred = clf.predict(X_test)
        test_time[name] = time() - t0
        accuracy[name] = accuracy_score(y_test, y_pred)
        print("done")

    print()
    print("Classification performance:")
    print("===========================")
    print()
    print("%s %s %s %s" % ("Classifier ", "train-time", "test-time",
                           "Accuracy"))
    print("-" * 44)
    for name in sorted(accuracy, key=accuracy.get):
        print("%s %s %s %s" % (name.ljust(16),
                               ("%.4fs" % train_time[name]).center(10),
                               ("%.4fs" % test_time[name]).center(10),
                               ("%.4f" % accuracy[name]).center(10)))

    print()
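A possible invocation, inferred from the argparse setup above rather than stated anywhere in the PR: `python benchmarks/bench_20newsgroups.py -e random_forest extra_trees adaboost`, where the `-e`/`--estimators` names must come from the ESTIMATORS dict.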
2 changes: 1 addition & 1 deletion doc/modules/ensemble.rst
@@ -108,7 +108,7 @@ construction. The prediction of the ensemble is given as the averaged
prediction of the individual classifiers.

As other classifiers, forest classifiers have to be fitted with two
-arrays: an array X of size ``[n_samples, n_features]`` holding the
+arrays: a sparse or dense array X of size ``[n_samples, n_features]`` holding the
training samples, and an array Y of size ``[n_samples]`` holding the
target values (class labels) for the training samples::

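The literal block that follows this paragraph in ensemble.rst is collapsed above. As a hedged illustration only (not the exact snippet from the docs), fitting a forest on sparse input could look like the following sketch; the toy data and parameter values are invented for illustration:

# Illustrative sketch (not the collapsed ensemble.rst snippet): after this PR,
# forests accept scipy.sparse matrices for X.
from scipy.sparse import csc_matrix, csr_matrix
from sklearn.ensemble import RandomForestClassifier

X = csc_matrix([[0., 0.], [1., 1.]])   # CSC is the preferred sparse format for fit
y = [0, 1]
clf = RandomForestClassifier(n_estimators=10, random_state=0)
clf.fit(X, y)
print(clf.predict(csr_matrix([[1., 1.]])))  # CSR is preferred for predict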
18 changes: 12 additions & 6 deletions doc/modules/tree.rst
@@ -90,10 +90,10 @@ Classification
:class:`DecisionTreeClassifier` is a class capable of performing multi-class
classification on a dataset.

-As other classifiers, :class:`DecisionTreeClassifier` take as input two
-arrays: an array X of size ``[n_samples, n_features]`` holding the training
-samples, and an array Y of integer values, size ``[n_samples]``, holding
-the class labels for the training samples::
+As other classifiers, :class:`DecisionTreeClassifier` takes as input two arrays:
+an array X, sparse or dense, of size ``[n_samples, n_features]`` holding the
+training samples, and an array Y of integer values, size ``[n_samples]``,
+holding the class labels for the training samples::

>>> from sklearn import tree
>>> X = [[0, 0], [1, 1]]
@@ -157,7 +157,7 @@ a PDF file (or any other supported file type) directly in Python::

After being fitted, the model can then be used to predict new values::

->>> clf.predict(iris.data[0, :])
+>>> clf.predict(iris.data[:1, :])
array([0])
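(The change above replaces the 1-D row ``iris.data[0, :]`` with the 2-D slice ``iris.data[:1, :]``, shape ``(1, n_features)``, so the doctest passes ``predict`` input in the expected ``(n_samples, n_features)`` layout.)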

.. figure:: ../auto_examples/tree/images/plot_iris_001.png
@@ -195,7 +195,6 @@ instead of integer values::
>>> clf.predict([[1, 1]])
array([ 0.5])


.. topic:: Examples:

* :ref:`example_tree_plot_tree_regression.py`
@@ -337,6 +336,13 @@ Tips on practical use
* All decision trees use ``np.float32`` arrays internally.
If training data is not in this format, a copy of the dataset will be made.

+* If the input matrix X is very sparse, it is recommended to convert to sparse
+  ``csc_matrix`` before calling fit and sparse ``csr_matrix`` before calling
+  predict. Training time can be orders of magnitude faster for a sparse
+  matrix input compared to a dense matrix when features have zero values in
+  most of the samples.



.. _tree_algorithms:

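To make the new tip added above concrete, here is a minimal hedged sketch of the recommended csc-before-fit / csr-before-predict pattern; the synthetic data and its dimensions are invented for illustration:

# Illustrative sketch of the tip: CSC input for fit, CSR input for predict.
import numpy as np
from scipy import sparse
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X = sparse.rand(1000, 500, density=0.01, format="csc", random_state=rng)  # very sparse input
y = rng.randint(0, 2, size=1000)

clf = DecisionTreeClassifier(random_state=0).fit(X, y)  # fit on a csc_matrix
predictions = clf.predict(X.tocsr())                    # predict on a csr_matrix
print(predictions[:10])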