Merge branch 'master' into discrete
qinhanmin2014 committed Dec 6, 2017
2 parents 9b3d995 + 62e9bb8 commit ef57c34
Showing 29 changed files with 840 additions and 165 deletions.
7 changes: 6 additions & 1 deletion build_tools/circle/list_versions.py
@@ -5,12 +5,17 @@
 from urllib.request import urlopen
 import json
 import re
+import sys

 from distutils.version import LooseVersion


 def json_urlread(url):
-    return json.loads(urlopen(url).read().decode('utf8'))
+    try:
+        return json.loads(urlopen(url).read().decode('utf8'))
+    except Exception:
+        print('Error reading', url, file=sys.stderr)
+        raise


 def human_readable_data_quantity(quantity, multiple=1024):
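
The point of echoing the URL before re-raising is easiest to see when many
URLs are fetched in a loop; a minimal sketch (the URL list is hypothetical)::

    urls = ['https://api.github.com/repos/scikit-learn/scikit-learn/tags']
    for url in urls:
        # On a failure, stderr now names the offending URL ahead of the
        # traceback, instead of leaving an anonymous urlopen error.
        data = json_urlread(url)
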
8 changes: 1 addition & 7 deletions build_tools/travis/test_script.sh
@@ -43,13 +43,7 @@ run_tests() {
     # Going back to git checkout folder needed to test documentation
     cd $OLDPWD

-    # Do not run doctests in scipy-dev-wheels build for now
-    # (broken by numpy 1.14.dev array repr/str formatting
-    # change even with np.set_printoptions(sign='legacy')).
-    # See https://github.com/numpy/numpy/issues/9804 for more details
-    if [[ "$DISTRIB" != "scipy-dev-wheels" ]]; then
-        make test-doc
-    fi
+    make test-doc
 }

 if [[ "$RUN_FLAKE8" == "true" ]]; then
2 changes: 2 additions & 0 deletions doc/modules/classes.rst
@@ -1201,6 +1201,7 @@ Model validation
    preprocessing.OneHotEncoder
    preprocessing.CategoricalEncoder
    preprocessing.PolynomialFeatures
+   preprocessing.PowerTransformer
    preprocessing.QuantileTransformer
    preprocessing.RobustScaler
    preprocessing.StandardScaler
@@ -1218,6 +1219,7 @@ Model validation
    preprocessing.quantile_transform
    preprocessing.robust_scale
    preprocessing.scale
+   preprocessing.power_transform


 .. _random_projection_ref:
42 changes: 24 additions & 18 deletions doc/modules/model_evaluation.rst
@@ -1365,29 +1365,33 @@ implements label ranking average precision (LRAP). This metric is linked to
 the :func:`average_precision_score` function, but is based on the notion of
 label ranking instead of precision and recall.

-Label ranking average precision (LRAP) is the average over each ground truth
-label assigned to each sample, of the ratio of true vs. total labels with lower
-score. This metric will yield better scores if you are able to give better rank
-to the labels associated with each sample. The obtained score is always strictly
-greater than 0, and the best value is 1. If there is exactly one relevant
-label per sample, label ranking average precision is equivalent to the `mean
+Label ranking average precision (LRAP) averages over the samples the answer to
+the following question: for each ground truth label, what fraction of
+higher-ranked labels were true labels? This performance measure will be higher
+if you are able to give better rank to the labels associated with each sample.
+The obtained score is always strictly greater than 0, and the best value is 1.
+If there is exactly one relevant label per sample, label ranking average
+precision is equivalent to the `mean
 reciprocal rank <https://en.wikipedia.org/wiki/Mean_reciprocal_rank>`_.

 Formally, given a binary indicator matrix of the ground truth labels
-:math:`y \in \mathcal{R}^{n_\text{samples} \times n_\text{labels}}` and the
-score associated with each label
-:math:`\hat{f} \in \mathcal{R}^{n_\text{samples} \times n_\text{labels}}`,
+:math:`y \in \left\{0, 1\right\}^{n_\text{samples} \times n_\text{labels}}`
+and the score associated with each label
+:math:`\hat{f} \in \mathbb{R}^{n_\text{samples} \times n_\text{labels}}`,
 the average precision is defined as

 .. math::
   LRAP(y, \hat{f}) = \frac{1}{n_{\text{samples}}}
-    \sum_{i=0}^{n_{\text{samples}} - 1} \frac{1}{|y_i|}
+    \sum_{i=0}^{n_{\text{samples}} - 1} \frac{1}{||y_i||_0}
     \sum_{j:y_{ij} = 1} \frac{|\mathcal{L}_{ij}|}{\text{rank}_{ij}}

-with :math:`\mathcal{L}_{ij} = \left\{k: y_{ik} = 1, \hat{f}_{ik} \geq \hat{f}_{ij} \right\}`,
-:math:`\text{rank}_{ij} = \left|\left\{k: \hat{f}_{ik} \geq \hat{f}_{ij} \right\}\right|`
-and :math:`|\cdot|` is the l0 norm or the cardinality of the set.
+where
+:math:`\mathcal{L}_{ij} = \left\{k: y_{ik} = 1, \hat{f}_{ik} \geq \hat{f}_{ij} \right\}`,
+:math:`\text{rank}_{ij} = \left|\left\{k: \hat{f}_{ik} \geq \hat{f}_{ij} \right\}\right|`,
+:math:`|\cdot|` computes the cardinality of the set (i.e., the number of
+elements in the set), and :math:`||\cdot||_0` is the :math:`\ell_0` "norm"
+(which computes the number of nonzero elements in a vector).

 Here is a small example of usage of this function::

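A minimal sketch of such a usage, with hand-checkable values: in the first
sample the only true label is ranked 2nd by score, in the second it is ranked
3rd, so the score is (1/2 + 1/3) / 2 ≈ 0.416::

    >>> import numpy as np
    >>> from sklearn.metrics import label_ranking_average_precision_score
    >>> y_true = np.array([[1, 0, 0], [0, 0, 1]])
    >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]])
    >>> label_ranking_average_precision_score(y_true, y_score)  # doctest: +ELLIPSIS
    0.416...
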
@@ -1406,8 +1410,8 @@ Ranking loss
 The :func:`label_ranking_loss` function computes the ranking loss which
 averages over the samples the number of label pairs that are incorrectly
 ordered, i.e. true labels have a lower score than false labels, weighted by
-the inverse number of false and true labels. The lowest achievable
-ranking loss is zero.
+the inverse of the number of ordered pairs of false and true labels.
+The lowest achievable ranking loss is zero.

 Formally, given a binary indicator matrix of the ground truth labels
 :math:`y \in \left\{0, 1\right\}^{n_\text{samples} \times n_\text{labels}}` and the
@@ -1417,10 +1421,12 @@ the ranking loss is defined as

 .. math::
   \text{ranking\_loss}(y, \hat{f}) = \frac{1}{n_{\text{samples}}}
-    \sum_{i=0}^{n_{\text{samples}} - 1} \frac{1}{|y_i|(n_\text{labels} - |y_i|)}
-    \left|\left\{(k, l): \hat{f}_{ik} < \hat{f}_{il}, y_{ik} = 1, y_{il} = 0 \right\}\right|
+    \sum_{i=0}^{n_{\text{samples}} - 1} \frac{1}{||y_i||_0(n_\text{labels} - ||y_i||_0)}
+    \left|\left\{(k, l): \hat{f}_{ik} \leq \hat{f}_{il}, y_{ik} = 1, y_{il} = 0 \right\}\right|

-where :math:`|\cdot|` is the :math:`\ell_0` norm or the cardinality of the set.
+where :math:`|\cdot|` computes the cardinality of the set (i.e., the number of
+elements in the set) and :math:`||\cdot||_0` is the :math:`\ell_0` "norm"
+(which computes the number of nonzero elements in a vector).

 Here is a small example of usage of this function::

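A minimal sketch of such a usage: in the first sample one of the two
(true, false) label pairs is mis-ordered, in the second sample both are, so
the loss is (1/2 + 2/2) / 2 = 0.75::

    >>> import numpy as np
    >>> from sklearn.metrics import label_ranking_loss
    >>> y_true = np.array([[1, 0, 0], [0, 0, 1]])
    >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]])
    >>> label_ranking_loss(y_true, y_score)  # doctest: +ELLIPSIS
    0.75...
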
52 changes: 50 additions & 2 deletions doc/modules/preprocessing.rst
@@ -261,6 +261,9 @@ defined by :math:`\phi` followed by removal of the mean in that space.
 Non-linear transformation
 =========================

+Mapping to a Uniform distribution
+---------------------------------
+
 Like scalers, :class:`QuantileTransformer` puts all features into the same,
 known range or distribution. However, by performing a rank transformation, it
 smooths out unusual distributions and is less influenced by outliers than
@@ -299,8 +302,53 @@ This can be confirmed on an independent testing set with similar remarks::

   ...  # doctest: +ELLIPSIS +SKIP
   array([ 0.01...,  0.25...,  0.46...,  0.60... ,  0.94...])

-It is also possible to map the transformed data to a normal distribution by
-setting ``output_distribution='normal'``::
+Mapping to a Gaussian distribution
+----------------------------------
+
+In many modeling scenarios, normality of the features in a dataset is desirable.
+Power transforms are a family of parametric, monotonic transformations that aim
+to map data from any distribution to as close to a Gaussian distribution as
+possible in order to stabilize variance and minimize skewness.
+
+:class:`PowerTransformer` currently provides one such power transformation,
+the Box-Cox transform. The Box-Cox transform is given by:
+
+.. math::
+    y_i^{(\lambda)} =
+    \begin{cases}
+    \dfrac{y_i^\lambda - 1}{\lambda} & \text{if } \lambda \neq 0, \\[8pt]
+    \ln{(y_i)} & \text{if } \lambda = 0,
+    \end{cases}
+
+Box-Cox can only be applied to strictly positive data. The transformation is
+parameterized by :math:`\lambda`, which is determined through maximum likelihood
+estimation. Here is an example of using Box-Cox to map samples drawn from a
+lognormal distribution to a normal distribution::
+
+  >>> pt = preprocessing.PowerTransformer(method='box-cox')
+  >>> X_lognormal = np.random.RandomState(616).lognormal(size=(3, 3))
+  >>> X_lognormal  # doctest: +ELLIPSIS
+  array([[ 1.28...,  1.18...,  0.84...],
+         [ 0.94...,  1.60...,  0.38...],
+         [ 1.35...,  0.21...,  1.09...]])
+  >>> pt.fit_transform(X_lognormal)  # doctest: +ELLIPSIS
+  array([[ 0.49...,  0.17..., -0.15...],
+         [-0.05...,  0.58..., -0.57...],
+         [ 0.69..., -0.84...,  0.10...]])

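Because :math:`\lambda` is chosen by maximum likelihood, the same estimate can
be sketched for a single feature with ``scipy.stats.boxcox`` (note this
ignores any rescaling the transformer may additionally apply to its output)::

    >>> import numpy as np
    >>> from scipy import stats
    >>> x = np.random.RandomState(616).lognormal(size=100)
    >>> x_bc, lmbda = stats.boxcox(x)  # MLE of lambda, then transform
    >>> abs(stats.skew(x_bc)) < abs(stats.skew(x))  # skewness is reduced
    True
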
+Below are examples of Box-Cox applied to various probability distributions.
+Note that when applied to certain distributions, Box-Cox achieves very
+Gaussian-like results, but with others, it is ineffective. This highlights
+the importance of visualizing the data before and after transformation.
+
+.. figure:: ../auto_examples/preprocessing/images/sphx_glr_plot_power_transformer_001.png
+   :target: ../auto_examples/preprocessing/plot_power_transformer.html
+   :align: center
+   :scale: 100
+
+It is also possible to map data to a normal distribution using
+:class:`QuantileTransformer` by setting ``output_distribution='normal'``.
+Using the earlier example with the iris dataset::

   >>> quantile_transformer = preprocessing.QuantileTransformer(
   ...     output_distribution='normal', random_state=0)
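
A sketch of how this transform is then applied, assuming ``X_train`` holds the
iris features bound earlier in the section: each feature is mapped to its
empirical quantiles and then through the inverse of the normal cumulative
distribution function, so the output is Gaussian-shaped regardless of the
input's distribution::

    >>> X_trans = quantile_transformer.fit_transform(X_train)  # doctest: +SKIP
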
3 changes: 3 additions & 0 deletions doc/related_projects.rst
@@ -23,6 +23,9 @@ enhance the functionality of scikit-learn's estimators.

 - `sklearn_pandas <https://github.com/paulgb/sklearn-pandas/>`_ bridge for
   scikit-learn pipelines and pandas data frame with dedicated transformers.

+- `sklearn_xarray <https://github.com/phausamann/sklearn-xarray/>`_ provides
+  compatibility of scikit-learn estimators with xarray data structures.
+
 **Auto-ML**
4 changes: 2 additions & 2 deletions doc/tutorial/basic/tutorial.rst
@@ -136,7 +136,7 @@ learn::
 <sphx_glr_auto_examples_classification_plot_digits_classification.py>` illustrates how starting
 from the original problem one can shape the data for consumption in
 scikit-learn.
-
+
 .. topic:: Loading from external datasets

    To load from an external dataset, please refer to :ref:`loading external datasets <external_datasets>`.
@@ -401,4 +401,4 @@ is similarly possible for an instance to be assigned multiple labels::
 In this case, the classifier is fit upon instances each assigned multiple labels.
 The :class:`MultiLabelBinarizer <sklearn.preprocessing.MultiLabelBinarizer>` is
 used to binarize the 2d array of multilabels to ``fit`` upon. As a result,
-``predict()`` returns a 2d array with multiple predicted labels for each instance.
\ No newline at end of file
+``predict()`` returns a 2d array with multiple predicted labels for each instance.
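
A minimal sketch of the binarization step this paragraph describes (the label
sets are illustrative)::

    >>> from sklearn.preprocessing import MultiLabelBinarizer
    >>> y = [[0, 1], [0, 2], [1, 3], [0, 2, 3], [2, 4]]
    >>> MultiLabelBinarizer().fit_transform(y)
    array([[1, 1, 0, 0, 0],
           [1, 0, 1, 0, 0],
           [0, 1, 0, 1, 0],
           [1, 0, 1, 1, 0],
           [0, 0, 1, 0, 1]])
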
23 changes: 20 additions & 3 deletions doc/whats_new/v0.20.rst
@@ -66,6 +66,13 @@ Preprocessing
   :issue:`10192` by :user:`Henry Lin <hlin117>`, `Hanmin Qin`_
   and `Tom Dupre la Tour`_.

+- Added :class:`preprocessing.PowerTransformer`, which implements the Box-Cox
+  power transformation, allowing users to map data from any distribution to a
+  Gaussian distribution. This is useful as a variance-stabilizing transformation
+  in situations where normality and homoscedasticity are desirable.
+  :issue:`10210` by :user:`Eric Chang <ericchang00>` and
+  :user:`Maniteja Nandana <maniteja123>`.
+
 Model evaluation

 - Added the :func:`metrics.balanced_accuracy_score` metric and a corresponding
@@ -154,6 +161,13 @@ Classifiers and regressors
   updated estimates for the standard deviation and the coefficients.
   :issue:`10153` by :user:`Jörg Döpfert <jdoepfert>`.

+- Fixed a bug when fitting :class:`ensemble.GradientBoostingClassifier` or
+  :class:`ensemble.GradientBoostingRegressor` with ``warm_start=True``, which
+  previously raised a segmentation fault because the CSC matrix was not
+  converted to the CSR format expected by ``decision_function``. Similarly,
+  Fortran-ordered arrays are converted to C-ordered arrays in the dense case.
+  :issue:`9991` by :user:`Guillaume Lemaitre <glemaitre>`.
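
A sketch of the warm-start pattern that previously crashed on sparse input
(data and sizes are hypothetical)::

    import numpy as np
    from scipy import sparse
    from sklearn.ensemble import GradientBoostingClassifier

    rng = np.random.RandomState(0)
    X = sparse.csc_matrix(rng.rand(30, 4))  # CSC input triggered the bug
    y = rng.randint(0, 2, 30)
    clf = GradientBoostingClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=10)
    clf.fit(X, y)  # adds 5 more trees instead of segfaulting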

 Decomposition, manifold learning and clustering

 - Fix for uninformative error in :class:`decomposition.IncrementalPCA`:
@@ -188,6 +202,11 @@ Decomposition, manifold learning and clustering
 - Fixed a bug when setting parameters on meta-estimator, involving both a
   wrapped estimator and its parameter. :issue:`9999` by :user:`Marcus Voss
   <marcus-voss>` and `Joel Nothman`_.

+- ``k_means`` now gives a warning if the number of distinct clusters found
+  is smaller than ``n_clusters``. This may occur when the number of distinct
+  points in the data set is actually smaller than the number of clusters one
+  is looking for. :issue:`10059` by :user:`Christian Braune <christianbraune79>`.
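
A sketch of the situation that now warns (the warning text is paraphrased)::

    import numpy as np
    from sklearn.cluster import KMeans

    X = np.array([[0., 0.], [1., 1.]] * 5)  # only two distinct points
    KMeans(n_clusters=3).fit(X)
    # ConvergenceWarning: number of distinct clusters (2) found smaller
    # than n_clusters (3); possibly due to duplicate points in X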

 - Fixed a bug in :func:`datasets.make_circles`, where no odd number of data
   points could be generated. :issue:`10037` by :user:`Christian Braune
@@ -217,16 +236,14 @@ Feature Extraction
   throw an exception if ``max_patches`` was greater than or equal to the number
   of all possible patches rather than simply returning the number of possible
   patches. :issue:`10100` by :user:`Varun Agrawal <varunagrawal>`

 - Fixed a bug in :class:`feature_extraction.text.CountVectorizer`,
   :class:`feature_extraction.text.TfidfVectorizer`,
   :class:`feature_extraction.text.HashingVectorizer` to support 64 bit sparse
   array indexing necessary to process large datasets with more than 2·10⁹ tokens
   (words or n-grams). :issue:`9147` by :user:`Claes-Fredrik Mannby <mannby>`
   and `Roman Yurchak`_.
-
-

 API changes summary
 -------------------
6 changes: 3 additions & 3 deletions examples/classification/plot_classifier_comparison.py
@@ -97,7 +97,7 @@
     # Plot the training points
     ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                edgecolors='k')
-    # and testing points
+    # Plot the testing points
     ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
                edgecolors='k')
     ax.set_xlim(xx.min(), xx.max())
@@ -123,10 +123,10 @@
         Z = Z.reshape(xx.shape)
         ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

-        # Plot also the training points
+        # Plot the training points
         ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                    edgecolors='k')
-        # and testing points
+        # Plot the testing points
         ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                    edgecolors='k', alpha=0.6)
