Commit
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn into optwhitespace
robertlayton committed Apr 20, 2014
2 parents 47ad7e7 + 9792624 commit e433b7f
Showing 104 changed files with 8,824 additions and 8,318 deletions.
44 changes: 14 additions & 30 deletions .travis.yml
@@ -1,34 +1,18 @@
language: python
env:
- COVERAGE=--with-coverage
python:
- "2.7"
- "2.6"
- "3.3"
virtualenv:
system_site_packages: true
before_install:
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then wget http://repo.continuum.io/miniconda/Miniconda-2.2.2-Linux-x86_64.sh -O miniconda.sh ; fi
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then chmod +x miniconda.sh ; fi
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then ./miniconda.sh -b ; fi
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then export PATH=/home/travis/anaconda/bin:$PATH ; fi
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then conda update --yes conda ; fi
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then conda update --yes conda ; fi
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then conda create -n testenv --yes pip python=$TRAVIS_PYTHON_VERSION ; fi
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then source activate testenv ; fi
- if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then conda install --yes numpy scipy nose ; fi
- if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then sudo apt-get update -qq ; fi
- if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then sudo apt-get install -qq python-scipy python-nose python-pip ; fi
install:
- python setup.py build_ext --inplace
- if [ "${COVERAGE}" == "--with-coverage" ]; then sudo pip install coverage; fi
- if [ "${COVERAGE}" == "--with-coverage" ]; then sudo pip install coveralls; fi
script:
- if [ "${COVERAGE}" == "--with-coverage" ]; then
- make test-coverage;
- else
- make test;
- fi
env:
matrix:
- DISTRIB="ubuntu" PYTHON_VERSION="2.7" INSTALL_ATLAS="true"
COVERAGE="true"
# This environment tests the oldest supported anaconda env
- DISTRIB="conda" PYTHON_VERSION="2.6" INSTALL_MKL="false"
NUMPY_VERSION="1.6.2" SCIPY_VERSION="0.11.0"
# This environment tests the newest supported anaconda env
- DISTRIB="conda" PYTHON_VERSION="3.4" INSTALL_MKL="true"
NUMPY_VERSION="1.8.1" SCIPY_VERSION="0.13.3"
install: source continuous_integration/install.sh
script: bash continuous_integration/test_script.sh
after_success:
- if [ "${COVERAGE}" == "--with-coverage" ]; then coveralls; fi

- if [[ "$COVERAGE" == "true" ]]; then coveralls; fi
cache: apt
2 changes: 2 additions & 0 deletions AUTHORS.rst
@@ -89,3 +89,5 @@ People
* `Arnaud Joly <http://www.ajoly.org>`_

* `Kemal Eren <http://www.kemaleren.com>`_

* `Michael Becker <http://beckerfuffle.com>`_
2 changes: 1 addition & 1 deletion Makefile
@@ -34,7 +34,7 @@ test-coverage:
test: test-code test-doc

trailing-spaces:
find sklearn -name "*.py" -exec sed -i'' 's/[ \t]*$$//' {} \;
find sklearn -name "*.py" -exec perl -pi -e 's/[ \t]*$$//' {} \;

cython:
find sklearn -name "*.pyx" -exec $(CYTHON) {} \;
6 changes: 4 additions & 2 deletions README.rst
@@ -34,8 +34,10 @@ Important links
Dependencies
============

scikit-learn is tested to work under Python 2.6+ and Python 3.3+
(using the same codebase thanks to an embedded copy of `six <http://pythonhosted.org/six/>`_).
scikit-learn is tested to work under Python 2.6, Python 2.7, and Python 3.4
(using the same codebase thanks to an embedded copy of
`six <http://pythonhosted.org/six/>`_). It should also work against Python
3.3.

The required dependencies to build the software are NumPy >= 1.6.1, SciPy >= 0.9
and a working C/C++ compiler.
48 changes: 48 additions & 0 deletions continuous_integration/install.sh
@@ -0,0 +1,48 @@
#!/bin/bash
# This script is meant to be called by the "install" step defined in
# .travis.yml. See http://docs.travis-ci.com/ for more details.
# The behavior of the script is controlled by environment variables defined
# in the .travis.yml in the top level folder of the project.

set -e

sudo apt-get update -qq
if [[ "$INSTALL_ATLAS" == "true" ]]; then
sudo apt-get install -qq libatlas3gf-base libatlas-dev
fi

if [[ "$DISTRIB" == "conda" ]]; then
# Deactivate the travis-provided virtual environment and setup a
# conda-based environment instead
deactivate

# Use the miniconda installer for faster download / install of conda
# itself
wget http://repo.continuum.io/miniconda/Miniconda-2.2.2-Linux-x86_64.sh \
-O miniconda.sh
chmod +x miniconda.sh && ./miniconda.sh -b
export PATH=/home/travis/anaconda/bin:$PATH
conda update --yes conda

# Configure the conda environment and put it in the path using the
# provided versions
conda create -n testenv --yes python=$PYTHON_VERSION pip nose \
numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION
source activate testenv

if [[ "$INSTALL_MKL" == "true" ]]; then
# Make sure that MKL is used
conda install --yes mkl
else
# Make sure that MKL is not used
conda remove --yes --features mkl || echo "MKL not installed"
fi

elif [[ "$DISTRIB" == "ubuntu" ]]; then
# Use standard ubuntu packages in their default version
sudo apt-get install -qq python-scipy python-nose python-pip
fi

if [[ "$COVERAGE" == "true" ]]; then
pip install coverage coveralls
fi
19 changes: 19 additions & 0 deletions continuous_integration/test_script.sh
@@ -0,0 +1,19 @@
#!/bin/bash
# This script is meant to be called by the "script" step defined in
# .travis.yml. See http://docs.travis-ci.com/ for more details.
# The behavior of the script is controlled by environment variables defined
# in the .travis.yml in the top level folder of the project.

set -e

python --version
python -c "import numpy; print('numpy %s' % numpy.__version__)"
python -c "import scipy; print('scipy %s' % scipy.__version__)"
python setup.py build_ext --inplace

if [[ "$COVERAGE" == "true" ]]; then
export WITH_COVERAGE="--with-coverage"
else
export WITH_COVERAGE=""
fi
nosetests -s -v $WITH_COVERAGE sklearn
21 changes: 2 additions & 19 deletions doc/Makefile
@@ -3,7 +3,7 @@

# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXBUILD ?= sphinx-build
PAPER =
BUILDDIR = _build

@@ -12,7 +12,7 @@ PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .

.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex latexpdf changes linkcheck doctest
.PHONY: help clean html dirhtml pickle json latex latexpdf changes linkcheck doctest

all: html-noplot

@@ -22,8 +22,6 @@ help:
@echo " dirhtml to make HTML files named index.html in directories"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " changes to make an overview of all changed/added/deprecated items"
@@ -65,21 +63,6 @@ json:
@echo
@echo "Build finished; now you can process the JSON files."

htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/scikit-learn.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/scikit-learn.qhc"

latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
13 changes: 8 additions & 5 deletions doc/developers/index.rst
@@ -647,9 +647,11 @@ To summarize, a `__init__` should look like::
self.param1 = param1
self.param2 = param2

There should be no logic, and the parameters should not be changed.
The corresponding logic should be put where the parameters are used. The
following is wrong::
There should be no logic, not even input validation,
and the parameters should not be changed.
The corresponding logic should be put where the parameters are used,
typically in ``fit``.
The following is wrong::

def __init__(self, param1=1, param2=2, param3=3):
# WRONG: parameters should not be modified
@@ -660,8 +662,9 @@ following is wrong::
# the argument in the constructor
self.param3 = param2

Scikit-learn relies on this mechanism to introspect objects to set
their parameters by cross-validation.
The reason for postponing the validation is that the same validation
would have to be performed in ``set_params``,
which is used in algorithms like ``GridSearchCV``.
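
A minimal sketch of the recommended pattern (the class name and parameters here are hypothetical, for illustration only): store the arguments verbatim in ``__init__`` and defer validation to ``fit``:

.. code-block:: python

    class MyEstimator(object):
        """Illustrative estimator: parameters stored verbatim in
        __init__, validated only in fit."""

        def __init__(self, param1=1, param2=2):
            # No logic, no validation: just store the arguments as given,
            # so that get_params/set_params can round-trip them cleanly.
            self.param1 = param1
            self.param2 = param2

        def fit(self, X, y=None):
            # Validation belongs here, where the parameters are used.
            if self.param1 <= 0:
                raise ValueError("param1 must be positive, got %r"
                                 % self.param1)
            # ... fitting logic would go here ...
            return self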

Fitting
^^^^^^^
2 changes: 1 addition & 1 deletion doc/index.rst
@@ -161,7 +161,7 @@
<strong>Applications</strong>: Visualization, Increased efficiency</br>
<strong>Algorithms</strong>:&nbsp;

:ref:`PCA<PCA>`, :ref:`Isomap<isomap>`, :ref:`non-negative matrix factorization<NMF>`.
:ref:`PCA<PCA>`, :ref:`feature selection<feature_selection>`, :ref:`non-negative matrix factorization<NMF>`.

.. raw:: html

25 changes: 15 additions & 10 deletions doc/install.rst
@@ -1,3 +1,5 @@
.. _installation-instructions:

=========================
Installing `scikit-learn`
=========================
@@ -233,22 +235,25 @@ or::
sudo port install py27-scikit-learn


Archlinux
Arch Linux
---------

Archlinux's package is provided at
`Arch User Repository (AUR) <https://aur.archlinux.org/>`_ with name
`python2-scikit-learn` for latest stable version and `python2-scikit-learn-git`
for building from git version. If `yaourt` is available, it can be installed
by typing the following command::
Arch Linux's package is provided through the `official repositories
<https://www.archlinux.org/packages/?q=scikit-learn>`_ as `python-scikit-learn`
for Python 3 and `python2-scikit-learn` for Python 2. It can be installed
by typing the following command:

sudo yaourt -S python2-scikit-learn
.. code-block:: none
or::
# pacman -S python-scikit-learn
or:

.. code-block:: none
sudo yaourt -S python2-scikit-learn-git
# pacman -S python2-scikit-learn
depending on the version of scikit-learn you want to use.
depending on the version of Python you use.


NetBSD
72 changes: 35 additions & 37 deletions doc/modules/clustering.rst
@@ -118,15 +118,37 @@ K-means

The :class:`KMeans` algorithm clusters data by trying to separate samples
in n groups of equal variance, minimizing a criterion known as the
'inertia' of the groups. This algorithm requires the number of clusters to
be specified. It scales well to large number of samples and has been used
across a large range of application areas in many different fields. It is
also equivalent to the expectation-maximization algorithm when setting the
covariance matrix to be diagonal, equal and small. The K-means algorithm
aims to choose centroids :math:`C` that minimise the within cluster sum of
squares objective function with a dataset :math:`X` with :math:`n` samples:

.. math:: J(X, C) = \sum_{i=0}^{n}\min_{\mu_j \in C}(||x_j - \mu_i||^2)
:ref:`inertia <inertia>` or within-cluster sum-of-squares.
This algorithm requires the number of clusters to be specified.
It scales well to a large number of samples and has been used
across a large range of application areas in many different fields.

The k-means algorithm divides a set of :math:`N` samples :math:`X`
into :math:`K` disjoint clusters :math:`C`,
each described by the mean :math:`\mu_j` of the samples in the cluster.
The means are commonly called the cluster "centroids";
note that they are not, in general, points from :math:`X`,
although they live in the same space.
The K-means algorithm aims to choose centroids
that minimise the *inertia*, or within-cluster sum-of-squares criterion:

.. math:: \sum_{i=0}^{n}\min_{\mu_j \in C}(||x_i - \mu_j||^2)

Inertia, or the within-cluster sum of squares criterion,
can be recognized as a measure of how internally coherent clusters are.
It suffers from various drawbacks:

- Inertia makes the assumption that clusters are convex and isotropic,
which is not always the case. It responds poorly to elongated clusters,
or manifolds with irregular shapes.

- Inertia is not a normalized metric: we just know that lower values are
better and zero is optimal. But in very high-dimensional spaces, Euclidean
distances tend to become inflated
(this is an instance of the so-called "curse of dimensionality").
Running a dimensionality reduction algorithm such as :ref:`PCA <PCA>`
prior to k-means clustering can alleviate this problem
and speed up the computations (see the sketch after this list).
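
As a quick illustration of both points, a fitted :class:`KMeans` estimator exposes this criterion as its ``inertia_`` attribute, and a reduction step can be chained in front of it (random data and arbitrary sizes, for demonstration only):

.. code-block:: python

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA

    rng = np.random.RandomState(0)
    X = rng.rand(300, 50)  # 300 samples in a 50-dimensional space

    # Reducing dimensionality first counters inflated Euclidean distances.
    X_reduced = PCA(n_components=10).fit_transform(X)

    km = KMeans(n_clusters=3, random_state=0).fit(X_reduced)
    print(km.inertia_)  # lower is better; zero would be optimal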

K-means is often referred to as Lloyd's algorithm. In basic terms, the
algorithm has three steps. The first step chooses the initial centroids, with
@@ -144,7 +166,10 @@ until the centroids do not move significantly.
:align: right
:scale: 35

The algorithm can be understood through the concept of `Voronoi diagrams
K-means is equivalent to the expectation-maximization algorithm
with a small, all-equal, diagonal covariance matrix.

The algorithm can also be understood through the concept of `Voronoi diagrams
<https://en.wikipedia.org/wiki/Voronoi_diagram>`_. First the Voronoi diagram of
the points is calculated using the current centroids. Each segment in the
Voronoi diagram becomes a separate cluster. Secondly, the centroids are updated
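
The assignment/update alternation can be sketched in a few lines of NumPy. This is an illustration of the idea only, not the library's optimized implementation; initialization, empty clusters and convergence checks are deliberately ignored:

.. code-block:: python

    import numpy as np

    def lloyd_step(X, centroids):
        # Assignment step: label each sample with its nearest centroid,
        # i.e. the cell of the Voronoi diagram it falls into.
        sq_dists = ((X[:, np.newaxis, :]
                     - centroids[np.newaxis, :, :]) ** 2).sum(axis=2)
        labels = sq_dists.argmin(axis=1)
        # Update step: move each centroid to the mean of its samples.
        new_centroids = np.array([X[labels == k].mean(axis=0)
                                  for k in range(centroids.shape[0])])
        return labels, new_centroids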
@@ -753,33 +778,6 @@ classes according to some similarity metric.

.. currentmodule:: sklearn.metrics

Inertia
-------

Presentation and usage
~~~~~~~~~~~~~~~~~~~~~~

TODO: factorize inertia computation out of kmeans and then write me!


Advantages
~~~~~~~~~~

- No need for the ground truth knowledge of the "real" classes.

Drawbacks
~~~~~~~~~

- Inertia makes the assumption that clusters are convex and isotropic
which is not always the case especially of the clusters are manifolds
with weird shapes: for instance inertia is a useless metrics to evaluate
clustering algorithm that tries to identify nested circles on a 2D plane.

- Inertia is not a normalized metrics: we just know that lower values are
better and bounded by zero. One potential solution would be to adjust
inertia for random clustering (assuming the number of ground truth classes
is known).


Adjusted Rand index
-------------------
8 changes: 4 additions & 4 deletions doc/modules/linear_model.rst
@@ -268,11 +268,11 @@ They also tend to break when the problem is badly conditioned

Elastic Net
===========
:class:`ElasticNet` is a linear model trained with L1 and L2 prior as
regularizer. This combination allows for learning a sparse model where
:class:`ElasticNet` is a linear regression model trained with L1 and L2 prior
as regularizer. This combination allows for learning a sparse model where
few of the weights are non-zero like :class:`Lasso`, while still maintaining
the regularization properties of :class:`Ridge`. We control this tradeoff
using the `l1_ratio` parameter.
the regularization properties of :class:`Ridge`. We control the convex
combination of L1 and L2 using the `l1_ratio` parameter.
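
A hedged usage sketch (toy data and arbitrary parameter values, for illustration only):

.. code-block:: python

    from sklearn.linear_model import ElasticNet

    # l1_ratio blends the penalties: 1.0 gives a pure L1 (Lasso-like)
    # penalty, 0.0 a pure L2 (Ridge-like) one; values in between mix them.
    model = ElasticNet(alpha=0.1, l1_ratio=0.5)
    model.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
    print(model.coef_)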

Elastic-net is useful when there are multiple features which are
correlated with one another. Lasso is likely to pick one of these