Merge remote-tracking branch 'upstream/master'
Conflicts:
	sklearn/metrics/pairwise.py
duckworthd committed Jun 1, 2012
2 parents 9a566bb + 8e8d476 commit faa53bd
Showing 67 changed files with 2,958 additions and 1,108 deletions.
31 changes: 29 additions & 2 deletions .mailmap
@@ -1,8 +1,12 @@
Gael Varoquaux <gael.varoquaux@normalesup.org> gvaroquaux <gael.varoquaux@normalesup.org>
Gael Varoquaux <gael.varoquaux@normalesup.org> Gael varoquaux <gael.varoquaux@normalesup.org>
Gael Varoquaux <gael.varoquaux@normalesup.org> GaelVaroquaux <gael.varoquaux@normalesup.org>
Gael Varoquaux <gael.varoquaux@normalesup.org> Varoquaux <varoquau@normalesup.org>
Olivier Grisel <olivier.grisel@ensta.org> ogrisel <olivier.grisel@ensta.org>
Olivier Grisel <olivier.grisel@ensta.org> Olivier Grisel <ogrisel@turingcarpet.(none)>
Alexandre Gramfort <alexandre.gramfort@inria.fr> Alexandre Gramfort <alexandre.gramfort@gmail.com>
Alexandre Gramfort <alexandre.gramfort@inria.fr> Alexandre Gramfort <alexandre.gramfort@m4x.org>
Alexandre Gramfort <alexandre.gramfort@inria.fr> Alexandre Gramfort <gramfort@localhost.(none)>
Matthieu Perrot <matthieu.perrot@cea.fr> Matthieu Perrot <revilyo@earth.(none)>
Matthieu Perrot <matthieu.perrot@cea.fr> revilyo <revilyo@earth.(none)>
Vincent Michel <vincent.michel@inria.fr> vincent <vincent@vincent.org>
@@ -12,6 +16,7 @@ Vincent Michel <vincent.michel@inria.fr> Vincent M <vm.michel@gmail.com>
Vincent Michel <vincent.michel@inria.fr> Vincent Michel <vincent.michel@logilab.fr>
Vincent Michel <vincent.michel@inria.fr> Vincent M <vincent.michel@logilab.fr>
Vincent Michel <vincent.michel@inria.fr> Vincent michel <vmic@crater2.logilab.fr>
Vincent Michel <vincent.michel@inria.fr> Vincent Michel <vm.michel@gmail.com>
Ariel Rokem <arokem@berkeley.edu> arokem <arokem@berkeley.edu>
Bertrand Thirion <bertrand.thirion@inria.fr> bthirion <bertrand.thirion@inria.fr>
Peter Prettenhofer <peter.prettenhofer@gmail.com> pprett <peter.prettenhofer@gmail.com>
@@ -23,19 +28,41 @@ James Bergstra <james.bergstra@gmail.com> james.bergstra <james.bergstra@gmail.c
Xinfan Meng <mxf3306@gmail.com> mxf <mxf@chomsky.localdomain>
Jan Schlüter <scikit-learn@jan-schlueter.de> f0k <scikit-learn@jan-schlueter.de>
Vlad Niculae <vlad@vene.ro> vene <vlad@vene.ro>
Andreas Müller <amueller@ais.uni-bonn.de> amueller <amueller@ais.uni-bonn.de>
Virgile Fritsch <virgile.fritsch@gmail.com> VirgileFritsch <virgile.fritsch@gmail.com>
Virgile Fritsch <virgile.fritsch@gmail.com> Virgile <virgile.fritsch@gmail.com>
Virgile Fritsch <virgile.fritsch@gmail.com> Virgile <virgile@virgile-Precision-M4400.(none)>
Jean Kossaifi <jean.kossaifi@gmail.com> Jean KOSSAIFI <jkossaifi@is208616.intra.cea.fr>
Jean Kossaifi <jean.kossaifi@gmail.com> JeanKossaifi <jean.kossaifi@gmail.com>
Jake Vanderplas <vanderplas@astro.washington.edu> Jacob Vanderplas <jakevdp@yahoo.com>
Jean Kossaifi <jean.kossaifi@gmail.com> Jean Kossaifi <kossaifi@is208616.intra.cea.fr>
Jake VanderPlas <vanderplas@astro.washington.edu> Jacob Vanderplas <jakevdp@yahoo.com>
Jake VanderPlas <vanderplas@astro.washington.edu> Jake Vanderplas <jakevdp@yahoo.com>
Jake VanderPlas <vanderplas@astro.washington.edu> Jake Vanderplas <vanderplas@astro.washington.edu>
Andreas Mueller <amueller@ais.uni-bonn.de> Andy <amueller@ais.uni-bonn.de>
Andreas Mueller <amueller@ais.uni-bonn.de> andy <andy@marvin>
Andreas Mueller <amueller@ais.uni-bonn.de> Andreas Mueller <amueller@templateimage.ista.local>
Andreas Mueller <amueller@ais.uni-bonn.de> Andreas Müller <amueller@ais.uni-bonn.de>
Brian Holt <bh00038@cvplws63.eps.surrey.ac.uk> bdholt1 <bdholt1@gmail.com>
Brian Holt <bh00038@cvplws63.eps.surrey.ac.uk> Brian Holt <bdholt1@gmail.com>
Robert Layton <robertlayton@gmail.com> robertlayton <robertlayton@gmail.com>
Robert Layton <robertlayton@gmail.com> = <robertlayton@gmail.com>
Fabian Pedregosa <fabian@fseoane.net> Fabian Pedregosa <fabian.pedregosa@inria.fr>
Lars Buitinck <L.J.Buitinck@uva.nl> Lars Buitinck <larsmans@gmail.com>
Lars Buitinck <L.J.Buitinck@uva.nl> unknown <Lars@.(none)>
Lars Buitinck <L.J.Buitinck@uva.nl> Lars Buitinck <l.j.buitinck@uva.nl>
DraXus <draxus@gmail.com> draxus <draxus@hammer.ugr>
Edouard DUCHESNAY <ed203246@is206877.intra.cea.fr> Edouard Duchesnay <duchesnay@is143433.(none)>
Edouard DUCHESNAY <ed203246@is206877.intra.cea.fr> Edouard Duchesnay <edouard.duchesnay@gmail.com>
Edouard DUCHESNAY <ed203246@is206877.intra.cea.fr> duchesnay <edouard.duchesnay@gmail.com>
Edouard DUCHESNAY <ed203246@is206877.intra.cea.fr> duchesnay <edouard@is2206219.(none)>
Emmanuelle Gouillart <emmanuelle.gouillart@nsup.org> Emmanuelle Gouillart <emma@aleph.(none)>
Emmanuelle Gouillart <emmanuelle.gouillart@nsup.org> emmanuelle <emmanuelle.gouillart@nsup.org>
Gilles Louppe <g.louppe@gmail.com> Gilles Louppe <g.louppe@ulg.ac.be>
Nelle Varoquaux <nelle.varoquaux@gmail.com> Nelle Varoquaux <nelle@phgroup.com>
Nicolas Pinto <pinto@alum.mit.edu> Nicolas Pinto <pinto@mit.edu>
Olivier Hervieu <olivier.hervieu@gmail.com> Olivier Hervieu <olivier.hervieu@tinyclues.com>
Satrajit Ghosh <satra@mit.edu> Satrajit Ghosh <satrajit.ghosh@gmail.com>
Shiqiao Du <lucidfrontier.45@gmail.com> Shiqiao Du <s.du@freebit.net>
Shiqiao Du <lucidfrontier.45@gmail.com> Shiqiao <lucidfrontier.45@gmail.com>
Tim Sheerman-Chase <t.sheerman-chase@surrey.ac.uk> Tim Sheerman-Chase <ts00051@ts00051-desktop.(none)>
Vincent Schut <schut@sarvision.nl> Vincent Schut <vincent@TIMO.(none)>
iBayer <mane.desk@gmail.com> ibayer <mane.desk@gmail.com>
30 changes: 12 additions & 18 deletions doc/developers/index.rst
@@ -75,29 +75,24 @@ repository <http://github.com/scikit-learn/scikit-learn/>`__ on GitHub:

$ git clone git@github.com:YourLogin/scikit-learn.git

-4. Work on this copy, on your computer, using Git to do the version
-   control::
-
-       $ git add modified_files
-       $ git commit
-       $ git push origin master
-
-   and so on.
-
-If your changes are not just trivial fixes, it is better to directly
-work in a branch with the name of the feature you are working on. In
-this case, replace step 4 with step 5:
-
-5. Create a branch to host your changes and publish it on your public
-   repo::
-
-       $ git checkout -b my-feature
-       $ git add modified_files
-       $ git commit
-       $ git push origin my-feature
-
-When you are ready, and you have pushed your changes to your GitHub repo, go
-the web page of the repo, and click on 'Pull request' to send us a pull
-request. This will send an email to the committers, but might also send an
-email to the mailing list in order to get more visibility.
+4. Create a branch to hold your changes::
+
+       $ git checkout -b my-feature
+
+   and start making changes. Never work in the ``master`` branch!
+
+5. Work on this copy, on your computer, using Git to do the version
+   control. When you're done editing, do::
+
+       $ git add modified_files
+       $ git commit
+
+   to record your changes in Git, then push them to GitHub with::
+
+       $ git push -u origin my-feature
+
+Finally, go to the web page of your fork of the scikit-learn repo,
+and click 'Pull request' to send your changes to the maintainers for review.
+This will send an email to the committers, but might also send an
+email to the mailing list in order to get more visibility.

@@ -109,8 +104,7 @@ email to the mailing list in order to get more visibility.
to use instead of ``origin``. If we choose the name ``upstream`` for it, the
command will be::

-       $ git remote add upstream git@github.com:scikit-learn/scikit-learn.git
+       $ git remote add upstream https://github.com/scikit-learn/scikit-learn.git

(If any of the above seems like magic to you, then look up the
`Git documentation <http://git-scm.com/documentation>`_ on the web.)
15 changes: 14 additions & 1 deletion doc/modules/clustering.rst
@@ -259,6 +259,19 @@ function of the gradient of the image.
the spectral clustering solver chooses an arbitrary one, putting
the first sample alone in one bin.

.. warning:: Transforming distances into well-behaved similarities

Note that if the values of your similarity matrix are not well
distributed, e.g. with negative values, or if the matrix is a distance
matrix rather than a similarity matrix, the spectral problem will be
singular and not solvable. In that case, it is advised to apply a
transformation to the entries of the matrix. For instance, in the
case of a signed distance matrix, it is common to apply a heat kernel::

similarity = np.exp(-beta * distance / distance.std())

See the examples for such an application.
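
A minimal, self-contained sketch of this transformation (the data, ``beta``,
and the number of clusters here are illustrative, not prescribed by this
guide)::

    import numpy as np
    from sklearn.cluster import spectral_clustering
    from sklearn.metrics.pairwise import euclidean_distances

    # Illustrative data: two well-separated blobs.
    rng = np.random.RandomState(0)
    X = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + 5])

    # A raw distance matrix is not a valid similarity for spectral
    # clustering: large values mean "far apart", not "similar".
    distance = euclidean_distances(X)

    # Heat kernel: maps distances to (0, 1] similarities.
    beta = 1.0
    similarity = np.exp(-beta * distance / distance.std())

    labels = spectral_clustering(similarity, n_clusters=2)

Dividing by ``distance.std()`` makes ``beta`` roughly scale-free across
datasets.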

.. topic:: Examples:

* :ref:`example_cluster_plot_segmentation_toy.py`: Segmenting objects
Expand Down Expand Up @@ -894,7 +907,7 @@ cluster analysis.

>>> import numpy as np
>>> from sklearn.cluster import KMeans
>>> kmeans_model = KMeans(k=3, random_state=1).fit(X)
>>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
>>> labels = kmeans_model.labels_
>>> metrics.silhouette_score(X, labels, metric='euclidean')
... # doctest: +ELLIPSIS
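
A self-contained version of this snippet, assuming the iris data used
elsewhere in this guide::

    from sklearn import datasets, metrics
    from sklearn.cluster import KMeans

    X = datasets.load_iris().data
    kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
    labels = kmeans_model.labels_
    print metrics.silhouette_score(X, labels, metric='euclidean')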
2 changes: 1 addition & 1 deletion doc/modules/linear_model.rst
@@ -392,7 +392,7 @@ orthogonal matching pursuit can approximate the optimum solution vector with a
fixed number of non-zero elements:

.. math:: \text{arg\,min\,} ||y - X\gamma||_2^2 \text{ subject to } \
||\gamma||_0 \leq n_{nonzero_coefs}
||\gamma||_0 \leq n_{nonzero\_coefs}
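
A minimal sketch of this constrained fit with
:class:`OrthogonalMatchingPursuit` (the signal and sparsity level are
illustrative)::

    import numpy as np
    from sklearn.linear_model import OrthogonalMatchingPursuit

    # Illustrative signal: y combines 3 of the 50 columns (atoms) of X.
    rng = np.random.RandomState(0)
    X = rng.randn(100, 50)
    coef = np.zeros(50)
    coef[[5, 12, 33]] = [1., -2., .5]
    y = np.dot(X, coef)

    # Constrain the solution to at most 3 non-zero coefficients.
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=3).fit(X, y)
    print np.flatnonzero(omp.coef_)   # indices of the recovered atoms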

Alternatively, orthogonal matching pursuit can target a specific error instead
of a specific number of non-zero coefficients. This can be expressed as:
3 changes: 3 additions & 0 deletions doc/modules/sgd.rst
@@ -145,6 +145,9 @@ array of shape [n_classes, n_features] and `intercept_` is a one
dimensional array of shape [n_classes]. The i-th row of `coef_` holds
the weight vector of the OVA classifier for the i-th class; classes are
indexed in ascending order (see attribute `classes`).
Note that, in principle, since they allow the creation of a probability
model, `loss="log"` and `loss="modified_huber"` are more suitable for
one-vs-all classification.
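
As a minimal sketch on an illustrative binary toy problem (assuming
``predict_proba`` is available for ``loss="log"``)::

    from sklearn.linear_model import SGDClassifier

    X = [[0., 0.], [0., 1.], [2., 2.], [2., 3.]]
    y = [0, 0, 1, 1]

    # loss="log" fits a logistic model, so class membership
    # probabilities are available in addition to hard predictions.
    clf = SGDClassifier(loss="log").fit(X, y)
    print clf.predict_proba([[1., 1.]])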

:class:`SGDClassifier` supports both weighted classes and weighted
instances via the fit parameters `class_weight` and `sample_weight`. See
7 changes: 3 additions & 4 deletions doc/modules/svm.rst
@@ -352,10 +352,9 @@ Tips on Practical Use
set `cache_size` to a higher value than the default of 200(MB),
such as 500(MB) or 1000(MB).

* **Setting C**: In contrast to the scaling in LibSVM and LibLinear,
the ``C`` parameter in `sklearn.svm` is a per-sample penalty.
Commonly good values for ``C`` are often very large (i.e. ``10**4``)
and seldom below ``1``.
* **Setting C**: ``C`` is ``1`` by default, and that is a reasonable default
choice. If you have a lot of noisy observations you should decrease it:
decreasing ``C`` corresponds to more regularization of the estimate.

* Support Vector Machine algorithms are not scale invariant, so **it
is highly recommended to scale your data**. For example, scale each
2 changes: 1 addition & 1 deletion doc/themes/scikit-learn/layout.html
@@ -226,7 +226,7 @@ <h3>{{ _('This page') }}</h3>
{% trans last_updated=last_updated|e %}Last updated on {{ last_updated }}.{% endtrans %}
{%- endif %}
{%- if show_sphinx %}
{% trans sphinx_version=sphinx_version|e %}Created using <a href="http://sphinx.pocoo.org/">Sphinx</a> {{ sphinx_version }}{% endtrans %}. Design by <a href="http://webylimonada.com">Web y Limonada</a>.
{% trans sphinx_version=sphinx_version|e %}Created using <a href="http://sphinx.pocoo.org/">Sphinx</a> {{ sphinx_version }}{% endtrans %}. Design by <a href="http://desgrana.es">Desgrana</a>.
{%- endif %}
{%- if show_source and has_source and sourcename %}
<span style="padding-left: 5ex;">
12 changes: 12 additions & 0 deletions doc/tutorial/index.rst
@@ -35,3 +35,15 @@ Tutorials: From the bottom up with scikit-learn
.. note:: **Videos**

Videos with tutorials can also be found in the :ref:`videos` section.

.. note:: **Doctest Mode**

The code examples in the above tutorials are written in a
*python-console* format. If you wish to easily execute these examples
in **IPython**, use::

    %doctest_mode

in the IPython console. You can then simply copy and paste the examples
directly into IPython without having to worry about removing the **>>>**
prompts manually.
8 changes: 4 additions & 4 deletions doc/tutorial/statistical_inference/unsupervised_learning.rst
@@ -37,9 +37,9 @@ algorithms. The simplest clustering algorithm is the
>>> X_iris = iris.data
>>> y_iris = iris.target

>>> k_means = cluster.KMeans(k=3)
>>> k_means = cluster.KMeans(n_clusters=3)
>>> k_means.fit(X_iris) # doctest: +ELLIPSIS
KMeans(copy_x=True, init='k-means++', k=3, max_iter=300,...
KMeans(copy_x=True, init='k-means++', ...
>>> print k_means.labels_[::10]
[1 1 1 1 1 0 0 0 0 0 2 2 2 2 2]
>>> print y_iris[::10]
@@ -117,9 +117,9 @@ algorithms. The simplest clustering algorithm is the
... from scipy import misc
... lena = misc.lena()
>>> X = lena.reshape((-1, 1)) # We need an (n_sample, n_feature) array
>>> k_means = cluster.KMeans(k=5, n_init=1)
>>> k_means = cluster.KMeans(n_clusters=5, n_init=1)
>>> k_means.fit(X) # doctest: +ELLIPSIS
KMeans(copy_x=True, init='k-means++', k=5, ...
KMeans(copy_x=True, init='k-means++', ...
>>> values = k_means.cluster_centers_.squeeze()
>>> labels = k_means.labels_
>>> lena_compressed = np.choose(labels, values)
4 changes: 2 additions & 2 deletions examples/cluster/plot_cluster_comparison.py
@@ -70,9 +70,9 @@

# create clustering estimators
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(k=2)
two_means = cluster.MiniBatchKMeans(n_clusters=2)
ward_five = cluster.Ward(n_clusters=2, connectivity=connectivity)
spectral = cluster.SpectralClustering(k=2, mode='arpack')
spectral = cluster.SpectralClustering(n_clusters=2, mode='arpack')
dbscan = cluster.DBSCAN(eps=.2)
affinity_propagation = cluster.AffinityPropagation(damping=.9)

6 changes: 3 additions & 3 deletions examples/cluster/plot_cluster_iris.py
@@ -38,9 +38,9 @@
X = iris.data
y = iris.target

estimators = {'k_means_iris_3': KMeans(k=3),
'k_means_iris_8': KMeans(k=8),
'k_means_iris_bad_init': KMeans(k=3, n_init=1, init='random'),
estimators = {'k_means_iris_3': KMeans(n_clusters=3),
'k_means_iris_8': KMeans(n_clusters=8),
'k_means_iris_bad_init': KMeans(n_clusters=3, n_init=1, init='random'),
}


2 changes: 1 addition & 1 deletion examples/cluster/plot_color_quantization.py
@@ -51,7 +51,7 @@
print "Fitting estimator on a small sub-sample of the data"
t0 = time()
image_array_sample = shuffle(image_array, random_state=0)[:1000]
kmeans = KMeans(k=n_colors, random_state=0).fit(image_array_sample)
kmeans = KMeans(n_clusters=n_colors, random_state=0).fit(image_array_sample)
print "done in %0.3fs." % (time() - t0)

# Get labels for all points
2 changes: 1 addition & 1 deletion examples/cluster/plot_dbscan.py
@@ -28,7 +28,7 @@

##############################################################################
# Compute DBSCAN
db = DBSCAN().fit(S, eps=0.95, min_samples=10)
db = DBSCAN(eps=0.95, min_samples=10).fit(S)
core_samples = db.core_sample_indices_
labels = db.labels_

9 changes: 5 additions & 4 deletions examples/cluster/plot_kmeans_digits.py
@@ -72,16 +72,16 @@ def bench_k_means(estimator, name, data):
sample_size=sample_size),
)

bench_k_means(KMeans(init='k-means++', k=n_digits, n_init=10),
bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
name="k-means++", data=data)

bench_k_means(KMeans(init='random', k=n_digits, n_init=10),
bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
name="random", data=data)

# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
pca = PCA(n_components=n_digits).fit(data)
bench_k_means(KMeans(init=pca.components_, k=n_digits, n_init=1),
bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
name="PCA-based",
data=data)
print 79 * '_'
@@ -90,7 +90,8 @@ def bench_k_means(estimator, name, data):
# Visualize the results on PCA-reduced data

reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='k-means++', k=n_digits, n_init=10).fit(reduced_data)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02 # point in the mesh [x_min, x_max] x [y_min, y_max].
9 changes: 3 additions & 6 deletions examples/cluster/plot_kmeans_stability_low_dim_dense.py
@@ -87,11 +87,8 @@ def make_data(random_state, n_samples_per_center, grid_size, scale):
for run_id in range(n_runs):
X, y = make_data(run_id, n_samples_per_center, grid_size, scale)
for i, n_init in enumerate(n_init_range):
km = factory(k=n_clusters,
init=init,
random_state=run_id,
n_init=n_init,
**params).fit(X)
km = factory(n_clusters=n_clusters, init=init, random_state=run_id,
n_init=n_init, **params).fit(X)
inertia[i, run_id] = km.inertia_
p = pl.errorbar(n_init_range, inertia.mean(axis=1), inertia.std(axis=1))
plots.append(p[0])
@@ -105,7 +102,7 @@ def make_data(random_state, n_samples_per_center, grid_size, scale):
# Part 2: Qualitative visual inspection of the convergence

X, y = make_data(random_state, n_samples_per_center, grid_size, scale)
km = MiniBatchKMeans(k=n_clusters, init='random', n_init=1,
km = MiniBatchKMeans(n_clusters=n_clusters, init='random', n_init=1,
random_state=random_state).fit(X)

fig = pl.figure()
2 changes: 1 addition & 1 deletion examples/cluster/plot_lena_compress.py
@@ -33,7 +33,7 @@
from scipy import misc
lena = misc.lena()
X = lena.reshape((-1, 1)) # We need an (n_sample, n_feature) array
k_means = cluster.KMeans(k=n_clusters, n_init=4)
k_means = cluster.KMeans(n_clusters=n_clusters, n_init=4)
k_means.fit(X)
values = k_means.cluster_centers_.squeeze()
labels = k_means.labels_
2 changes: 1 addition & 1 deletion examples/cluster/plot_lena_segmentation.py
@@ -41,7 +41,7 @@
# Apply spectral clustering (this step goes much faster if you have pyamg
# installed)
N_REGIONS = 11
labels = spectral_clustering(graph, k=N_REGIONS)
labels = spectral_clustering(graph, n_clusters=N_REGIONS)
labels = labels.reshape(lena.shape)

###############################################################################
4 changes: 2 additions & 2 deletions examples/cluster/plot_mini_batch_kmeans.py
@@ -35,7 +35,7 @@
##############################################################################
# Compute clustering with KMeans

k_means = KMeans(init='k-means++', k=3, n_init=10)
k_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0
@@ -46,7 +46,7 @@
##############################################################################
# Compute clustering with MiniBatchKMeans

mbk = MiniBatchKMeans(init='k-means++', k=3, batch_size=batch_size,
mbk = MiniBatchKMeans(init='k-means++', n_clusters=3, batch_size=batch_size,
n_init=10, max_no_improvement=10, verbose=0)
t0 = time.time()
mbk.fit(X)
4 changes: 2 additions & 2 deletions examples/cluster/plot_segmentation_toy.py
@@ -70,7 +70,7 @@

# Force the solver to be arpack, since amg is numerically
# unstable on this example
labels = spectral_clustering(graph, k=4, mode='arpack')
labels = spectral_clustering(graph, n_clusters=4, mode='arpack')
label_im = -np.ones(mask.shape)
label_im[mask] = labels

@@ -88,7 +88,7 @@
graph = image.img_to_graph(img, mask=mask)
graph.data = np.exp(-graph.data / graph.data.std())

labels = spectral_clustering(graph, k=2, mode='arpack')
labels = spectral_clustering(graph, n_clusters=2, mode='arpack')
label_im = -np.ones(mask.shape)
label_im[mask] = labels

4 changes: 2 additions & 2 deletions examples/decomposition/plot_faces_decomposition.py
@@ -93,8 +93,8 @@ def plot_gallery(title, images):
True, False),

('Cluster centers - MiniBatchKMeans',
MiniBatchKMeans(k=n_components, tol=1e-3, batch_size=20, max_iter=50,
random_state=rng),
MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20,
max_iter=50, random_state=rng),
True, False)
]
