ENH: use Bayesian priors in Nearest Neighbors classifier (Issue 399) #970

Closed
29 changes: 26 additions & 3 deletions doc/modules/neighbors.rst
@@ -59,7 +59,7 @@ Nearest Neighbors Classification
Neighbors-based classification is a type of *instance-based learning* or
*non-generalizing learning*: it does not attempt to construct a general
internal model, but simply stores instances of the training data.
Classification is computed from a simple majority vote of the nearest
The basic classification is computed from a simple majority vote of the nearest
neighbors of each point: a query point is assigned the data class which
has the most representatives within the nearest neighbors of the point.

@@ -94,7 +94,25 @@ be accomplished through the ``weights`` keyword. The default value,
distance from the query point. Alternatively, a user-defined function of the
distance can be supplied which is used to compute the weights.


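For instance, a minimal sketch of a user-defined weighting callable (the
inverse-distance function below is purely illustrative, not a built-in
option)::

    from sklearn import neighbors

    def inverse_distance(dist):
        # dist is an array of neighbor distances; return an array of the
        # same shape containing the weights (an illustrative choice).
        return 1.0 / (dist + 1e-6)

    clf = neighbors.KNeighborsClassifier(n_neighbors=5,
                                         weights=inverse_distance)
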
There is a probabilistic interpretation of nearest neighbors classification:
a query point :math:`x` is assigned to the class
:math:`C_k` to which it has the highest probability of belonging. This
*posterior probability* is computed using Bayes' rule:
:math:`P(C_k \mid x) = \frac{P(x \mid C_k) P(C_k)}{P(x)}`.
The basic nearest neighbors classification (when ``class_prior='default'``)
uses a default *prior probability* :math:`P(C_k)` equal to the proportion of
training points which belong to class :math:`C_k`. In contrast, using
a flat prior (``class_prior='flat'``) assigns the same value (1 over the
number of classes) to each class prior probability :math:`P(C_k)`.
Alternatively, a user-defined list of class prior probabilities (in
increasing order of the class labels) can be supplied; it is then used to
classify the query points.

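A minimal usage sketch, assuming the ``class_prior`` keyword proposed in this
change (toy data, purely illustrative)::

    from sklearn import neighbors

    X = [[0], [1], [2], [3]]
    y = [0, 0, 1, 1]

    # 'flat' gives every class the same prior probability.
    clf = neighbors.KNeighborsClassifier(n_neighbors=3, class_prior='flat')
    clf.fit(X, y)

    # A user-defined prior, listed in increasing order of class labels.
    clf = neighbors.KNeighborsClassifier(n_neighbors=3,
                                         class_prior=[0.9, 0.1])
    clf.fit(X, y)
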
The second example below illustrates the effect of assigning a much greater
prior probability (0.8) to the first class (in red) than to the other two: in
regions where few data points appear, for example around the point (7, 4.5),
the model is more biased toward the red class than it was in the first
example.

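To see how a prior shifts the decision, consider the textbook derivation
(see the reference below), which assumes the classifier combines neighbor
counts and priors so that the posterior is proportional to
:math:`(K_k / N_k) \, P(C_k)`, where :math:`K_k` of the :math:`K` neighbors
and :math:`N_k` of the training points belong to class :math:`C_k`. With
illustrative numbers (not taken from the figure): 5 neighbors split 1 red /
4 green, 50 training points per class, and the prior ``[0.8, 0.1, 0.1]``,
the red class scores :math:`(1/50) \times 0.8 = 0.016` against
:math:`(4/50) \times 0.1 = 0.008` for green, so the query is labeled red even
though most of its neighbors are green; with the default prior the ordinary
majority vote (green) is recovered.
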
.. |classification_1| image:: ../auto_examples/neighbors/images/plot_classification_1.png
:target: ../auto_examples/neighbors/plot_classification.html
@@ -111,14 +129,19 @@ distance can be supplied which is used to compute the weights.
* :ref:`example_neighbors_plot_classification.py`: an example of
classification using nearest neighbors.

.. topic:: References:

* `Pattern Recognition and Machine Learning`,
Bishop, C.M., New York: Springer (2006), pp. 124-127

.. _regression:

Nearest Neighbors Regression
============================

Neighbors-based regression can be used in cases where the data labels are
continuous rather than discrete variables. The label assigned to a query
point is computed based the mean of the labels of its nearest neighbors.
point is computed based on the mean of the labels of its nearest neighbors.

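A minimal sketch of this averaging (toy data, purely illustrative)::

    from sklearn import neighbors

    X = [[0], [1], [2], [3]]
    y = [0.0, 0.0, 1.0, 1.0]

    reg = neighbors.KNeighborsRegressor(n_neighbors=2)
    reg.fit(X, y)
    reg.predict([[1.5]])  # mean of the two nearest labels -> 0.5
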
scikit-learn implements two different neighbors regressors:
:class:`KNeighborsRegressor` implements learning based on the :math:`k`
4 changes: 2 additions & 2 deletions doc/tutorial/statistical_inference/supervised_learning.rst
@@ -95,8 +95,8 @@ Scikit-learn documentation for more information about this type of classifier.)
>>> from sklearn.neighbors import KNeighborsClassifier
>>> knn = KNeighborsClassifier()
>>> knn.fit(iris_X_train, iris_y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, n_neighbors=5, p=2,
warn_on_equidistant=True, weights='uniform')
KNeighborsClassifier(algorithm='auto', class_prior='default', leaf_size=30,
n_neighbors=5, p=2, warn_on_equidistant=True, weights='uniform')
>>> knn.predict(iris_X_test)
array([1, 2, 1, 0, 0, 0, 2, 1, 2, 0])
>>> iris_y_test
10 changes: 6 additions & 4 deletions examples/neighbors/plot_classification.py
@@ -27,9 +27,11 @@
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

for weights in ['uniform', 'distance']:
for weights, class_prior in zip(['uniform', 'distance'],
['default', [0.8, 0.1, 0.1]]):
# we create an instance of Neighbours Classifier and fit the data.
clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights,
class_prior=class_prior)
clf.fit(X, y)

# Plot the decision boundary. For that, we will assign a color to each
@@ -47,8 +49,8 @@

# Plot also the training points
pl.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
pl.title("3-Class classification (k = %i, weights = '%s')"
% (n_neighbors, weights))
pl.title("3-Class classification (k = %i,\nweights = '%s', class_prior = '%s')"
% (n_neighbors, weights, class_prior))
pl.axis('tight')

pl.show()
41 changes: 37 additions & 4 deletions sklearn/neighbors/base.py
@@ -15,6 +15,7 @@
from ..base import BaseEstimator
from ..metrics import pairwise_distances
from ..utils import safe_asarray, atleast2d_or_csr
from ..utils.fixes import unique


class NeighborsWarning(UserWarning):
@@ -45,14 +46,14 @@ def _get_weights(dist, weights):
"""Get the weights from an array of distances and a parameter ``weights``

Parameters
===========
----------
dist: ndarray
The input distances
weights: {'uniform', 'distance' or a callable}
The kind of weighting used

Returns
========
-------
weights_arr: array of the same shape as ``dist``
if ``weights == 'uniform'``, then returns None
"""
@@ -68,6 +69,39 @@ def _get_weights(dist, weights):
raise ValueError("weights not recognized: should be 'uniform', "
"'distance', or a callable function")

def _check_class_prior(class_prior):
"""Check to make sure class prior is valid."""
if class_prior in (None, 'default', 'flat'):
return class_prior
elif isinstance(class_prior, (list, np.ndarray)):
return class_prior
else:
raise ValueError("class prior not recognized: should be 'default', "
"'flat', or a list or ndarray")

def _get_class_prior(y, class_prior):
"""Get class prior from targets ``y`` and parameter ``class_prior``

Parameters
----------
y : ndarray
The target labels, from 0 to ``n-1`` (thus ``n`` classes)
class_prior: {'default', 'flat', or an array-like}
The class prior probabilities to use

Returns
-------
class_prior_arr: array of the same shape as ``np.unique(y)``
"""
if class_prior in (None, 'default'):
return np.bincount(y).astype(float) / len(y)
Review comment (Member): since we normalize later, we can ignore the / len(y)

elif class_prior == 'flat':
return np.ones((len(np.unique(y)),)) / len(np.unique(y))
Review comment (Member): Since we normalise later, we can just return 1 here.

elif isinstance(class_prior, (list, np.ndarray)):
return class_prior
else:
raise ValueError("class prior not recognized: should be 'default', "
"'flat', or a list or ndarray")

class NeighborsBase(BaseEstimator):
"""Base class for nearest neighbors estimators."""
@@ -567,8 +601,7 @@ def fit(self, X, y):
y : {array-like, sparse matrix}, shape = [n_samples]
Target values, array of integer values.
"""
self._y = np.asarray(y)
self._classes = np.sort(np.unique(y))
self._classes, self._y = unique(y, return_inverse=True)
return self._fit(X)
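
# Illustrative note: assuming ``unique`` from utils.fixes mirrors
# ``np.unique(..., return_inverse=True)``, fitting on y = [2, 0, 2, 5]
# stores self._classes = array([0, 2, 5]) and self._y = array([1, 0, 1, 2]),
# i.e. the labels re-encoded as indices into the sorted class array.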

