diff --git a/doc/datasets/rcv1_fixture.py b/doc/datasets/rcv1_fixture.py index c409f2f937a64..75d93d7ec568e 100644 --- a/doc/datasets/rcv1_fixture.py +++ b/doc/datasets/rcv1_fixture.py @@ -1,7 +1,7 @@ """Fixture module to skip the datasets loading when offline The RCV1 data is rather large and some CI workers such as travis are -stateless hence will not cache the dataset as regular sklearn users would do. +stateless hence will not cache the dataset as regular scikit-learn users would do. The following will skip the execution of the rcv1.rst doctests if the proper environment variable is configured (see the source code of diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index c2b75d6630c04..c705d8dd93906 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -871,9 +871,9 @@ an integer called ``n_iter``. Rolling your own estimator ========================== If you want to implement a new estimator that is scikit-learn-compatible, -whether it is just for you or for contributing it to sklearn, there are several -internals of scikit-learn that you should be aware of in addition to the -sklearn API outlined above. You can check whether your estimator +whether it is just for you or for contributing it to scikit-learn, there are +several internals of scikit-learn that you should be aware of in addition to +the scikit-learn API outlined above. You can check whether your estimator adheres to the scikit-learn interface and standards by running :func:`utils.estimator_checks.check_estimator` on the class:: @@ -929,7 +929,7 @@ E.g., below is a custom classifier. For more information on this example, see get_params and set_params ------------------------- -All sklearn estimator have ``get_params`` and ``set_params`` functions. +All scikit-learn estimators have ``get_params`` and ``set_params`` functions. The ``get_params`` function takes no arguments and returns a dict of the ``__init__`` parameters of the estimator, together with their values. It must take one keyword argument, ``deep``, diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst index 52211a154d8db..fb408c4acd714 100644 --- a/doc/modules/gaussian_process.rst +++ b/doc/modules/gaussian_process.rst @@ -66,7 +66,7 @@ WhiteKernel component into the kernel, which can estimate the global noise level from the data (see example below). The implementation is based on Algorithm 2.1 of [RW2006]_. In addition to -the API of standard sklearn estimators, GaussianProcessRegressor: +the API of standard scikit-learn estimators, GaussianProcessRegressor: * allows prediction without prior fitting (based on the GP prior) @@ -164,7 +164,7 @@ than just predicting the mean. GPR on Mauna Loa CO2 data ------------------------- -This example is based on Section 5.4.3 of [RW2006]_. +This example is based on Section 5.4.3 of [RW2006]_. It illustrates an example of complex kernel engineering and hyperparameter optimization using gradient ascent on the log-marginal-likelihood. The data consists of the monthly average atmospheric @@ -602,11 +602,11 @@ References ---------- * `[RW2006] - `_ - **Gaussian Processes for Machine Learning**, - Carl Eduard Rasmussen and Christopher K.I. Williams, MIT Press 2006. - Link to an official complete PDF version of the book - `here `_ . + `_ + **Gaussian Processes for Machine Learning**, + Carl Eduard Rasmussen and Christopher K.I. Williams, MIT Press 2006. + Link to an official complete PDF version of the book + `here `_ . .. 
currentmodule:: sklearn.gaussian_process @@ -616,9 +616,9 @@ References Legacy Gaussian Processes ========================= -In this section, the implementation of Gaussian processes used in sklearn until -release 0.16.1 is described. Note that this implementation is deprecated and -will be removed in version 0.18. +In this section, the implementation of Gaussian processes used in scikit-learn +until release 0.16.1 is described. Note that this implementation is deprecated +and will be removed in version 0.18. An introductory regression example ---------------------------------- diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst index 4fc05bb05d23e..3c003e0d0cb50 100644 --- a/doc/modules/manifold.rst +++ b/doc/modules/manifold.rst @@ -59,10 +59,10 @@ interesting structure within the data will be lost. To address this concern, a number of supervised and unsupervised linear dimensionality reduction frameworks have been designed, such as Principal -Component Analysis (PCA), Independent Component Analysis, Linear -Discriminant Analysis, and others. These algorithms define specific +Component Analysis (PCA), Independent Component Analysis, Linear +Discriminant Analysis, and others. These algorithms define specific rubrics to choose an "interesting" linear projection of the data. -These methods can be powerful, but often miss important non-linear +These methods can be powerful, but often miss important non-linear structure in the data. @@ -91,7 +91,7 @@ from the data itself, without the use of predetermined classifications. * See :ref:`sphx_glr_auto_examples_manifold_plot_compare_methods.py` for an example of dimensionality reduction on a toy "S-curve" dataset. -The manifold learning implementations available in sklearn are +The manifold learning implementations available in scikit-learn are summarized below .. _isomap: @@ -121,13 +121,13 @@ The Isomap algorithm comprises three stages: nearest neighbors of :math:`N` points in :math:`D` dimensions. 2. **Shortest-path graph search.** The most efficient known algorithms - for this are *Dijkstra's Algorithm*, which is approximately + for this are *Dijkstra's Algorithm*, which is approximately :math:`O[N^2(k + \log(N))]`, or the *Floyd-Warshall algorithm*, which is :math:`O[N^3]`. The algorithm can be selected by the user with the ``path_method`` keyword of ``Isomap``. If unspecified, the code attempts to choose the best algorithm for the input data. -3. **Partial eigenvalue decomposition.** The embedding is encoded in the +3. **Partial eigenvalue decomposition.** The embedding is encoded in the eigenvectors corresponding to the :math:`d` largest eigenvalues of the :math:`N \times N` isomap kernel. For a dense solver, the cost is approximately :math:`O[d N^2]`. This cost can often be improved using @@ -191,7 +191,7 @@ The overall complexity of standard LLE is * :math:`d` : output dimension .. topic:: References: - + * `"Nonlinear dimensionality reduction by locally linear embedding" `_ Roweis, S. & Saul, L. Science 290:2323 (2000) @@ -221,7 +221,7 @@ It requires ``n_neighbors > n_components``. :target: ../auto_examples/manifold/plot_lle_digits.html :align: center :scale: 50 - + Complexity ---------- @@ -232,7 +232,7 @@ The MLLE algorithm comprises three stages: 2. **Weight Matrix Construction**. Approximately :math:`O[D N k^3] + O[N (k-D) k^2]`. The first term is exactly equivalent to that of standard LLE. The second term has to do with constructing the - weight matrix from multiple weights. 
In practice, the added cost of + weight matrix from multiple weights. In practice, the added cost of constructing the MLLE weight matrix is relatively small compared to the cost of steps 1 and 3. @@ -247,7 +247,7 @@ The overall complexity of MLLE is * :math:`d` : output dimension .. topic:: References: - + * `"MLLE: Modified Locally Linear Embedding Using Multiple Weights" `_ Zhang, Z. & Wang, J. @@ -271,7 +271,7 @@ It requires ``n_neighbors > n_components * (n_components + 3) / 2``. :target: ../auto_examples/manifold/plot_lle_digits.html :align: center :scale: 50 - + Complexity ---------- @@ -308,10 +308,10 @@ Spectral Embedding Spectral Embedding (also known as Laplacian Eigenmaps) is one method to calculate non-linear embedding. It finds a low dimensional representation of the data using a spectral decomposition of the graph Laplacian. -The graph generated can be considered as a discrete approximation of the -low dimensional manifold in the high dimensional space. Minimization of a -cost function based on the graph ensures that points close to each other on -the manifold are mapped close to each other in the low dimensional space, +The graph generated can be considered as a discrete approximation of the +low dimensional manifold in the high dimensional space. Minimization of a +cost function based on the graph ensures that points close to each other on +the manifold are mapped close to each other in the low dimensional space, preserving local distances. Spectral embedding can be performed with the function :func:`spectral_embedding` or its object-oriented counterpart :class:`SpectralEmbedding`. @@ -326,9 +326,9 @@ The Spectral Embedding algorithm comprises three stages: 2. **Graph Laplacian Construction**. unnormalized Graph Laplacian is constructed as :math:`L = D - A` for and normalized one as - :math:`L = D^{-\frac{1}{2}} (D - A) D^{-\frac{1}{2}}`. + :math:`L = D^{-\frac{1}{2}} (D - A) D^{-\frac{1}{2}}`. -3. **Partial Eigenvalue Decomposition**. Eigenvalue decomposition is +3. **Partial Eigenvalue Decomposition**. Eigenvalue decomposition is done on graph Laplacian The overall complexity of spectral embedding is @@ -342,7 +342,7 @@ The overall complexity of spectral embedding is .. topic:: References: * `"Laplacian Eigenmaps for Dimensionality Reduction - and Data Representation" + and Data Representation" `_ M. Belkin, P. Niyogi, Neural Computation, June 2003; 15 (6):1373-1396 @@ -354,7 +354,7 @@ Though not technically a variant of LLE, Local tangent space alignment (LTSA) is algorithmically similar enough to LLE that it can be put in this category. Rather than focusing on preserving neighborhood distances as in LLE, LTSA seeks to characterize the local geometry at each neighborhood via its -tangent space, and performs a global optimization to align these local +tangent space, and performs a global optimization to align these local tangent spaces to learn the embedding. LTSA can be performed with function :func:`locally_linear_embedding` or its object-oriented counterpart :class:`LocallyLinearEmbedding`, with the keyword ``method = 'ltsa'``. @@ -421,7 +421,7 @@ space and the similarities/dissimilarities. :target: ../auto_examples/manifold/plot_lle_digits.html :align: center :scale: 50 - + Let :math:`S` be the similarity matrix, and :math:`X` the coordinates of the :math:`n` input points. Disparities :math:`\hat{d}_{ij}` are transformation of @@ -456,7 +456,7 @@ order to avoid that, the disparities :math:`\hat{d}_{ij}` are normalized. 
:target: ../auto_examples/manifold/plot_mds.html :align: center :scale: 60 - + .. topic:: References: @@ -499,7 +499,7 @@ probabilities in the original space and the embedded space will be minimized by gradient descent. Note that the KL divergence is not convex, i.e. multiple restarts with different initializations will end up in local minima of the KL divergence. Hence, it is sometimes useful to try different seeds -and select the embedding with the lowest KL divergence. +and select the embedding with the lowest KL divergence. The disadvantages to using t-SNE are roughly: @@ -552,7 +552,7 @@ divergence will increase during optimization. More tips can be found in Laurens van der Maaten's FAQ (see references). The last parameter, angle, is a tradeoff between performance and accuracy. Larger angles imply that we can approximate larger regions by a single point,leading to better speed -but less accurate results. +but less accurate results. Barnes-Hut t-SNE ---------------- @@ -560,8 +560,8 @@ Barnes-Hut t-SNE The Barnes-Hut t-SNE that has been implemented here is usually much slower than other manifold learning algorithms. The optimization is quite difficult and the computation of the gradient is :math:`O[d N log(N)]`, where :math:`d` -is the number of output dimensions and :math:`N` is the number of samples. The -Barnes-Hut method improves on the exact method where t-SNE complexity is +is the number of output dimensions and :math:`N` is the number of samples. The +Barnes-Hut method improves on the exact method where t-SNE complexity is :math:`O[d N^2]`, but has several other notable differences: * The Barnes-Hut implementation only works when the target dimensionality is 3 diff --git a/doc/tutorial/statistical_inference/model_selection.rst b/doc/tutorial/statistical_inference/model_selection.rst index 475e1c5e5b385..ef3568ffb0bcd 100644 --- a/doc/tutorial/statistical_inference/model_selection.rst +++ b/doc/tutorial/statistical_inference/model_selection.rst @@ -207,7 +207,7 @@ Grid-search .. currentmodule:: sklearn.model_selection -The sklearn provides an object that, given data, computes the score +scikit-learn provides an object that, given data, computes the score during the fit of an estimator on a parameter grid and chooses the parameters to maximize the cross-validation score. This object takes an estimator during the construction and exposes an estimator API:: @@ -257,9 +257,9 @@ Cross-validated estimators ---------------------------- Cross-validation to set a parameter can be done more efficiently on an -algorithm-by-algorithm basis. This is why for certain estimators the -sklearn exposes :ref:`cross_validation` estimators that set their parameter -automatically by cross-validation:: +algorithm-by-algorithm basis. 
This is why, for certain estimators, +scikit-learn exposes :ref:`cross_validation` estimators that set their +parameter automatically by cross-validation:: >>> from sklearn import linear_model, datasets >>> lasso = linear_model.LassoCV() diff --git a/doc/tutorial/text_analytics/working_with_text_data_fixture.py b/doc/tutorial/text_analytics/working_with_text_data_fixture.py index c10f5c4f4d142..d620986d07156 100644 --- a/doc/tutorial/text_analytics/working_with_text_data_fixture.py +++ b/doc/tutorial/text_analytics/working_with_text_data_fixture.py @@ -1,7 +1,7 @@ """Fixture module to skip the datasets loading when offline The 20 newsgroups data is rather large and some CI workers such as travis are -stateless hence will not cache the dataset as regular sklearn users would do. +stateless hence will not cache the dataset as regular scikit-learn users would. The following will skip the execution of the working_with_text_data.rst doctests if the proper environment variable is configured (see the source code of diff --git a/examples/hetero_feature_union.py b/examples/hetero_feature_union.py index 0ec565a8f5431..0af24a3c05c34 100644 --- a/examples/hetero_feature_union.py +++ b/examples/hetero_feature_union.py @@ -51,7 +51,7 @@ class ItemSelector(BaseEstimator, TransformerMixin): >> len(data[key]) == n_samples - Please note that this is the opposite convention to sklearn feature + Please note that this is the opposite convention to scikit-learn feature matrixes (where the first index corresponds to sample). ItemSelector only requires that the collection implement getitem diff --git a/sklearn/covariance/tests/test_graph_lasso.py b/sklearn/covariance/tests/test_graph_lasso.py index f8da99ce3a5f8..bc2c8339da215 100644 --- a/sklearn/covariance/tests/test_graph_lasso.py +++ b/sklearn/covariance/tests/test_graph_lasso.py @@ -61,8 +61,8 @@ def test_graph_lasso(random_state=0): def test_graph_lasso_iris(): # Hard-coded solution from R glasso package for alpha=1.0 - # The iris datasets in R and sklearn do not match in a few places, these - # values are for the sklearn version + # The iris datasets in R and scikit-learn do not match in a few places, + # these values are for the scikit-learn version. cov_R = np.array([ [0.68112222, 0.0, 0.2651911, 0.02467558], [0.00, 0.1867507, 0.0, 0.00], diff --git a/sklearn/datasets/mldata.py b/sklearn/datasets/mldata.py index 3768435fe489b..1ab3edea91bde 100644 --- a/sklearn/datasets/mldata.py +++ b/sklearn/datasets/mldata.py @@ -103,7 +103,7 @@ def fetch_mldata(dataname, target_name='label', data_name='data', (150, 4) Load the 'leukemia' dataset from mldata.org, which needs to be transposed - to respects the sklearn axes convention: + to respect the scikit-learn axes convention: >>> leuk = fetch_mldata('leukemia', transpose_data=True, ...
data_home=test_data_home) @@ -205,7 +205,7 @@ def fetch_mldata(dataname, target_name='label', data_name='data', del dataset[col_names[1]] dataset['data'] = matlab_dict[col_names[1]] - # set axes to sklearn conventions + # set axes to scikit-learn conventions if transpose_data: dataset['data'] = dataset['data'].T if 'target' in dataset: diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 7b9c5c94e0cdf..f4bfd7e533894 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -152,8 +152,8 @@ def img_to_graph(img, mask=None, return_as=sparse.coo_matrix, dtype=None): Notes ----- - For sklearn versions 0.14.1 and prior, return_as=np.ndarray was handled - by returning a dense np.matrix instance. Going forward, np.ndarray + For scikit-learn versions 0.14.1 and prior, return_as=np.ndarray was + handled by returning a dense np.matrix instance. Going forward, np.ndarray returns an np.ndarray, as expected. For compatibility, user code relying on this method should wrap its @@ -188,8 +188,8 @@ def grid_to_graph(n_x, n_y, n_z=1, mask=None, return_as=sparse.coo_matrix, Notes ----- - For sklearn versions 0.14.1 and prior, return_as=np.ndarray was handled - by returning a dense np.matrix instance. Going forward, np.ndarray + For scikit-learn versions 0.14.1 and prior, return_as=np.ndarray was + handled by returning a dense np.matrix instance. Going forward, np.ndarray returns an np.ndarray, as expected. For compatibility, user code relying on this method should wrap its diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py index e04f2a6798b2e..24ff1b058ae8b 100644 --- a/sklearn/gaussian_process/gpr.py +++ b/sklearn/gaussian_process/gpr.py @@ -23,7 +23,8 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin): The implementation is based on Algorithm 2.1 of Gaussian Processes for Machine Learning (GPML) by Rasmussen and Williams. - In addition to standard sklearn estimator API, GaussianProcessRegressor: + In addition to the standard scikit-learn estimator API, + GaussianProcessRegressor: * allows prediction without prior fitting (based on the GP prior) * provides an additional method sample_y(X), which evaluates samples diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 530dc6068a32f..01863166953d6 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -61,7 +61,8 @@ def test_pairwise_distances(): Y_tuples = tuple([tuple([v for v in row]) for row in Y]) S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") assert_array_almost_equal(S, S2) - # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial. + # "cityblock" uses scikit-learn metric, cityblock (function) is + # scipy.spatial.
S = pairwise_distances(X, metric="cityblock") S2 = pairwise_distances(X, metric=cityblock) assert_equal(S.shape[0], S.shape[1]) @@ -78,7 +79,8 @@ def test_pairwise_distances(): S3 = manhattan_distances(X, Y, size_threshold=10) assert_array_almost_equal(S, S3) # Test cosine as a string metric versus cosine callable - # "cosine" uses sklearn metric, cosine (function) is scipy.spatial + # The string "cosine" uses the scikit-learn metric, + # while the function cosine is from scipy.spatial S = pairwise_distances(X, Y, metric="cosine") S2 = pairwise_distances(X, Y, metric=cosine) assert_equal(S.shape[0], X.shape[0]) @@ -330,7 +332,7 @@ def test_pairwise_distances_argmin_min(): assert_equal(type(Dsp), np.ndarray) assert_equal(type(Esp), np.ndarray) - # Non-euclidean sklearn metric + # Non-euclidean scikit-learn metric D, E = pairwise_distances_argmin_min(X, Y, metric="manhattan") D2 = pairwise_distances_argmin(X, Y, metric="manhattan") assert_array_almost_equal(D, [0, 1]) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 6f4be0dcc8ab7..3770168a0b18a 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -73,7 +73,7 @@ def predict(self, X=None): class VargEstimator(BaseEstimator): - """Sklearn estimators shouldn't have vargs.""" + """scikit-learn estimators shouldn't have vargs.""" def __init__(self, *vargs): pass diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index aaf174906f960..677c47595e3ed 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -220,7 +220,7 @@ def _yield_all_checks(name, Estimator): def check_estimator(Estimator): - """Check if estimator adheres to sklearn conventions. + """Check if estimator adheres to scikit-learn conventions. This estimator will run an extensive test-suite for input validation, shapes, etc.
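As a quick, self-contained illustration of the conventions touched by the last few hunks (the ``check_estimator`` test-suite and the ``get_params``/``set_params`` API), here is a minimal sketch that uses only estimators shipped with scikit-learn. It assumes a scikit-learn version from the era of this changeset, where ``check_estimator`` accepts the estimator class; later releases expect an instance instead::

    # Verify that an estimator adheres to the scikit-learn conventions.
    # check_estimator is silent on success and raises an AssertionError
    # when one of the checks (input validation, shapes, etc.) fails.
    from sklearn.svm import LinearSVC
    from sklearn.utils.estimator_checks import check_estimator

    check_estimator(LinearSVC)

    # Every scikit-learn estimator also exposes get_params / set_params,
    # which is the mechanism grid search uses to clone and reconfigure
    # estimators over a parameter grid.
    clf = LinearSVC()
    print(clf.get_params()["C"])   # default regularization strength: 1.0
    clf.set_params(C=10.0)

For a custom estimator, deriving from ``BaseEstimator`` provides ``get_params`` and ``set_params`` automatically, as the contributing guide above notes.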