[MRG+1] Fix return_norm bug in preprocessing.normalize (scikit-learn#…
luang008 authored and sergeyf committed Feb 28, 2017
1 parent 5762ab3 commit 442699d
Showing 3 changed files with 40 additions and 3 deletions.
5 changes: 5 additions & 0 deletions doc/whats_new.rst
@@ -85,6 +85,11 @@ Enhancements
do not set attributes on the estimator.
:issue:`7533` by :user:`Ekaterina Krivich <kiote>`.

- For sparse matrices, :func:`preprocessing.normalize` with ``return_norm=True``
  will now raise a ``NotImplementedError`` with 'l1' or 'l2' norm; with norm 'max'
  the norms returned will be the same as for dense matrices (:issue:`7771`).
  By `Ang Lu <https://github.com/luang008>`_.

Bug fixes
.........

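A minimal sketch of the behaviour described in the entry above (assuming a scikit-learn version that includes this change; not part of the diff):

import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize

X = sparse.csr_matrix(np.array([[3.0, 0.0, 4.0],
                                [1.0, 0.0, 0.0],
                                [2.0, 3.0, 0.0]]))

# With 'l1' or 'l2' norm, return_norm=True is not supported for sparse input.
try:
    normalize(X, norm='l2', return_norm=True)
except NotImplementedError as exc:
    print(exc)

# With norm='max', the returned norms now match the dense result.
_, norms_sparse = normalize(X, norm='max', return_norm=True)
_, norms_dense = normalize(X.toarray(), norm='max', return_norm=True)
print(norms_sparse)  # [4. 1. 3.]
print(norms_dense)   # [4. 1. 3.]
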
20 changes: 17 additions & 3 deletions sklearn/preprocessing/data.py
@@ -1325,6 +1325,16 @@ def normalize(X, norm='l2', axis=1, copy=True, return_norm=False):
return_norm : boolean, default False
whether to return the computed norms

Returns
-------
X : {array-like, sparse matrix}, shape [n_samples, n_features]
Normalized input X.
norms : array, shape [n_samples] if axis=1 else [n_features]
An array of norms along given axis for X.
When X is sparse, a NotImplementedError will be raised
for norm 'l1' or 'l2'.

See also
--------
Normalizer: Performs normalization using the ``Transformer`` API
@@ -1346,15 +1356,19 @@ def normalize(X, norm='l2', axis=1, copy=True, return_norm=False):
X = X.T

if sparse.issparse(X):
if return_norm and norm in ('l1', 'l2'):
raise NotImplementedError("return_norm=True is not implemented "
"for sparse matrices with norm 'l1' "
"or norm 'l2'")
if norm == 'l1':
inplace_csr_row_normalize_l1(X)
elif norm == 'l2':
inplace_csr_row_normalize_l2(X)
elif norm == 'max':
_, norms = min_max_axis(X, 1)
- norms = norms.repeat(np.diff(X.indptr))
- mask = norms != 0
- X.data[mask] /= norms[mask]
+ norms_elementwise = norms.repeat(np.diff(X.indptr))
+ mask = norms_elementwise != 0
+ X.data[mask] /= norms_elementwise[mask]
else:
if norm == 'l1':
norms = np.abs(X).sum(axis=1)
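As a rough illustration of the ``return_norm`` semantics documented in the hunk above (a sketch outside the diff, assuming a release that includes this change): with ``axis=1`` one norm is returned per sample, with ``axis=0`` one per feature.

import numpy as np
from sklearn.preprocessing import normalize

X = np.array([[3.0, 0.0, 4.0],
              [1.0, 0.0, 0.0],
              [2.0, 3.0, 0.0]])

# axis=1 (default): one norm per row, shape (n_samples,)
_, norms_rows = normalize(X, norm='l2', axis=1, return_norm=True)
print(norms_rows)        # [5. 1. 3.60555128]

# axis=0: one norm per column, shape (n_features,)
_, norms_cols = normalize(X, norm='l2', axis=0, return_norm=True)
print(norms_cols.shape)  # (3,)
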
18 changes: 18 additions & 0 deletions sklearn/preprocessing/tests/test_data.py
@@ -1315,6 +1315,24 @@ def test_normalize():

assert_array_almost_equal(row_sums, ones)

# Test return_norm
X_dense = np.array([[3.0, 0, 4.0], [1.0, 0.0, 0.0], [2.0, 3.0, 0.0]])
for norm in ('l1', 'l2', 'max'):
_, norms = normalize(X_dense, norm=norm, return_norm=True)
if norm == 'l1':
assert_array_almost_equal(norms, np.array([7.0, 1.0, 5.0]))
elif norm == 'l2':
assert_array_almost_equal(norms, np.array([5.0, 1.0, 3.60555127]))
else:
assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0]))

X_sparse = sparse.csr_matrix(X_dense)
for norm in ('l1', 'l2'):
assert_raises(NotImplementedError, normalize, X_sparse,
norm=norm, return_norm=True)
_, norms = normalize(X_sparse, norm='max', return_norm=True)
assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0]))


def test_binarizer():
X_ = np.array([[1, 0, 5], [2, 3, -1]])
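For reference, the expected arrays in the new test can be reproduced by hand with NumPy (a quick sanity check, not part of the commit):

import numpy as np

X_dense = np.array([[3.0, 0.0, 4.0], [1.0, 0.0, 0.0], [2.0, 3.0, 0.0]])

print(np.abs(X_dense).sum(axis=1))          # l1 norms:  [7. 1. 5.]
print(np.sqrt((X_dense ** 2).sum(axis=1)))  # l2 norms:  [5. 1. 3.60555128]
print(np.abs(X_dense).max(axis=1))          # max norms: [4. 1. 3.]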
