Skip to content
Browse files

FIX: make normalizer use the real l1 norm on each row (without assuming positive values)
  • Loading branch information...
  • Loading branch information...
1 parent 7b70505 commit c674b6cb8d2b35f6665fcb28c3d690f175da66b1 @ogrisel ogrisel committed Apr 27, 2011
View
2 scikits/learn/preprocessing/__init__.py
@@ -81,7 +81,7 @@ def fit(self, X, **params):
def transform(self, X, copy=True):
if copy:
X = X.copy()
- norms = X.sum(axis=1)[:, np.newaxis]
+ norms = np.abs(X).sum(axis=1)[:, np.newaxis]
@mblondel
scikit-learn member
mblondel added a note Apr 28, 2011

Too bad there isn't a np.abssum function. The above makes a copy of the dataset. Or likewise, too bad that linalg.norm doesn't have an axis argument.

@ogrisel
scikit-learn member
ogrisel added a note Apr 28, 2011

If we get rid of the sparse subpackage we can write a Cython function with fabs from math.h for the dense case as well.

@mblondel
scikit-learn member
mblondel added a note Apr 28, 2011
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
norms[norms == 0.0] = 1.0
X /= norms
View
4,045 scikits/learn/preprocessing/sparse/src/_preprocessing.c
2,543 additions, 1,502 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
View
13 scikits/learn/preprocessing/sparse/src/_preprocessing.pyx
@@ -8,6 +8,7 @@ import numpy.linalg as linalg
cimport cython
cdef extern from "math.h":
+ double fabs(double f)
double sqrt(double f)
ctypedef np.float64_t DOUBLE
@@ -17,6 +18,7 @@ ctypedef np.int32_t INTEGER
@cython.wraparound(False)
@cython.cdivision(True)
def normalize_axis1_sparse(X):
+ """Inplace row normalize using the l1 norm"""
cdef unsigned int n_samples = X.shape[0]
cdef unsigned int n_features = X.shape[1]
@@ -33,16 +35,17 @@ def normalize_axis1_sparse(X):
for i in xrange(n_samples):
sum_ = 0.0
- for j in xrange(X_indptr[i], X_indptr[i+1]):
- sum_ += X_data[j]
+ for j in xrange(X_indptr[i], X_indptr[i + 1]):
+ sum_ += fabs(X_data[j])
- for j in xrange(X_indptr[i], X_indptr[i+1]):
+ for j in xrange(X_indptr[i], X_indptr[i + 1]):
X_data[j] /= sum_
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
def normalize_length_axis1_sparse(X):
+ """Inplace row normalize using the l2 norm"""
cdef unsigned int n_samples = X.shape[0]
cdef unsigned int n_features = X.shape[1]
@@ -57,11 +60,11 @@ def normalize_length_axis1_sparse(X):
for i in xrange(n_samples):
sum_ = 0.0
- for j in xrange(X_indptr[i], X_indptr[i+1]):
+ for j in xrange(X_indptr[i], X_indptr[i + 1]):
sum_ += (X_data[j] * X_data[j])
sum_ = sqrt(sum_)
- for j in xrange(X_indptr[i], X_indptr[i+1]):
+ for j in xrange(X_indptr[i], X_indptr[i + 1]):
X_data[j] /= sum_
View
6 scikits/learn/preprocessing/tests/test_preprocessing.py
@@ -70,13 +70,15 @@ def test_normalizer():
X_norm = normalizer.transform(X, copy=True)
assert X_norm is not X
X_norm = toarray(X_norm)
- assert_array_almost_equal(X_norm.sum(axis=1), np.ones(X.shape[0]))
+ assert_array_almost_equal(
+ np.abs(X_norm).sum(axis=1), np.ones(X.shape[0]))
normalizer = klass()
X_norm = normalizer.transform(X, copy=False)
assert X_norm is X
X_norm = toarray(X_norm)
- assert_array_almost_equal(X_norm.sum(axis=1), np.ones(X.shape[0]))
+ assert_array_almost_equal(
+ np.abs(X_norm).sum(axis=1), np.ones(X.shape[0]))
def test_length_normalizer():

0 comments on commit c674b6c

Please sign in to comment.
Something went wrong with that request. Please try again.