diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py
index 78b44717422c0..e863f7d45c3e7 100644
--- a/sklearn/linear_model/logistic.py
+++ b/sklearn/linear_model/logistic.py
@@ -36,6 +36,10 @@ class LogisticRegression(BaseLibLinear, LinearClassifierMixin,
         Specifies if a constant (a.k.a. bias or intercept) should be
         added to the decision function.
 
+    normalize : boolean, optional, default False
+        If True, the training data X will be centered and scaled to
+        unit standard deviation before fitting.
+
     intercept_scaling : float, default: 1
         when self.fit_intercept is True, instance vector x becomes
         [x, self.intercept_scaling],
@@ -95,12 +99,13 @@ class LogisticRegression(BaseLibLinear, LinearClassifierMixin,
     """
 
     def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0,
-                 fit_intercept=True, intercept_scaling=1, class_weight=None,
-                 random_state=None):
+                 fit_intercept=True, normalize=False, intercept_scaling=1,
+                 class_weight=None, random_state=None):
 
         super(LogisticRegression, self).__init__(
             penalty=penalty, dual=dual, loss='lr', tol=tol, C=C,
-            fit_intercept=fit_intercept, intercept_scaling=intercept_scaling,
+            fit_intercept=fit_intercept, normalize=normalize,
+            intercept_scaling=intercept_scaling,
             class_weight=class_weight, random_state=random_state)
 
     def predict_proba(self, X):
diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py
index f3d11ed0581e1..e319317052671 100644
--- a/sklearn/linear_model/tests/test_logistic.py
+++ b/sklearn/linear_model/tests/test_logistic.py
@@ -155,3 +155,23 @@ def test_liblinear_random_state():
     lr2 = logistic.LogisticRegression(random_state=0)
     lr2.fit(X, y)
     assert_array_almost_equal(lr1.coef_, lr2.coef_)
+
+
+def test_normalize():
+    """Test the normalize option of LogisticRegression: predictions on
+    data that has already been normalized (normalize=False) must match
+    predictions on the raw data with normalize=True.
+    """
+    X, y = iris.data, iris.target
+    X_norm = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
+    for kwargs in (
+            {},
+            {'fit_intercept': False},
+            {'intercept_scaling': 0.01}):
+        lr1 = logistic.LogisticRegression(normalize=False, **kwargs)
+        lr1.fit(X_norm, y)
+        lr2 = logistic.LogisticRegression(normalize=True, **kwargs)
+        lr2.fit(X, y)
+        pred1 = lr1.predict_proba(X_norm)
+        pred2 = lr2.predict_proba(X)
+        assert_array_almost_equal(pred1, pred2)
diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py
index ccd05983ce5cd..b8eda3029a9ab 100644
--- a/sklearn/svm/base.py
+++ b/sklearn/svm/base.py
@@ -585,8 +585,9 @@ class BaseLibLinear(six.with_metaclass(ABCMeta, BaseEstimator)):
 
     @abstractmethod
     def __init__(self, penalty='l2', loss='l2', dual=True, tol=1e-4, C=1.0,
-                 multi_class='ovr', fit_intercept=True, intercept_scaling=1,
-                 class_weight=None, verbose=0, random_state=None):
+                 multi_class='ovr', fit_intercept=True, normalize=False,
+                 intercept_scaling=1, class_weight=None, verbose=0,
+                 random_state=None):
         self.penalty = penalty
         self.loss = loss
@@ -594,6 +595,7 @@ def __init__(self, penalty='l2', loss='l2', dual=True, tol=1e-4, C=1.0,
         self.tol = tol
         self.C = C
         self.fit_intercept = fit_intercept
+        self.normalize = normalize
         self.intercept_scaling = intercept_scaling
         self.multi_class = multi_class
         self.class_weight = class_weight
@@ -659,8 +661,8 @@ def fit(self, X, y):
             raise ValueError("The number of classes has to be greater than"
                              " one.")
 
-        X = atleast2d_or_csr(X, dtype=np.float64, order="C")
-
+        X = atleast2d_or_csr(X, dtype=np.float64, order="C",
+                             copy=self.normalize)
         self.class_weight_ = compute_class_weight(self.class_weight,
                                                   self.classes_, y)
@@ -677,6 +679,17 @@ def fit(self, X, y):
 
         # LibLinear wants targets as doubles, even for classification
         y = np.asarray(y, dtype=np.float64).ravel()
+
+        # Center and scale the data if self.normalize is True
+        if self.normalize:
+            if not sp.issparse(X):
+                X_mean, X_std = np.mean(X, axis=0), np.std(X, axis=0)
+                X -= X_mean
+                X /= X_std
+            else:
+                warnings.warn("Normalize option doesn't support"
+                              " sparse matrices")
+
         self.raw_coef_ = liblinear.train_wrap(X, y,
                                               sp.isspmatrix(X),
                                               self._get_solver_type(),
@@ -696,6 +709,10 @@ def fit(self, X, y):
             self.coef_ = self.raw_coef_
             self.intercept_ = 0.
 
+        if self.normalize and not sp.issparse(X):
+            self.coef_ = self.coef_ / X_std
+            self.intercept_ = self.intercept_ - np.dot(X_mean, self.coef_.T)
+
         if self.multi_class == "crammer_singer" and len(self.classes_) == 2:
             self.coef_ = (self.coef_[1] - self.coef_[0]).reshape(1, -1)
             if self.fit_intercept: