Skip to content

Loading…

[WIP] sample_weight support in DummyRegressor #3429

Open
wants to merge 8 commits into from

1 participant

@RolT

Added sample-weight support to the DummyRegressor.
No tests have been written yet, so I am not sure that it works.
For now, the weighted and uniform strategies are implemented separately, but I should merge them in the future.

@jnothman jnothman changed the title from [WIP] Weights to [WIP] sample_weight support in DummyRegressor
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Showing with 175 additions and 7 deletions.
  1. +41 −7 sklearn/dummy.py
  2. +35 −0 sklearn/tests/test_dummy.py
  3. +99 −0 sklearn/utils/weighted_quantiles.py
View
48 sklearn/dummy.py
@@ -11,9 +11,12 @@
from .utils import check_random_state
from .utils.validation import safe_asarray
from sklearn.utils import deprecated
+from scipy import stats
+from .utils import weighted_quantiles
class DummyClassifier(BaseEstimator, ClassifierMixin):
+
"""
DummyClassifier is a classifier that makes predictions using simple rules.
@@ -283,6 +286,7 @@ def predict_log_proba(self, X):
class DummyRegressor(BaseEstimator, RegressorMixin):
+
"""
DummyRegressor is a regressor that makes predictions using
simple rules.
@@ -299,11 +303,17 @@ class DummyRegressor(BaseEstimator, RegressorMixin):
* "median": always predicts the median of the training set
* "constant": always predicts a constant value that is provided by
the user.
+ * "quantile": always predicts a quantile of the training set; which
+ quantile is chosen by the user through the `alpha` parameter.
constant : int or float or array of shape = [n_outputs]
The explicit constant as predicted by the "constant" strategy. This
parameter is useful only for the "constant" strategy.
+ alpha : float, optional.
+ The parameter for the quantile strategy, ranging from 0 to 1.
+ For instance, alpha = 0.5 will calculate the median.
+
Attributes
----------
`constant_` : float or array of shape [n_outputs]
@@ -317,9 +327,10 @@ class DummyRegressor(BaseEstimator, RegressorMixin):
True if the output at fit is 2d, else false.
"""
- def __init__(self, strategy="mean", constant=None):
+ def __init__(self, strategy="mean", constant=None, alpha=None):
self.strategy = strategy
self.constant = constant
+ self.alpha = alpha
@property
@deprecated('This will be removed in version 0.17')
@@ -328,7 +339,7 @@ def y_mean_(self):
return self.constant_
raise AttributeError
- def fit(self, X, y):
+ def fit(self, X, y, sample_weight=None):
"""Fit the random regressor.
Parameters
@@ -340,25 +351,34 @@ def fit(self, X, y):
y : array-like, shape = [n_samples] or [n_samples, n_outputs]
Target values.
+ sample_weight : array-like of shape = [n_samples], optional
+ Sample weights.
+
Returns
-------
self : object
Returns self.
"""
- if self.strategy not in ("mean", "median", "constant"):
+ if self.strategy not in ("mean", "median", "constant", "quantile"):
raise ValueError("Unknown strategy type: %s, "
- "expected 'mean', 'median' or 'constant'"
- % self.strategy)
+ "expected 'mean', 'median', 'constant'"
+ "or 'quantile'" % self.strategy)
y = safe_asarray(y)
self.output_2d_ = (y.ndim == 2)
if self.strategy == "mean":
- self.constant_ = np.reshape(np.mean(y, axis=0), (1, -1))
+ self.constant_ = np.reshape(np.average(y, axis=0,
+ weights=sample_weight),
+ (1, -1))
elif self.strategy == "median":
- self.constant_ = np.reshape(np.median(y, axis=0), (1, -1))
+ if not sample_weight:
+ self.constant_ = np.reshape(np.median(y, axis=0), (1, -1))
+ else:
+ self.constant_ = np.reshape(
+ weighted_quantiles(y, sample_weight, 0.5), (1, -1))
elif self.strategy == "constant":
if self.constant is None:
@@ -374,6 +394,20 @@ def fit(self, X, y):
self.constant_ = np.reshape(self.constant, (1, -1))
+ elif self.strategy == "quantile":
+ if not 0 < self.alpha < 1.0:
+ raise ValueError("`alpha` must be in (0, 1.0) but was %r"
+ % self.alpha)
+ else:
+ if not sample_weight:
+ self.constant_ = np.reshape(
+ stats.scoreatpercentile(y, self.alpha * 100.0, axis=0),
+ (1, -1))
+ else:
+ self.constant_ = np.reshape(
+ weighted_quantiles(y, sample_weight, self.alpha),
+ (1, -1))
+
self.n_outputs_ = np.size(self.constant_) # y.shape[1] is not safe
return self
View
35 sklearn/tests/test_dummy.py
@@ -9,6 +9,7 @@
from sklearn.utils.testing import assert_raises
from sklearn.dummy import DummyClassifier, DummyRegressor
+from scipy import stats
def _check_predict_proba(clf, X, y):
@@ -397,3 +398,37 @@ def test_classification_sample_weight():
clf = DummyClassifier().fit(X, y, sample_weight)
assert_array_almost_equal(clf.class_prior_, [0.2 / 1.2, 1. / 1.2])
+
+
def test_y_quantile_attribute_regressor():
    """The "quantile" strategy stores the requested percentile of y.

    Fits a DummyRegressor with ``alpha=0.9`` and no sample weights, then
    compares ``constant_`` with scipy's 90th percentile of the same targets.
    """
    targets = [1, 2, 4, 6, 8]
    features = [[0]] * 5

    regressor = DummyRegressor(strategy='quantile', alpha=0.9)
    regressor.fit(features, targets)

    expected = stats.scoreatpercentile(targets, 90.0)
    assert_equal(regressor.constant_, expected)
+
+
def test_quantile_strategy_multioutput_regressor():
    """Exercise the "quantile" strategy with a 2-D (multi-output) target.

    The reference value is the 80th percentile of the training targets
    taken along axis 0, reshaped to a single row; it is handed to the
    file's shared regressor checks together with the predictions.
    """
    rng = np.random.RandomState(seed=1)

    # The draw order (learn X, learn y, test X, test y) fixes the data
    # produced by the seeded generator, so it must not be shuffled.
    X_learn = rng.randn(10, 10)
    y_learn = rng.randn(10, 5)
    X_test = rng.randn(20, 10)
    y_test = rng.randn(20, 5)

    # Reference value computed independently of the estimator.
    percentiles = stats.scoreatpercentile(y_learn, 80.0, axis=0)
    expected = np.reshape(percentiles, (1, -1))

    # Correctness oracle
    est = DummyRegressor(strategy="quantile", alpha=0.8)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(expected, y_learn, y_pred_learn,
                              y_test, y_pred_test)
    _check_behavior_2d(est)
View
99 sklearn/utils/weighted_quantiles.py
@@ -0,0 +1,99 @@
+"""
+Library to compute weighted quantiles, including the weighted median, of
+numpy arrays.
+
+Copyright (c) 2014 José Sabater
+https://github.com/nudomarinero/wquantiles
+
+Slightly modified by Roland Thiolliere
+"""
+from __future__ import print_function
+import numpy as np
+
+__version__ = "0.3"
+
+
def quantile_1D(data, weights, quantile):
    """
    Compute the weighted quantile of a 1D numpy array.

    The data are sorted and each sorted point ``i`` is placed at the
    cumulative position ``(S_i - 0.5 * w_i) / sum(w)``, where ``S_i`` is the
    running sum of the sorted weights. The requested quantile is then
    linearly interpolated between those positions with ``np.interp``, so a
    quantile lying before the first position (or after the last one)
    yields the smallest (or largest) data value.

    Parameters
    ----------
    data : array-like
        Input array (one dimension).
    weights : array-like
        Array with the weights of the same size of `data`.
    quantile : float
        Quantile to compute. It must have a value between 0 and 1.

    Returns
    -------
    quantile_1D : float
        The output value.

    Raises
    ------
    TypeError
        If `data` or `weights` is not one dimensional, or if their shapes
        differ.
    ValueError
        If `quantile` is outside [0, 1], or if the weights sum to zero
        (which includes empty input).
    """
    # Check the data. 2-D inputs (np.matrix included) are rejected by the
    # dimensionality checks below.
    data = np.asarray(data)
    weights = np.asarray(weights)
    if data.ndim != 1:
        raise TypeError("data must be a one dimensional array")
    if weights.ndim != 1:
        raise TypeError("weights must be a one dimensional array")
    if data.shape != weights.shape:
        raise TypeError("the length of data and weights must be the same")
    if quantile > 1. or quantile < 0.:
        raise ValueError("quantile must have a value between 0. and 1.")

    # Sort the data and carry the weights along.
    order = np.argsort(data)
    sorted_data = data[order]
    sorted_weights = weights[order]

    # A zero total would turn the normalisation below into 0/0 and make
    # the interpolation operate on NaN positions.
    total_weight = np.sum(sorted_weights)
    if total_weight == 0:
        raise ValueError("the weights must not sum to zero")

    # Mid-point cumulative position of every sorted sample, in [0, 1].
    cumulative = np.cumsum(sorted_weights)
    positions = (cumulative - 0.5 * sorted_weights) / total_weight

    # Interpolate the data value at the requested position.
    return np.interp(quantile, positions, sorted_data)
+
+
def quantile(data, weights, quantile, axis=-1):
    """
    Weighted quantile of an array along one axis (the last one by default).

    Parameters
    ----------
    data : array-like
        Input array with at least one dimension.
    weights : array-like
        One dimensional array with the weights. It must have the same size
        as `data` along `axis`.
    quantile : float
        Quantile to compute. It must have a value between 0 and 1.
    axis : int, optional (default=-1)
        Axis of `data` along which the quantile is computed. Ignored when
        `data` is one dimensional.

    Returns
    -------
    quantile : float or ndarray
        A single value for one dimensional `data`; otherwise an array with
        the shape of `data` minus the reduced axis.

    Raises
    ------
    TypeError
        If `data` has no dimension. Errors raised by `quantile_1D` for
        each 1-D slice propagate unchanged.
    """
    # Accept plain sequences as well as arrays.
    data = np.asarray(data)
    if data.ndim == 0:
        # This exception used to be built without `raise`, so scalar input
        # silently produced None.
        raise TypeError("data must have at least one dimension")
    if data.ndim == 1:
        return quantile_1D(data, weights, quantile)
    # apply_along_axis feeds every 1-D slice taken along `axis` to
    # quantile_1D (forwarding weights and quantile) and drops that axis
    # from the result, so no manual reshape is needed.
    return np.apply_along_axis(quantile_1D, axis, data, weights, quantile)
+
+
def median(data, weights):
    """
    Weighted median of an array.

    Convenience alias: forwards `data` and `weights` to `quantile` with the
    quantile fixed at 0.5, so axis handling and validation are whatever
    `quantile` provides.
    """
    half = 0.5
    return quantile(data, weights, half)
Something went wrong with that request. Please try again.