[MRG+1] Add sample weights support to kernel density estimation (fix #4394) #10803

Merged on Jun 26, 2018. Changes shown from all 19 commits.

Commits
a78cfa6
Add sample weights support to kernel density estimation (fix #4394)
samronsin Mar 13, 2018
5615e51
Add tests for the equivalence of sample weights and repetition for in…
samronsin Mar 14, 2018
1b6dd8e
Fix integer division in new tests (Python 3)
samronsin Mar 14, 2018
92bd0f7
Add tests for neutrality of constant sample weights
samronsin Mar 18, 2018
276ff69
Add checks for sample_weight parameter (shape, positive values)
samronsin Mar 18, 2018
7a6113d
Add automatic conversion to ndarray for sample_weight
samronsin Mar 18, 2018
c2011a6
Add tests for sampling along with a minor change in the random genera…
samronsin Mar 26, 2018
ea60225
Address comments from review
samronsin Apr 14, 2018
38e4b0e
Make Travis happy
samronsin Apr 15, 2018
49f5a95
Introduce n_samples_test
samronsin Apr 16, 2018
2a26843
Use check_consistent_length to check length consistency
samronsin May 28, 2018
ab6d4ad
Define sum_weight in all cases and get rid of redundant variable Z
samronsin May 29, 2018
8ff81e5
Improve maintainability by simplifying branching at the cost of ~10-15% …
samronsin Jun 3, 2018
452333b
Add inline function to compute the total weight of a node
samronsin Jun 4, 2018
45c5289
Shrink size of tests (5x speedup)
samronsin Jun 4, 2018
0ac4bf3
Address misc suggestions
samronsin Jun 4, 2018
3014ecd
Add test for negative values of sample_weights
samronsin Jun 26, 2018
2b9c007
Update whats_new.rst
samronsin Jun 26, 2018
3aa1dee
Merge branch 'master' into sample-weights-in-KDE
jnothman Jun 26, 2018
5 changes: 5 additions & 0 deletions doc/whats_new/v0.20.rst
@@ -206,6 +206,11 @@ Classifiers and regressors
only require X to be an object with finite length or shape.
:issue:`9832` by :user:`Vrishank Bhardwaj <vrishank97>`.

- Add `sample_weight` parameter to the fit method of
  :class:`neighbors.KernelDensity` to enable weighting in kernel density
  estimation.
  :issue:`4394` by :user:`Samuel O. Ronsin <samronsin>`.

- :class:`neighbors.RadiusNeighborsRegressor` and
:class:`neighbors.RadiusNeighborsClassifier` are now
parallelized according to ``n_jobs`` regardless of ``algorithm``.
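For readers skimming the changelog, here is a minimal usage sketch of the new parameter (the toy data and bandwidth are made up for illustration; requires scikit-learn >= 0.20):

    import numpy as np
    from sklearn.neighbors import KernelDensity

    X = np.array([[0.0], [1.0], [2.0]])
    w = np.array([1.0, 3.0, 1.0])  # weight the middle point 3x

    kde = KernelDensity(kernel='gaussian', bandwidth=0.5)
    kde.fit(X, sample_weight=w)
    log_density = kde.score_samples(np.array([[1.0]]))

Fitting with weights behaves like repeating each point in proportion to its weight, which is exactly the equivalence the new tests assert (commit 5615e51).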
29 changes: 23 additions & 6 deletions sklearn/neighbors/ball_tree.pyx
Expand Up @@ -62,17 +62,34 @@ cdef int init_node(BinaryTree tree, ITYPE_t i_node,
cdef DTYPE_t* data = &tree.data[0, 0]
cdef DTYPE_t* centroid = &tree.node_bounds[0, i_node, 0]

cdef bint with_sample_weight = tree.sample_weight is not None
cdef DTYPE_t* sample_weight
cdef DTYPE_t sum_weight_node
if with_sample_weight:
sample_weight = &tree.sample_weight[0]

# determine Node centroid
for j in range(n_features):
centroid[j] = 0

for i in range(idx_start, idx_end):
this_pt = data + n_features * idx_array[i]
for j from 0 <= j < n_features:
centroid[j] += this_pt[j]
if with_sample_weight:
sum_weight_node = 0
for i in range(idx_start, idx_end):
sum_weight_node += sample_weight[idx_array[i]]
this_pt = data + n_features * idx_array[i]
for j from 0 <= j < n_features:
centroid[j] += this_pt[j] * sample_weight[idx_array[i]]

for j in range(n_features):
centroid[j] /= n_points
for j in range(n_features):
centroid[j] /= sum_weight_node
else:
for i in range(idx_start, idx_end):
this_pt = data + n_features * idx_array[i]
for j from 0 <= j < n_features:
centroid[j] += this_pt[j]

for j in range(n_features):
centroid[j] /= n_points

# determine Node radius
radius = 0
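The weighted branch above computes the node centroid as a weighted mean, sum(w_i * x_i) / sum(w_i), falling back to the plain mean when no weights are given. An illustrative NumPy equivalent (the Cython code works on raw pointers for speed):

    import numpy as np

    def node_centroid(points, weights=None):
        # points: (n_points, n_features) array of one node's points
        if weights is None:
            return points.mean(axis=0)
        # weighted mean; same as np.average(points, axis=0, weights=weights)
        return (weights[:, None] * points).sum(axis=0) / weights.sum()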
103 changes: 88 additions & 15 deletions sklearn/neighbors/binary_tree.pxi
Expand Up @@ -1008,11 +1008,14 @@ VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS)
cdef class BinaryTree:

cdef np.ndarray data_arr
cdef np.ndarray sample_weight_arr
cdef np.ndarray idx_array_arr
cdef np.ndarray node_data_arr
cdef np.ndarray node_bounds_arr

cdef readonly DTYPE_t[:, ::1] data
cdef readonly DTYPE_t[::1] sample_weight
cdef public DTYPE_t sum_weight
cdef public ITYPE_t[::1] idx_array
cdef public NodeData_t[::1] node_data
cdef public DTYPE_t[:, :, ::1] node_bounds
@@ -1036,11 +1039,13 @@ cdef class BinaryTree:
# errors and seg-faults in rare cases where __init__ is not called
def __cinit__(self):
self.data_arr = np.empty((1, 1), dtype=DTYPE, order='C')
self.sample_weight_arr = np.empty(1, dtype=DTYPE, order='C')
self.idx_array_arr = np.empty(1, dtype=ITYPE, order='C')
self.node_data_arr = np.empty(1, dtype=NodeData, order='C')
self.node_bounds_arr = np.empty((1, 1, 1), dtype=DTYPE)

self.data = get_memview_DTYPE_2D(self.data_arr)
self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr)
self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr)
self.node_data = get_memview_NodeData_1D(self.node_data_arr)
self.node_bounds = get_memview_DTYPE_3D(self.node_bounds_arr)
@@ -1057,10 +1062,11 @@ cdef class BinaryTree:
self.n_calls = 0

def __init__(self, data,
leaf_size=40, metric='minkowski', **kwargs):
leaf_size=40, metric='minkowski', sample_weight=None, **kwargs):
self.data_arr = np.asarray(data, dtype=DTYPE, order='C')
self.data = get_memview_DTYPE_2D(self.data_arr)


self.leaf_size = leaf_size
self.dist_metric = DistanceMetric.get_metric(metric, **kwargs)
self.euclidean = (self.dist_metric.__class__.__name__
@@ -1082,6 +1088,16 @@ cdef class BinaryTree:
n_samples = self.data.shape[0]
n_features = self.data.shape[1]


if sample_weight is not None:
self.sample_weight_arr = np.asarray(sample_weight, dtype=DTYPE, order='C')
self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr)
self.sum_weight = np.sum(self.sample_weight)
else:
self.sample_weight = None
self.sum_weight = <DTYPE_t> n_samples


# determine number of levels in the tree, and from this
# the number of nodes in the tree. This results in leaf nodes
# with numbers of points between leaf_size and 2 * leaf_size
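The collapsed lines below this comment size the tree; here is a sketch of the relationship the comment describes (the exact expression is an assumption, since those lines are folded in this diff):

    import numpy as np

    def tree_layout(n_samples, leaf_size=40):
        # enough levels that each leaf holds between leaf_size
        # and 2 * leaf_size points
        n_levels = int(np.log2(max(1, (n_samples - 1) / leaf_size)) + 1)
        n_nodes = 2 ** n_levels - 1  # nodes of a complete binary tree
        return n_levels, n_nodes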
@@ -1654,10 +1670,10 @@ cdef class BinaryTree:
for i in range(Xarr.shape[0]):
min_max_dist(self, 0, pt, &dist_LB, &dist_UB)
# compute max & min bounds on density within top node
log_min_bound = (log(n_samples) +
log_min_bound = (log(self.sum_weight) +
compute_log_kernel(dist_UB,
h_c, kernel_c))
log_max_bound = (log(n_samples) +
log_max_bound = (log(self.sum_weight) +
compute_log_kernel(dist_LB,
h_c, kernel_c))
log_bound_spread = logsubexp(log_max_bound, log_min_bound)
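Seeding the bounds with log(self.sum_weight) instead of log(n_samples) matches the weighted quantity the traversal brackets. A brute-force sketch of that quantity (Gaussian kernel assumed for concreteness; the real compute_log_kernel also covers the other kernels):

    import numpy as np
    from scipy.special import logsumexp

    def weighted_log_kde(query, data, weights, h):
        # log( sum_i w_i * K(dist(query, x_i); h) ), the unnormalized
        # weighted density that the tree traversal bounds
        d = np.sqrt(((data - query) ** 2).sum(axis=1))
        log_k = -0.5 * (d / h) ** 2  # unnormalized Gaussian log-kernel
        return logsumexp(log_k + np.log(weights))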
@@ -2124,14 +2140,24 @@ cdef class BinaryTree:
# keep track of the global bounds on density. The procedure here is
# to split nodes, updating these bounds, until the bounds are within
# atol & rtol.
cdef ITYPE_t i, i1, i2, N1, N2, i_node
cdef ITYPE_t i, i1, i2, i_node
cdef DTYPE_t N1, N2
cdef DTYPE_t global_log_min_bound, global_log_bound_spread
cdef DTYPE_t global_log_max_bound

cdef DTYPE_t* data = &self.data[0, 0]
cdef bint with_sample_weight = self.sample_weight is not None
cdef DTYPE_t* sample_weight
if with_sample_weight:
sample_weight = &self.sample_weight[0]
cdef ITYPE_t* idx_array = &self.idx_array[0]
cdef NodeData_t* node_data = &self.node_data[0]
cdef ITYPE_t N = self.data.shape[0]
cdef DTYPE_t N
cdef DTYPE_t log_weight
if with_sample_weight:
N = self.sum_weight
else:
N = <DTYPE_t> self.data.shape[0]
cdef ITYPE_t n_features = self.data.shape[1]

cdef NodeData_t node_info
@@ -2163,7 +2189,11 @@ cdef class BinaryTree:
i_node = nodeheap_item.i1

node_info = node_data[i_node]
N1 = node_info.idx_end - node_info.idx_start
if with_sample_weight:
N1 = _total_node_weight(node_data, sample_weight,
idx_array, i_node)
else:
N1 = node_info.idx_end - node_info.idx_start

#------------------------------------------------------------
# Case 1: local bounds are equal to within per-point tolerance.
@@ -2192,17 +2222,27 @@ cdef class BinaryTree:
dist_pt = self.dist(pt, data + n_features * idx_array[i],
n_features)
log_density = compute_log_kernel(dist_pt, h, kernel)
if with_sample_weight:
log_weight = np.log(sample_weight[idx_array[i]])
else:
log_weight = 0.
global_log_min_bound = logaddexp(global_log_min_bound,
log_density)
log_density + log_weight)

#------------------------------------------------------------
# Case 4: split node and query subnodes
else:
i1 = 2 * i_node + 1
i2 = 2 * i_node + 2

N1 = node_data[i1].idx_end - node_data[i1].idx_start
N2 = node_data[i2].idx_end - node_data[i2].idx_start
if with_sample_weight:
N1 = _total_node_weight(node_data, sample_weight,
idx_array, i1)
N2 = _total_node_weight(node_data, sample_weight,
idx_array, i2)
else:
N1 = node_data[i1].idx_end - node_data[i1].idx_start
N2 = node_data[i2].idx_end - node_data[i2].idx_start

min_max_dist(self, i1, pt, &dist_LB_1, &dist_UB_1)
min_max_dist(self, i2, pt, &dist_LB_2, &dist_UB_2)
@@ -2264,9 +2304,16 @@ cdef class BinaryTree:
# global_min_bound and global_max_bound give the minimum and maximum
# density over the entire tree. We recurse down until global_min_bound
# and global_max_bound are within rtol and atol.
cdef ITYPE_t i, i1, i2, N1, N2
cdef ITYPE_t i, i1, i2, iw, start, end
cdef DTYPE_t N1, N2

cdef DTYPE_t* data = &self.data[0, 0]
cdef NodeData_t* node_data = &self.node_data[0]
cdef bint with_sample_weight = self.sample_weight is not None
cdef DTYPE_t* sample_weight
cdef DTYPE_t log_weight
if with_sample_weight:
sample_weight = &self.sample_weight[0]
cdef ITYPE_t* idx_array = &self.idx_array[0]
cdef ITYPE_t n_features = self.data.shape[1]

@@ -2277,8 +2324,13 @@ cdef class BinaryTree:
cdef DTYPE_t child1_log_bound_spread, child2_log_bound_spread
cdef DTYPE_t dist_UB = 0, dist_LB = 0

N1 = node_info.idx_end - node_info.idx_start
N2 = self.data.shape[0]
if with_sample_weight:
N1 = _total_node_weight(node_data, sample_weight,
idx_array, i_node)
N2 = self.sum_weight
else:
N1 = <DTYPE_t>(node_info.idx_end - node_info.idx_start)
N2 = <DTYPE_t>self.data.shape[0]

#------------------------------------------------------------
# Case 1: local bounds are equal to within errors. Return
@@ -2305,17 +2357,28 @@ cdef class BinaryTree:
dist_pt = self.dist(pt, (data + n_features * idx_array[i]),
n_features)
log_dens_contribution = compute_log_kernel(dist_pt, h, kernel)
if with_sample_weight:
log_weight = np.log(sample_weight[idx_array[i]])
else:
log_weight = 0.
global_log_min_bound[0] = logaddexp(global_log_min_bound[0],
log_dens_contribution)
(log_dens_contribution +
log_weight))

#------------------------------------------------------------
# Case 4: split node and query subnodes
else:
i1 = 2 * i_node + 1
i2 = 2 * i_node + 2

N1 = self.node_data[i1].idx_end - self.node_data[i1].idx_start
N2 = self.node_data[i2].idx_end - self.node_data[i2].idx_start
if with_sample_weight:
N1 = _total_node_weight(node_data, sample_weight,
idx_array, i1)
N2 = _total_node_weight(node_data, sample_weight,
idx_array, i2)
else:
N1 = <DTYPE_t>(self.node_data[i1].idx_end - self.node_data[i1].idx_start)
N2 = <DTYPE_t>(self.node_data[i2].idx_end - self.node_data[i2].idx_start)

min_max_dist(self, i1, pt, &dist_LB, &dist_UB)
child1_log_min_bound = log(N1) + compute_log_kernel(dist_UB, h,
@@ -2540,3 +2603,13 @@ cdef inline double fmin(double a, double b):

cdef inline double fmax(double a, double b) nogil:
return max(a, b)

cdef inline DTYPE_t _total_node_weight(NodeData_t* node_data,
DTYPE_t* sample_weight,
ITYPE_t* idx_array,
ITYPE_t i_node):
cdef ITYPE_t i
cdef DTYPE_t N = 0.0
for i in range(node_data[i_node].idx_start, node_data[i_node].idx_end):
N += sample_weight[idx_array[i]]
return N
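The helper simply sums the weights of the points a node owns; an illustrative NumPy equivalent, with the node bounds passed explicitly rather than through NodeData_t:

    import numpy as np

    def total_node_weight(sample_weight, idx_array, idx_start, idx_end):
        # total weight of all points assigned to one node
        return np.sum(sample_weight[idx_array[idx_start:idx_end]])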
33 changes: 28 additions & 5 deletions sklearn/neighbors/kde.py
Expand Up @@ -7,7 +7,8 @@
import numpy as np
from scipy.special import gammainc
from ..base import BaseEstimator
from ..utils import check_array, check_random_state
from ..utils import check_array, check_random_state, check_consistent_length

from ..utils.extmath import row_norms
from .ball_tree import BallTree, DTYPE
from .kd_tree import KDTree
@@ -112,23 +113,37 @@ def _choose_algorithm(self, algorithm, metric):
else:
raise ValueError("invalid algorithm: '{0}'".format(algorithm))

def fit(self, X, y=None):
def fit(self, X, y=None, sample_weight=None):
"""Fit the Kernel Density model on the data.

Parameters
----------
X : array_like, shape (n_samples, n_features)
List of n_features-dimensional data points. Each row
corresponds to a single data point.
sample_weight : array_like, shape (n_samples,), optional
List of sample weights attached to the data X.
"""
algorithm = self._choose_algorithm(self.algorithm, self.metric)
X = check_array(X, order='C', dtype=DTYPE)

if sample_weight is not None:
sample_weight = check_array(sample_weight, order='C', dtype=DTYPE,
ensure_2d=False)
if sample_weight.ndim != 1:
raise ValueError("the shape of sample_weight must be ({0},),"
" but was {1}".format(X.shape[0],
sample_weight.shape))
check_consistent_length(X, sample_weight)
if sample_weight.min() <= 0:
raise ValueError("sample_weight must have positive values")
Review comment (Member): This is not covered by the tests.

Reply (Contributor Author): Good catch! Added a test for that as well.
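The check is exercised by a test along these lines (a sketch; the actual test added in commit 3014ecd may differ in detail):

    import numpy as np
    import pytest
    from sklearn.neighbors import KernelDensity

    def test_kde_rejects_nonpositive_sample_weight():
        X = np.random.RandomState(0).rand(10, 2)
        kde = KernelDensity()
        # both negative and all-zero weights must fail at fit time
        for bad in (-np.ones(10), np.zeros(10)):
            with pytest.raises(ValueError):
                kde.fit(X, sample_weight=bad)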


kwargs = self.metric_params
if kwargs is None:
kwargs = {}
self.tree_ = TREE_DICT[algorithm](X, metric=self.metric,
leaf_size=self.leaf_size,
sample_weight=sample_weight,
**kwargs)
return self

@@ -150,7 +165,10 @@ def score_samples(self, X):
# For it to be a probability, we must scale it. For this reason
# we'll also scale atol.
X = check_array(X, order='C', dtype=DTYPE)
N = self.tree_.data.shape[0]
if self.tree_.sample_weight is None:
N = self.tree_.data.shape[0]
else:
N = self.tree_.sum_weight
atol_N = self.atol * N
log_density = self.tree_.kernel_density(
X, h=self.bandwidth, kernel=self.kernel, atol=atol_N,
@@ -202,8 +220,13 @@ def sample(self, n_samples=1, random_state=None):
data = np.asarray(self.tree_.data)

rng = check_random_state(random_state)
i = rng.randint(data.shape[0], size=n_samples)

u = rng.uniform(0, 1, size=n_samples)
if self.tree_.sample_weight is None:
i = (u * data.shape[0]).astype(np.int64)
else:
cumsum_weight = np.cumsum(np.asarray(self.tree_.sample_weight))
sum_weight = cumsum_weight[-1]
i = np.searchsorted(cumsum_weight, u * sum_weight)
if self.kernel == 'gaussian':
return np.atleast_2d(rng.normal(data[i], self.bandwidth))

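The new selection logic is inverse-CDF sampling: draw u ~ Uniform(0, 1) and locate u * sum(w) in the cumulative weights. A standalone sketch of the same idea:

    import numpy as np

    def weighted_choice(weights, n_samples, rng):
        # draw indices with probability proportional to their weight
        cumsum_weight = np.cumsum(weights)
        u = rng.uniform(0, 1, size=n_samples)
        return np.searchsorted(cumsum_weight, u * cumsum_weight[-1])

With uniform weights this matches the (u * data.shape[0]).astype(np.int64) branch above, up to ties at exact integer values of u * n.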