[MRG+1] Add sample weights support to kernel density estimation (fix #4394) #10803

Merged on Jun 26, 2018. Changes shown from all 19 commits.

Commits
a78cfa6
Add sample weights support to kernel density estimation (fix #4394)
samronsin Mar 13, 2018
5615e51
Add tests for the equivalence of sample weights and repetition for in…
samronsin Mar 14, 2018
1b6dd8e
Fix integer division in new tests (Python 3)
samronsin Mar 14, 2018
92bd0f7
Add tests for neutrality of constant sample weights
samronsin Mar 18, 2018
276ff69
Add checks for sample_weight parameter (shape, positive values)
samronsin Mar 18, 2018
7a6113d
Add automatic conversion to ndarray for sample_weight
samronsin Mar 18, 2018
c2011a6
Add tests for sampling along with a minor change in the random genera…
samronsin Mar 26, 2018
ea60225
Address comments from review
samronsin Apr 14, 2018
38e4b0e
Make Travis happy
samronsin Apr 15, 2018
49f5a95
Introduce n_samples_test
samronsin Apr 16, 2018
2a26843
Use check_consistent_length to check length consistency
samronsin May 28, 2018
ab6d4ad
Define sum_weight in all cases and get rid of redundant variable Z
samronsin May 29, 2018
8ff81e5
Improve maintainability by simplifying branching at the cost of ~10-15% …
samronsin Jun 3, 2018
452333b
Add inline function to compute the total weight of a node
samronsin Jun 4, 2018
45c5289
Shrink size of tests (5x speedup)
samronsin Jun 4, 2018
0ac4bf3
Address misc suggestions
samronsin Jun 4, 2018
3014ecd
Add test for negative values of sample_weights
samronsin Jun 26, 2018
2b9c007
Update whats_new.rst
samronsin Jun 26, 2018
3aa1dee
Merge branch 'master' into sample-weights-in-KDE
jnothman Jun 26, 2018
5 changes: 5 additions & 0 deletions doc/whats_new/v0.20.rst
@@ -206,6 +206,11 @@ Classifiers and regressors
only require X to be an object with finite length or shape.
:issue:`9832` by :user:`Vrishank Bhardwaj <vrishank97>`.

- Add `sample_weight` parameter to the fit method of
  :class:`neighbors.KernelDensity` to enable weighting in kernel density
  estimation.
  :issue:`4394` by :user:`Samuel O. Ronsin <samronsin>`.

- :class:`neighbors.RadiusNeighborsRegressor` and
:class:`neighbors.RadiusNeighborsClassifier` are now
parallelized according to ``n_jobs`` regardless of ``algorithm``.
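For readers skimming the changelog, here is a minimal usage sketch of the new parameter (the toy data and bandwidth are made up for illustration; requires scikit-learn >= 0.20):

    import numpy as np
    from sklearn.neighbors import KernelDensity

    X = np.array([[0.0], [1.0], [2.0]])
    w = np.array([1.0, 3.0, 1.0])  # weight the middle point 3x

    kde = KernelDensity(kernel='gaussian', bandwidth=0.5)
    kde.fit(X, sample_weight=w)
    log_density = kde.score_samples(np.array([[1.0]]))

Fitting with weights behaves like repeating each point in proportion to its weight, which is exactly the equivalence the new tests assert (commit 5615e51).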
29 changes: 23 additions & 6 deletions sklearn/neighbors/ball_tree.pyx
Expand Up @@ -62,17 +62,34 @@ cdef int init_node(BinaryTree tree, ITYPE_t i_node,
cdef DTYPE_t* data = &tree.data[0, 0]
cdef DTYPE_t* centroid = &tree.node_bounds[0, i_node, 0]

cdef bint with_sample_weight = tree.sample_weight is not None
cdef DTYPE_t* sample_weight
cdef DTYPE_t sum_weight_node
if with_sample_weight:
sample_weight = &tree.sample_weight[0]

# determine Node centroid
for j in range(n_features):
centroid[j] = 0

for i in range(idx_start, idx_end):
this_pt = data + n_features * idx_array[i]
for j from 0 <= j < n_features:
centroid[j] += this_pt[j]
if with_sample_weight:
sum_weight_node = 0
for i in range(idx_start, idx_end):
sum_weight_node += sample_weight[idx_array[i]]
this_pt = data + n_features * idx_array[i]
for j from 0 <= j < n_features:
centroid[j] += this_pt[j] * sample_weight[idx_array[i]]

for j in range(n_features):
centroid[j] /= n_points
for j in range(n_features):
centroid[j] /= sum_weight_node
else:
for i in range(idx_start, idx_end):
this_pt = data + n_features * idx_array[i]
for j from 0 <= j < n_features:
centroid[j] += this_pt[j]

for j in range(n_features):
centroid[j] /= n_points

# determine Node radius
radius = 0
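The weighted branch above computes the node centroid as a weighted mean, sum(w_i * x_i) / sum(w_i), falling back to the plain mean when no weights are given. An illustrative NumPy equivalent (the Cython code works on raw pointers for speed):

    import numpy as np

    def node_centroid(points, weights=None):
        # points: (n_points, n_features) array of one node's points
        if weights is None:
            return points.mean(axis=0)
        # weighted mean; same as np.average(points, axis=0, weights=weights)
        return (weights[:, None] * points).sum(axis=0) / weights.sum()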
103 changes: 88 additions & 15 deletions sklearn/neighbors/binary_tree.pxi
Expand Up @@ -1008,11 +1008,14 @@ VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS)
cdef class BinaryTree:

cdef np.ndarray data_arr
cdef np.ndarray sample_weight_arr
cdef np.ndarray idx_array_arr
cdef np.ndarray node_data_arr
cdef np.ndarray node_bounds_arr

cdef readonly DTYPE_t[:, ::1] data
cdef readonly DTYPE_t[::1] sample_weight
cdef public DTYPE_t sum_weight
cdef public ITYPE_t[::1] idx_array
cdef public NodeData_t[::1] node_data
cdef public DTYPE_t[:, :, ::1] node_bounds
@@ -1036,11 +1039,13 @@ cdef class BinaryTree:
# errors and seg-faults in rare cases where __init__ is not called
def __cinit__(self):
self.data_arr = np.empty((1, 1), dtype=DTYPE, order='C')
self.sample_weight_arr = np.empty(1, dtype=DTYPE, order='C')
self.idx_array_arr = np.empty(1, dtype=ITYPE, order='C')
self.node_data_arr = np.empty(1, dtype=NodeData, order='C')
self.node_bounds_arr = np.empty((1, 1, 1), dtype=DTYPE)

self.data = get_memview_DTYPE_2D(self.data_arr)
self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr)
self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr)
self.node_data = get_memview_NodeData_1D(self.node_data_arr)
self.node_bounds = get_memview_DTYPE_3D(self.node_bounds_arr)
@@ -1057,10 +1062,11 @@ cdef class BinaryTree:
self.n_calls = 0

def __init__(self, data,
leaf_size=40, metric='minkowski', **kwargs):
leaf_size=40, metric='minkowski', sample_weight=None, **kwargs):
self.data_arr = np.asarray(data, dtype=DTYPE, order='C')
self.data = get_memview_DTYPE_2D(self.data_arr)


self.leaf_size = leaf_size
self.dist_metric = DistanceMetric.get_metric(metric, **kwargs)
self.euclidean = (self.dist_metric.__class__.__name__
@@ -1082,6 +1088,16 @@ cdef class BinaryTree:
n_samples = self.data.shape[0]
n_features = self.data.shape[1]


if sample_weight is not None:
self.sample_weight_arr = np.asarray(sample_weight, dtype=DTYPE, order='C')
self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr)
self.sum_weight = np.sum(self.sample_weight)
else:
self.sample_weight = None
self.sum_weight = <DTYPE_t> n_samples


# determine number of levels in the tree, and from this
# the number of nodes in the tree. This results in leaf nodes
# with numbers of points between leaf_size and 2 * leaf_size
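The collapsed lines below this comment size the tree; here is a sketch of the relationship the comment describes (the exact expression is an assumption, since those lines are folded in this diff):

    import numpy as np

    def tree_layout(n_samples, leaf_size=40):
        # enough levels that each leaf holds between leaf_size
        # and 2 * leaf_size points
        n_levels = int(np.log2(max(1, (n_samples - 1) / leaf_size)) + 1)
        n_nodes = 2 ** n_levels - 1  # nodes of a complete binary tree
        return n_levels, n_nodes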
@@ -1654,10 +1670,10 @@ cdef class BinaryTree:
for i in range(Xarr.shape[0]):
min_max_dist(self, 0, pt, &dist_LB, &dist_UB)
# compute max & min bounds on density within top node
log_min_bound = (log(n_samples) +
log_min_bound = (log(self.sum_weight) +
compute_log_kernel(dist_UB,
h_c, kernel_c))
log_max_bound = (log(n_samples) +
log_max_bound = (log(self.sum_weight) +
compute_log_kernel(dist_LB,
h_c, kernel_c))
log_bound_spread = logsubexp(log_max_bound, log_min_bound)
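Seeding the bounds with log(self.sum_weight) instead of log(n_samples) matches the weighted quantity the traversal brackets. A brute-force sketch of that quantity (Gaussian kernel assumed for concreteness; the real compute_log_kernel also covers the other kernels):

    import numpy as np
    from scipy.special import logsumexp

    def weighted_log_kde(query, data, weights, h):
        # log( sum_i w_i * K(dist(query, x_i); h) ), the unnormalized
        # weighted density that the tree traversal bounds
        d = np.sqrt(((data - query) ** 2).sum(axis=1))
        log_k = -0.5 * (d / h) ** 2  # unnormalized Gaussian log-kernel
        return logsumexp(log_k + np.log(weights))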
@@ -2124,14 +2140,24 @@ cdef class BinaryTree:
# keep track of the global bounds on density. The procedure here is
# to split nodes, updating these bounds, until the bounds are within
# atol & rtol.
cdef ITYPE_t i, i1, i2, N1, N2, i_node
cdef ITYPE_t i, i1, i2, i_node
cdef DTYPE_t N1, N2
cdef DTYPE_t global_log_min_bound, global_log_bound_spread
cdef DTYPE_t global_log_max_bound

cdef DTYPE_t* data = &self.data[0, 0]
cdef bint with_sample_weight = self.sample_weight is not None
cdef DTYPE_t* sample_weight
if with_sample_weight:
sample_weight = &self.sample_weight[0]
cdef ITYPE_t* idx_array = &self.idx_array[0]
cdef NodeData_t* node_data = &self.node_data[0]
cdef ITYPE_t N = self.data.shape[0]
cdef DTYPE_t N
cdef DTYPE_t log_weight
if with_sample_weight:
N = self.sum_weight
else:
N = <DTYPE_t> self.data.shape[0]
cdef ITYPE_t n_features = self.data.shape[1]

cdef NodeData_t node_info
@@ -2163,7 +2189,11 @@ cdef class BinaryTree:
i_node = nodeheap_item.i1

node_info = node_data[i_node]
N1 = node_info.idx_end - node_info.idx_start
if with_sample_weight:
N1 = _total_node_weight(node_data, sample_weight,
idx_array, i_node)
else:
N1 = node_info.idx_end - node_info.idx_start

#------------------------------------------------------------
# Case 1: local bounds are equal to within per-point tolerance.
@@ -2192,17 +2222,27 @@ cdef class BinaryTree:
dist_pt = self.dist(pt, data + n_features * idx_array[i],
n_features)
log_density = compute_log_kernel(dist_pt, h, kernel)
if with_sample_weight:
log_weight = np.log(sample_weight[idx_array[i]])
else:
log_weight = 0.
global_log_min_bound = logaddexp(global_log_min_bound,
log_density)
log_density + log_weight)

#------------------------------------------------------------
# Case 4: split node and query subnodes
else:
i1 = 2 * i_node + 1
i2 = 2 * i_node + 2

N1 = node_data[i1].idx_end - node_data[i1].idx_start
N2 = node_data[i2].idx_end - node_data[i2].idx_start
if with_sample_weight:
N1 = _total_node_weight(node_data, sample_weight,
idx_array, i1)
N2 = _total_node_weight(node_data, sample_weight,
idx_array, i2)
else:
N1 = node_data[i1].idx_end - node_data[i1].idx_start
N2 = node_data[i2].idx_end - node_data[i2].idx_start

min_max_dist(self, i1, pt, &dist_LB_1, &dist_UB_1)
min_max_dist(self, i2, pt, &dist_LB_2, &dist_UB_2)
@@ -2264,9 +2304,16 @@ cdef class BinaryTree:
# global_min_bound and global_max_bound give the minimum and maximum
# density over the entire tree. We recurse down until global_min_bound
# and global_max_bound are within rtol and atol.
cdef ITYPE_t i, i1, i2, N1, N2
cdef ITYPE_t i, i1, i2, iw, start, end
cdef DTYPE_t N1, N2

cdef DTYPE_t* data = &self.data[0, 0]
cdef NodeData_t* node_data = &self.node_data[0]
cdef bint with_sample_weight = self.sample_weight is not None
cdef DTYPE_t* sample_weight
cdef DTYPE_t log_weight
if with_sample_weight:
sample_weight = &self.sample_weight[0]
cdef ITYPE_t* idx_array = &self.idx_array[0]
cdef ITYPE_t n_features = self.data.shape[1]

@@ -2277,8 +2324,13 @@ cdef class BinaryTree:
cdef DTYPE_t child1_log_bound_spread, child2_log_bound_spread
cdef DTYPE_t dist_UB = 0, dist_LB = 0

N1 = node_info.idx_end - node_info.idx_start
N2 = self.data.shape[0]
if with_sample_weight:
N1 = _total_node_weight(node_data, sample_weight,
idx_array, i_node)
N2 = self.sum_weight
else:
N1 = <DTYPE_t>(node_info.idx_end - node_info.idx_start)
N2 = <DTYPE_t>self.data.shape[0]

#------------------------------------------------------------
# Case 1: local bounds are equal to within errors. Return
@@ -2305,17 +2357,28 @@ cdef class BinaryTree:
dist_pt = self.dist(pt, (data + n_features * idx_array[i]),
n_features)
log_dens_contribution = compute_log_kernel(dist_pt, h, kernel)
if with_sample_weight:
log_weight = np.log(sample_weight[idx_array[i]])
else:
log_weight = 0.
global_log_min_bound[0] = logaddexp(global_log_min_bound[0],
log_dens_contribution)
(log_dens_contribution +
log_weight))

#------------------------------------------------------------
# Case 4: split node and query subnodes
else:
i1 = 2 * i_node + 1
i2 = 2 * i_node + 2

N1 = self.node_data[i1].idx_end - self.node_data[i1].idx_start
N2 = self.node_data[i2].idx_end - self.node_data[i2].idx_start
if with_sample_weight:
N1 = _total_node_weight(node_data, sample_weight,
idx_array, i1)
N2 = _total_node_weight(node_data, sample_weight,
idx_array, i2)
else:
N1 = <DTYPE_t>(self.node_data[i1].idx_end - self.node_data[i1].idx_start)
N2 = <DTYPE_t>(self.node_data[i2].idx_end - self.node_data[i2].idx_start)

min_max_dist(self, i1, pt, &dist_LB, &dist_UB)
child1_log_min_bound = log(N1) + compute_log_kernel(dist_UB, h,
@@ -2540,3 +2603,13 @@ cdef inline double fmin(double a, double b):

cdef inline double fmax(double a, double b) nogil:
return max(a, b)

cdef inline DTYPE_t _total_node_weight(NodeData_t* node_data,
DTYPE_t* sample_weight,
ITYPE_t* idx_array,
ITYPE_t i_node):
cdef ITYPE_t i
cdef DTYPE_t N = 0.0
for i in range(node_data[i_node].idx_start, node_data[i_node].idx_end):
N += sample_weight[idx_array[i]]
return N
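The helper simply sums the weights of the points a node owns; an illustrative NumPy equivalent, with the node bounds passed explicitly rather than through NodeData_t:

    import numpy as np

    def total_node_weight(sample_weight, idx_array, idx_start, idx_end):
        # total weight of all points assigned to one node
        return np.sum(sample_weight[idx_array[idx_start:idx_end]])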
33 changes: 28 additions & 5 deletions sklearn/neighbors/kde.py
Expand Up @@ -7,7 +7,8 @@
import numpy as np
from scipy.special import gammainc
from ..base import BaseEstimator
from ..utils import check_array, check_random_state
from ..utils import check_array, check_random_state, check_consistent_length

from ..utils.extmath import row_norms
from .ball_tree import BallTree, DTYPE
from .kd_tree import KDTree
@@ -112,23 +113,37 @@ def _choose_algorithm(self, algorithm, metric):
else:
raise ValueError("invalid algorithm: '{0}'".format(algorithm))

def fit(self, X, y=None):
def fit(self, X, y=None, sample_weight=None):
"""Fit the Kernel Density model on the data.

Parameters
----------
X : array_like, shape (n_samples, n_features)
List of n_features-dimensional data points. Each row
corresponds to a single data point.
sample_weight : array_like, shape (n_samples,), optional
List of sample weights attached to the data X.
"""
algorithm = self._choose_algorithm(self.algorithm, self.metric)
X = check_array(X, order='C', dtype=DTYPE)

if sample_weight is not None:
sample_weight = check_array(sample_weight, order='C', dtype=DTYPE,
ensure_2d=False)
if sample_weight.ndim != 1:
raise ValueError("the shape of sample_weight must be ({0},),"
" but was {1}".format(X.shape[0],
sample_weight.shape))
check_consistent_length(X, sample_weight)
if sample_weight.min() <= 0:
raise ValueError("sample_weight must have positive values")
Review comment (Member): This is not covered by the tests.

Reply (Contributor Author): Good catch! Added a test for that as well.
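The check is exercised by a test along these lines (a sketch; the actual test added in commit 3014ecd may differ in detail):

    import numpy as np
    import pytest
    from sklearn.neighbors import KernelDensity

    def test_kde_rejects_nonpositive_sample_weight():
        X = np.random.RandomState(0).rand(10, 2)
        kde = KernelDensity()
        # both negative and all-zero weights must fail at fit time
        for bad in (-np.ones(10), np.zeros(10)):
            with pytest.raises(ValueError):
                kde.fit(X, sample_weight=bad)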


kwargs = self.metric_params
if kwargs is None:
kwargs = {}
self.tree_ = TREE_DICT[algorithm](X, metric=self.metric,
leaf_size=self.leaf_size,
sample_weight=sample_weight,
**kwargs)
return self

@@ -150,7 +165,10 @@ def score_samples(self, X):
# For it to be a probability, we must scale it. For this reason
# we'll also scale atol.
X = check_array(X, order='C', dtype=DTYPE)
N = self.tree_.data.shape[0]
if self.tree_.sample_weight is None:
N = self.tree_.data.shape[0]
else:
N = self.tree_.sum_weight
atol_N = self.atol * N
log_density = self.tree_.kernel_density(
X, h=self.bandwidth, kernel=self.kernel, atol=atol_N,
@@ -202,8 +220,13 @@ def sample(self, n_samples=1, random_state=None):
data = np.asarray(self.tree_.data)

rng = check_random_state(random_state)
i = rng.randint(data.shape[0], size=n_samples)

u = rng.uniform(0, 1, size=n_samples)
if self.tree_.sample_weight is None:
i = (u * data.shape[0]).astype(np.int64)
else:
cumsum_weight = np.cumsum(np.asarray(self.tree_.sample_weight))
sum_weight = cumsum_weight[-1]
i = np.searchsorted(cumsum_weight, u * sum_weight)
if self.kernel == 'gaussian':
return np.atleast_2d(rng.normal(data[i], self.bandwidth))

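The new selection logic is inverse-CDF sampling: draw u ~ Uniform(0, 1) and locate u * sum(w) in the cumulative weights. A standalone sketch of the same idea:

    import numpy as np

    def weighted_choice(weights, n_samples, rng):
        # draw indices with probability proportional to their weight
        cumsum_weight = np.cumsum(weights)
        u = rng.uniform(0, 1, size=n_samples)
        return np.searchsorted(cumsum_weight, u * cumsum_weight[-1])

With uniform weights this matches the (u * data.shape[0]).astype(np.int64) branch above, up to ties at exact integer values of u * n.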