-
Notifications
You must be signed in to change notification settings - Fork 1.3k
/
_neighbourhood_cleaning_rule.py
201 lines (165 loc) · 7.05 KB
/
_neighbourhood_cleaning_rule.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
"""Class performing under-sampling based on the neighbourhood cleaning rule."""
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# Christos Aridas
# License: MIT
from collections import Counter
import numpy as np
from scipy.stats import mode
from sklearn.utils import _safe_indexing
from ..base import BaseCleaningSampler
from ._edited_nearest_neighbours import EditedNearestNeighbours
from ...utils import check_neighbors_object
from ...utils import Substitution
from ...utils._docstring import _n_jobs_docstring
from ...utils._validation import _deprecate_positional_args
SEL_KIND = ("all", "mode")
@Substitution(
    sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
)
class NeighbourhoodCleaningRule(BaseCleaningSampler):
    """Undersample based on the neighbourhood cleaning rule.

    This class uses ENN and a k-NN to remove noisy samples from the dataset.

    Read more in the :ref:`User Guide <condensed_nearest_neighbors>`.

    Parameters
    ----------
    {sampling_strategy}

    n_neighbors : int or estimator object, default=3
        If ``int``, size of the neighbourhood to consider to compute the
        nearest neighbors. If object, an estimator that inherits from
        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the nearest-neighbors. By default, it explores the 3 closest
        neighbors.

    kind_sel : {{"all", "mode"}}, default='all'
        Strategy to use in order to exclude samples in the ENN sampling.

        - If ``'all'``, all neighbours will have to agree with a sample in
          order not to be excluded.
        - If ``'mode'``, the majority of the neighbours will have to agree
          with a sample in order not to be excluded.

        The strategy `"all"` will be less conservative than `'mode'`. Thus,
        more samples will be removed when `kind_sel="all"`, generally.

        Note that this parameter only applies to the cleaning step of the
        NCL. The ENN step is always done using majority vote
        (``kind_sel="mode"``), as described in the original article.

    threshold_cleaning : float, default=0.5
        Threshold used to decide whether to consider a class during the
        cleaning step after applying ENN. A class will be considered during
        cleaning when::

            Ci > C x T

        where ``Ci`` and ``C`` are the number of samples in the class and in
        the whole dataset, respectively, and ``T`` is the threshold.

        .. note::
           This is not exactly the threshold described in the original
           article; refer to [1]_ for the originally proposed rule.

    {n_jobs}

    Attributes
    ----------
    sample_indices_ : ndarray of shape (n_new_samples,)
        Indices of the samples selected.

        .. versionadded:: 0.4

    See Also
    --------
    EditedNearestNeighbours : Undersample by editing noisy samples.

    Notes
    -----
    See the original paper: [1]_.

    Supports multi-class resampling. A one-vs.-rest scheme is used when
    sampling a class as proposed in [1]_.

    References
    ----------
    .. [1] J. Laurikkala, "Improving identification of difficult small classes
       by balancing class distribution," Springer Berlin Heidelberg, 2001.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
NeighbourhoodCleaningRule # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> ncr = NeighbourhoodCleaningRule()
    >>> X_res, y_res = ncr.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{1: 877, 0: 100}})
    """

    @_deprecate_positional_args
    def __init__(
        self,
        *,
        sampling_strategy="auto",
        n_neighbors=3,
        kind_sel="all",
        threshold_cleaning=0.5,
        n_jobs=None,
    ):
        super().__init__(sampling_strategy=sampling_strategy)
        self.n_neighbors = n_neighbors
        self.kind_sel = kind_sel
        self.threshold_cleaning = threshold_cleaning
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Create the objects required by NCR.

        Raises
        ------
        NotImplementedError
            If ``kind_sel`` is not one of ``SEL_KIND``.
        ValueError
            If ``threshold_cleaning`` is outside ``[0, 1]``.
        """
        # additional_neighbor=1 compensates for `kneighbors` returning each
        # query sample as its own first neighbor (dropped later with [:, 1:]).
        self.nn_ = check_neighbors_object(
            "n_neighbors", self.n_neighbors, additional_neighbor=1
        )
        self.nn_.set_params(**{"n_jobs": self.n_jobs})
        if self.kind_sel not in SEL_KIND:
            raise NotImplementedError
        if self.threshold_cleaning > 1 or self.threshold_cleaning < 0:
            raise ValueError(
                f"'threshold_cleaning' is a value between 0 and 1."
                f" Got {self.threshold_cleaning} instead."
            )

    def _fit_resample(self, X, y):
        self._validate_estimator()

        # Step 1 (group A1): run ENN with majority vote and record which
        # samples it removes. Only `sample_indices_` is needed; the resampled
        # arrays returned by `fit_resample` are intentionally discarded.
        enn = EditedNearestNeighbours(
            sampling_strategy=self.sampling_strategy,
            n_neighbors=self.n_neighbors,
            kind_sel="mode",
            n_jobs=self.n_jobs,
        )
        enn.fit_resample(X, y)
        index_not_a1 = enn.sample_indices_
        index_a1 = np.ones(y.shape, dtype=bool)
        index_a1[index_not_a1] = False
        index_a1 = np.flatnonzero(index_a1)

        # Step 2 (group A2): clean the neighborhood of the minority class.
        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)
        # Only classes that are both targeted by the sampling strategy and
        # large enough (Ci > C * T) are considered for cleaning.
        classes_under_sample = [
            c
            for c, n_samples in target_stats.items()
            if (
                c in self.sampling_strategy_.keys()
                and (n_samples > X.shape[0] * self.threshold_cleaning)
            )
        ]
        self.nn_.fit(X)
        class_minority_indices = np.flatnonzero(y == class_minority)
        X_class = _safe_indexing(X, class_minority_indices)
        y_class = _safe_indexing(y, class_minority_indices)
        # Drop the first neighbor column: it is the query sample itself.
        nnhood_idx = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]
        nnhood_label = y[nnhood_idx]
        if self.kind_sel == "mode":
            nnhood_label_majority, _ = mode(nnhood_label, axis=1)
            nnhood_bool = np.ravel(nnhood_label_majority) == y_class
        elif self.kind_sel == "all":
            # BUGFIX: `np.all` was previously applied to the raw labels
            # (`nnhood_label`), which tests label *truthiness* instead of
            # whether every neighbor belongs to the minority class, leaving
            # `nnhood_label_majority` computed but unused.
            nnhood_label_majority = nnhood_label == class_minority
            nnhood_bool = np.all(nnhood_label_majority, axis=1)
        else:
            raise NotImplementedError
        # Neighbors that disagree with a minority sample are candidates for
        # removal, kept only if they belong to a class selected above.
        index_a2 = np.ravel(nnhood_idx[~nnhood_bool])
        index_a2 = np.unique(
            [index for index in index_a2 if y[index] in classes_under_sample]
        )

        # Keep every sample that falls in neither A1 nor A2.
        union_a1_a2 = np.union1d(index_a1, index_a2).astype(int)
        selected_samples = np.ones(y.shape, dtype=bool)
        selected_samples[union_a1_a2] = False
        self.sample_indices_ = np.flatnonzero(selected_samples)

        return (
            _safe_indexing(X, self.sample_indices_),
            _safe_indexing(y, self.sample_indices_),
        )

    def _more_tags(self):
        return {"sample_indices": True}