From 75e886ea8a2660cab2ce1148781ef03adfa29a04 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 30 Aug 2016 16:35:59 +0200
Subject: [PATCH] Add stopping criteria

---
 doc/whats_new.rst                             |  1 +
 .../edited_nearest_neighbours.py              | 38 +++++++++++++++++--
 imblearn/under_sampling/tests/test_allknn.py  |  6 +--
 3 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 5db36a918..f0ec11c85 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -17,6 +17,7 @@ Bug fixes
 - Fixed a bug in :class:`under_sampling.NearMiss` which was not picking the right samples during under sampling for the method 3. By `Guillaume Lemaitre`_.
 - Fixed a bug in :class:`ensemble.EasyEnsemble`, correction of the `random_state` generation. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
+- Fixed a bug in :class:`under_sampling.AllKNN`, adding a stopping criterion to avoid the minority class becoming a majority class or a class disappearing entirely. By `Guillaume Lemaitre`_.
 
 New features
 ~~~~~~~~~~~~

diff --git a/imblearn/under_sampling/edited_nearest_neighbours.py b/imblearn/under_sampling/edited_nearest_neighbours.py
index 270daac67..b56954a01 100644
--- a/imblearn/under_sampling/edited_nearest_neighbours.py
+++ b/imblearn/under_sampling/edited_nearest_neighbours.py
@@ -537,10 +537,42 @@ def _sample(self, X, y):
             # updating ENN size_ngh
             self.enn_.size_ngh = curr_size_ngh
             if self.return_indices:
-                X_, y_, idx_ = self.enn_.fit_sample(X_, y_)
-                idx_under = idx_under[idx_]
+                X_enn, y_enn, idx_enn = self.enn_.fit_sample(X_, y_)
             else:
-                X_, y_ = self.enn_.fit_sample(X_, y_)
+                X_enn, y_enn = self.enn_.fit_sample(X_, y_)
+
+            # Check the stopping criterion:
+            # 1. the minority class becomes a majority class, i.e. another
+            #    class drops below the original minority class size
+            # 2. one of the classes disappears entirely
+            # Case 1
+            stats_enn = Counter(y_enn)
+            self.logger.debug('Current ENN stats: %s', stats_enn)
+            # Get the number of samples in the non-minority classes
+            count_non_min = np.array([val for key, val
+                                      in stats_enn.items()
+                                      if key != self.min_c_])
+            self.logger.debug('Number of samples in the non-minority'
+                              ' classes: %s', count_non_min)
+            # Check whether the minority class stopped being the minority
+            b_min_bec_maj = np.any(count_non_min <
+                                   self.stats_c_[self.min_c_])
+
+            # Case 2
+            b_remove_maj_class = (len(stats_enn) < len(self.stats_c_))
+
+            if b_min_bec_maj or b_remove_maj_class:
+                # Log the variables explaining why the algorithm stopped
+                self.logger.debug('AllKNN minority became majority: %s',
+                                  b_min_bec_maj)
+                self.logger.debug('AllKNN removed one class: %s',
+                                  b_remove_maj_class)
+                break
+
+            # Update the data for the next iteration
+            X_, y_ = X_enn, y_enn
+            if self.return_indices:
+                idx_under = idx_under[idx_enn]
 
         self.logger.info('Under-sampling performed: %s', Counter(y_))
 
diff --git a/imblearn/under_sampling/tests/test_allknn.py b/imblearn/under_sampling/tests/test_allknn.py
index 694df5e00..43ca64761 100644
--- a/imblearn/under_sampling/tests/test_allknn.py
+++ b/imblearn/under_sampling/tests/test_allknn.py
@@ -155,6 +155,6 @@ def test_multiclass_fit_sample():
 
     # Check the size of y
     count_y_res = Counter(y_resampled)
-    assert_equal(count_y_res[0], 341)
-    assert_equal(count_y_res[1], 2485)
-    assert_equal(count_y_res[2], 212)
+    assert_equal(count_y_res[0], 400)
+    assert_equal(count_y_res[1], 3600)
+    assert_equal(count_y_res[2], 1000)
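
A minimal standalone sketch of the stopping check introduced above, for trying the logic outside the class. It is not applied by this patch; `should_stop`, `original_counts`, `current_counts` and `minority_class` are hypothetical stand-ins for `self.stats_c_`, the per-iteration `Counter`, and `self.min_c_` used in `AllKNN._sample`.

    from collections import Counter

    import numpy as np


    def should_stop(original_counts, current_counts, minority_class):
        """Return True when the AllKNN-style cleaning loop should stop."""
        # Samples left in every class other than the original minority class
        count_non_min = np.array([val for key, val in current_counts.items()
                                  if key != minority_class])
        # Case 1: another class dropped below the original minority size,
        # i.e. the minority class would become a majority class
        b_min_bec_maj = np.any(count_non_min < original_counts[minority_class])
        # Case 2: at least one class disappeared from the cleaned target
        b_remove_class = len(current_counts) < len(original_counts)
        return bool(b_min_bec_maj or b_remove_class)


    # Toy example: class 2 fell below the original minority size (100), so stop
    original = Counter({0: 100, 1: 500, 2: 400})
    after_enn = Counter({0: 100, 1: 450, 2: 80})
    print(should_stop(original, after_enn, minority_class=0))  # True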