diff --git a/README.rst b/README.rst
index 56978c95b..b37b272a8 100644
--- a/README.rst
+++ b/README.rst
@@ -106,6 +106,7 @@ Below is a list of the methods currently implemented in this module.
     8. Edited Nearest Neighbours [6]_
     9. Instance Hardness Threshold [7]_
     10. Repeated Edited Nearest Neighbours [14]_
+    11. AllKNN [14]_

 * Over-sampling
     1. Random minority over-sampling with replacement
diff --git a/doc/todo.rst b/doc/todo.rst
index 3a3d4d5ba..fc15a23d8 100644
--- a/doc/todo.rst
+++ b/doc/todo.rst
@@ -12,9 +12,9 @@ Version 0.2
 New methods
 ~~~~~~~~~~~

-* AIIKNN_: Garcia, Salvador, et al. "Prototype selection for nearest neighbor classification: Taxonomy and empirical study." IEEE Transactions on Pattern Analysis and Machine Intelligence 34.3 (2012): 417-435.
+* SMOTEBagging_: Wang, Shuo, and Xin Yao. "Diversity analysis on imbalanced data sets by using ensemble models." Computational Intelligence and Data Mining, 2009. CIDM'09. IEEE Symposium on. IEEE, 2009.

-.. _AIIKNN: https://www.semanticscholar.org/paper/Prototype-Selection-for-Nearest-Neighbor-Garc%C3%ADa-Derrac/fbca1824c49e02da37e5e780eaf0ab6ddfaf5614/pdf
+.. _SMOTEBagging: http://pages.bangor.ac.uk/~mas00a/papers/jpjrcolkis15.pdf

 API improvements
 ~~~~~~~~~~~~~~~~
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 4841842a8..ba2a79064 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -14,6 +14,7 @@ Changelog

 - Added support for bumpversion.
 - Added doctest in the documentation.
+- Added the AllKNN under-sampling technique.

 .. _changes_0_1:
diff --git a/examples/under-sampling/plot_allknn.py b/examples/under-sampling/plot_allknn.py
new file mode 100644
index 000000000..8e7a0208e
--- /dev/null
+++ b/examples/under-sampling/plot_allknn.py
@@ -0,0 +1,92 @@
+"""
+==================================
+AllKNN
+==================================
+
+An illustration of the AllKNN method.
+ +""" + +print(__doc__) + +import matplotlib.pyplot as plt +import seaborn as sns +sns.set() + +# Define some color for the plotting +almost_black = '#262626' +palette = sns.color_palette() + +from sklearn.datasets import make_classification +from sklearn.decomposition import PCA + +from imblearn.under_sampling import EditedNearestNeighbours +from imblearn.under_sampling import RepeatedEditedNearestNeighbours +from imblearn.under_sampling import AllKNN + +# Generate the dataset +X, y = make_classification(n_classes=2, class_sep=1.25, weights=[0.3, 0.7], + n_informative=3, n_redundant=1, flip_y=0, + n_features=5, n_clusters_per_class=1, + n_samples=5000, random_state=10) + +# Instanciate a PCA object for the sake of easy visualisation +pca = PCA(n_components=2) +# Fit and transform x to visualise inside a 2D feature space +X_vis = pca.fit_transform(X) + +# Three subplots, unpack the axes array immediately +f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4) + +ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=.5, + edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) +ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=.5, + edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) +ax1.set_title('Original set') + +# Apply the ENN +print('ENN') +enn = EditedNearestNeighbours() +X_resampled, y_resampled = enn.fit_sample(X, y) +X_res_vis = pca.transform(X_resampled) +print('Reduced {:.2f}\%'.format(100 * (1 - float(len(X_resampled))/ len(X)))) + +ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], + label="Class #0", alpha=.5, edgecolor=almost_black, + facecolor=palette[0], linewidth=0.15) +ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1], + label="Class #1", alpha=.5, edgecolor=almost_black, + facecolor=palette[2], linewidth=0.15) +ax2.set_title('Edited nearest neighbours') + +# Apply the RENN +print('RENN') +renn = RepeatedEditedNearestNeighbours() +X_resampled, y_resampled = renn.fit_sample(X, y) +X_res_vis = pca.transform(X_resampled) +print('Reduced {:.2f}\%'.format(100 * (1 - float(len(X_resampled))/ len(X)))) + +ax3.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], + label="Class #0", alpha=.5, edgecolor=almost_black, + facecolor=palette[0], linewidth=0.15) +ax3.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1], + label="Class #1", alpha=.5, edgecolor=almost_black, + facecolor=palette[2], linewidth=0.15) +ax3.set_title('Repeated Edited nearest neighbours') + +# Apply the AllKNN +print('AllKNN') +allknn = AllKNN() +X_resampled, y_resampled = allknn.fit_sample(X, y) +X_res_vis = pca.transform(X_resampled) +print('Reduced {:.2f}\%'.format(100 * (1 - float(len(X_resampled))/ len(X)))) + +ax4.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], + label="Class #0", alpha=.5, edgecolor=almost_black, + facecolor=palette[0], linewidth=0.15) +ax4.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1], + label="Class #1", alpha=.5, edgecolor=almost_black, + facecolor=palette[2], linewidth=0.15) +ax4.set_title('AllKNN') + +plt.show() diff --git a/imblearn/under_sampling/__init__.py b/imblearn/under_sampling/__init__.py index d58c15d87..7fe922ddb 100644 --- a/imblearn/under_sampling/__init__.py +++ b/imblearn/under_sampling/__init__.py @@ -12,6 +12,7 @@ from .neighbourhood_cleaning_rule import NeighbourhoodCleaningRule from .edited_nearest_neighbours import EditedNearestNeighbours from .edited_nearest_neighbours import 
+from .edited_nearest_neighbours import AllKNN
 from .instance_hardness_threshold import InstanceHardnessThreshold

 __all__ = ['RandomUnderSampler',
@@ -23,4 +24,5 @@
            'NeighbourhoodCleaningRule',
            'EditedNearestNeighbours',
            'RepeatedEditedNearestNeighbours',
+           'AllKNN',
            'InstanceHardnessThreshold']
diff --git a/imblearn/under_sampling/edited_nearest_neighbours.py b/imblearn/under_sampling/edited_nearest_neighbours.py
index ddd14bc38..4eece471e 100644
--- a/imblearn/under_sampling/edited_nearest_neighbours.py
+++ b/imblearn/under_sampling/edited_nearest_neighbours.py
@@ -382,3 +382,173 @@ def _sample(self, X, y):
             return X_resampled, y_resampled, idx_under
         else:
             return X_resampled, y_resampled
+
+
+class AllKNN(SamplerMixin):
+    """Class to perform under-sampling based on the AllKNN method.
+
+    Parameters
+    ----------
+    return_indices : bool, optional (default=False)
+        Whether or not to return the indices of the samples which have
+        been selected.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by np.random.
+
+    size_ngh : int, optional (default=3)
+        Maximum size of the neighbourhood considered during the editing.
+        ENN is applied iteratively with the neighbourhood size growing
+        from 1 up to `size_ngh`.
+
+    kind_sel : str, optional (default='all')
+        Strategy to use in order to exclude samples.
+
+        - If 'all', all neighbours will have to agree with the samples of
+          interest to not be excluded.
+        - If 'mode', the majority vote of the neighbours will be used in
+          order to exclude a sample.
+
+    n_jobs : int, optional (default=-1)
+        The number of threads to open when it is possible.
+
+    Attributes
+    ----------
+    min_c_ : str or int
+        The identifier of the minority class.
+
+    maj_c_ : str or int
+        The identifier of the majority class.
+
+    stats_c_ : dict of str/int : int
+        A dictionary in which the number of occurrences of each class is
+        reported.
+
+    X_shape_ : tuple of int
+        Shape of the data `X` during fitting.
+
+    Notes
+    -----
+    The method is based on [1]_.
+
+    This class supports multi-class resampling.
+
+    Examples
+    --------
+
+    >>> from collections import Counter
+    >>> from sklearn.datasets import make_classification
+    >>> from imblearn.under_sampling import AllKNN
+    >>> X, y = make_classification(n_classes=2, class_sep=2,
+    ...                            weights=[0.1, 0.9], n_informative=3,
+    ...                            n_redundant=1, flip_y=0, n_features=20,
+    ...                            n_clusters_per_class=1, n_samples=1000,
+    ...                            random_state=10)
+    >>> print('Original dataset shape {}'.format(Counter(y)))
+    Original dataset shape Counter({1: 900, 0: 100})
+    >>> allknn = AllKNN(random_state=42)
+    >>> X_res, y_res = allknn.fit_sample(X, y)
+    >>> print('Resampled dataset shape {}'.format(Counter(y_res)))
+    Resampled dataset shape Counter({1: 883, 0: 100})
+
+    References
+    ----------
+    .. [1] I. Tomek, "An Experiment with the Edited Nearest-Neighbor
+       Rule," IEEE Transactions on Systems, Man, and Cybernetics, vol. 6(6),
+       pp. 448-452, June 1976.
+ + """ + + def __init__(self, return_indices=False, random_state=None, + size_ngh=3, kind_sel='all', n_jobs=-1): + super(AllKNN, self).__init__() + self.return_indices = return_indices + self.random_state = random_state + self.size_ngh = size_ngh + self.kind_sel = kind_sel + self.n_jobs = n_jobs + self.enn_ = EditedNearestNeighbours( + return_indices=self.return_indices, + random_state=self.random_state, + size_ngh=self.size_ngh, + kind_sel=self.kind_sel, + n_jobs=self.n_jobs) + + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + super(AllKNN, self).fit(X, y) + self.enn_.fit(X, y) + + return self + + def _sample(self, X, y): + """Resample the dataset. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + X_resampled : ndarray, shape (n_samples_new, n_features) + The array containing the resampled data. + + y_resampled : ndarray, shape (n_samples_new) + The corresponding label of `X_resampled` + + idx_under : ndarray, shape (n_samples, ) + If `return_indices` is `True`, a boolean array will be returned + containing the which samples have been selected. + + """ + + if self.kind_sel not in SEL_KIND: + raise NotImplementedError + + X_, y_ = X, y + + if self.return_indices: + idx_under = np.arange(X.shape[0], dtype=int) + + prev_len = y.shape[0] + + for curr_size_ngh in range(1, self.size_ngh + 1): + self.logger.debug('Apply ENN size_ngh #%s', curr_size_ngh) + # updating ENN size_ngh + self.enn_.size_ngh = curr_size_ngh + if self.return_indices: + X_, y_, idx_ = self.enn_.fit_sample(X_, y_) + idx_under = idx_under[idx_] + else: + X_, y_ = self.enn_.fit_sample(X_, y_) + + self.logger.info('Under-sampling performed: %s', Counter(y_)) + + X_resampled, y_resampled = X_, y_ + + # Check if the indices of the samples selected should be returned too + if self.return_indices: + # Return the indices of interest + return X_resampled, y_resampled, idx_under + else: + return X_resampled, y_resampled diff --git a/imblearn/under_sampling/tests/data/allknn_idx.npy b/imblearn/under_sampling/tests/data/allknn_idx.npy new file mode 100644 index 000000000..ca1d38541 Binary files /dev/null and b/imblearn/under_sampling/tests/data/allknn_idx.npy differ diff --git a/imblearn/under_sampling/tests/data/allknn_x.npy b/imblearn/under_sampling/tests/data/allknn_x.npy new file mode 100644 index 000000000..9580fb257 Binary files /dev/null and b/imblearn/under_sampling/tests/data/allknn_x.npy differ diff --git a/imblearn/under_sampling/tests/data/allknn_x_mode.npy b/imblearn/under_sampling/tests/data/allknn_x_mode.npy new file mode 100644 index 000000000..8e958558b Binary files /dev/null and b/imblearn/under_sampling/tests/data/allknn_x_mode.npy differ diff --git a/imblearn/under_sampling/tests/data/allknn_y.npy b/imblearn/under_sampling/tests/data/allknn_y.npy new file mode 100644 index 000000000..f8168fcae Binary files /dev/null and b/imblearn/under_sampling/tests/data/allknn_y.npy differ diff --git a/imblearn/under_sampling/tests/data/allknn_y_mode.npy b/imblearn/under_sampling/tests/data/allknn_y_mode.npy new file mode 100644 index 
Binary files /dev/null and b/imblearn/under_sampling/tests/data/allknn_y_mode.npy differ
diff --git a/imblearn/under_sampling/tests/test_allknn.py b/imblearn/under_sampling/tests/test_allknn.py
new file mode 100644
index 000000000..673b222ec
--- /dev/null
+++ b/imblearn/under_sampling/tests/test_allknn.py
@@ -0,0 +1,130 @@
+"""Test the module AllKNN."""
+from __future__ import print_function
+
+import os
+
+import numpy as np
+from numpy.testing import assert_raises
+from numpy.testing import assert_equal
+from numpy.testing import assert_array_equal
+from numpy.testing import assert_array_almost_equal
+from numpy.testing import assert_warns
+
+from sklearn.datasets import make_classification
+from sklearn.utils.estimator_checks import check_estimator
+
+from imblearn.under_sampling import AllKNN
+
+# Generate a global dataset to use
+RND_SEED = 0
+X, Y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
+                           n_informative=3, n_redundant=1, flip_y=0,
+                           n_features=20, n_clusters_per_class=1,
+                           n_samples=5000, random_state=RND_SEED)
+
+
+def test_allknn_sk_estimator():
+    """Test the sklearn estimator compatibility."""
+    check_estimator(AllKNN)
+
+
+def test_allknn_init():
+    """Test the initialisation of the object."""
+
+    # Create the object
+    allknn = AllKNN(random_state=RND_SEED)
+
+    assert_equal(allknn.size_ngh, 3)
+    assert_equal(allknn.kind_sel, 'all')
+    assert_equal(allknn.n_jobs, -1)
+    assert_equal(allknn.random_state, RND_SEED)
+
+
+def test_allknn_fit_single_class():
+    """Test that a warning is raised when fitting with a single class."""
+
+    # Create the object
+    allknn = AllKNN(random_state=RND_SEED)
+    # Create a target containing a single class
+    y_single_class = np.zeros((X.shape[0], ))
+    assert_warns(RuntimeWarning, allknn.fit, X, y_single_class)
+
+
+def test_allknn_fit():
+    """Test the fitting method."""
+
+    # Create the object
+    allknn = AllKNN(random_state=RND_SEED)
+    # Fit the data
+    allknn.fit(X, Y)
+
+    # Check if the data information have been computed
+    assert_equal(allknn.min_c_, 0)
+    assert_equal(allknn.maj_c_, 1)
+    assert_equal(allknn.stats_c_[0], 500)
+    assert_equal(allknn.stats_c_[1], 4500)
+
+
+def test_allknn_sample_wt_fit():
+    """Test that an error is raised when sample is called before fitting."""
+
+    # Create the object
+    allknn = AllKNN(random_state=RND_SEED)
+    assert_raises(RuntimeError, allknn.sample, X, Y)
+
+
+def test_allknn_fit_sample():
+    """Test the fit_sample routine."""
+
+    # Resample the data
+    allknn = AllKNN(random_state=RND_SEED)
+    X_resampled, y_resampled = allknn.fit_sample(X, Y)
+
+    currdir = os.path.dirname(os.path.abspath(__file__))
+    X_gt = np.load(os.path.join(currdir, 'data', 'allknn_x.npy'))
+    y_gt = np.load(os.path.join(currdir, 'data', 'allknn_y.npy'))
+    assert_array_almost_equal(X_resampled, X_gt)
+    assert_array_almost_equal(y_resampled, y_gt)
+
+
+def test_allknn_fit_sample_with_indices():
+    """Test the fit_sample routine with indices support."""
+
+    # Resample the data
+    allknn = AllKNN(return_indices=True, random_state=RND_SEED)
+    X_resampled, y_resampled, idx_under = allknn.fit_sample(X, Y)
+
+    currdir = os.path.dirname(os.path.abspath(__file__))
+    X_gt = np.load(os.path.join(currdir, 'data', 'allknn_x.npy'))
+    y_gt = np.load(os.path.join(currdir, 'data', 'allknn_y.npy'))
+    idx_gt = np.load(os.path.join(currdir, 'data', 'allknn_idx.npy'))
+    assert_array_almost_equal(X_resampled, X_gt)
+    assert_array_almost_equal(y_resampled, y_gt)
+    assert_array_almost_equal(idx_under, idx_gt)
+
+
+def test_allknn_fit_sample_mode():
+    """Test the fit_sample routine using the mode as selection strategy."""
+
+    # Resample the data
+    allknn = AllKNN(random_state=RND_SEED, kind_sel='mode')
+    X_resampled, y_resampled = allknn.fit_sample(X, Y)
+
+    currdir = os.path.dirname(os.path.abspath(__file__))
+    X_gt = np.load(os.path.join(currdir, 'data', 'allknn_x_mode.npy'))
+    y_gt = np.load(os.path.join(currdir, 'data', 'allknn_y_mode.npy'))
+    assert_array_equal(X_resampled, X_gt)
+    assert_array_almost_equal(y_resampled, y_gt)
+
+
+def test_allknn_sample_wrong_X():
+    """Test that an error is raised when X at sampling differs from X at
+    fitting."""
+
+    # Create the object
+    allknn = AllKNN(random_state=RND_SEED)
+    allknn.fit(X, Y)
+    assert_raises(RuntimeError, allknn.sample, np.random.random((100, 40)),
+                  np.array([0] * 50 + [1] * 50))
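
Note on the patch above: the editing rule that `AllKNN` applies can be summarised outside the package as repeated edited-nearest-neighbours passes with a growing neighbourhood size. The following is a minimal sketch of that idea for the two-class, `kind_sel='all'` case; `allknn_sketch` and `minority_class` are hypothetical names introduced here for illustration, and this is not the implementation added in this patch (which reuses `EditedNearestNeighbours` internally).

# Illustrative sketch of the AllKNN editing rule, not the patched code.
import numpy as np
from collections import Counter

from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors


def allknn_sketch(X, y, size_ngh=3, minority_class=0):
    """Remove majority samples whose k nearest neighbours disagree with
    their label, for every neighbourhood size k from 1 to `size_ngh`."""
    for k in range(1, size_ngh + 1):
        # Ask for k + 1 neighbours since each sample is its own closest one
        nn = NearestNeighbors(n_neighbors=k + 1).fit(X)
        neigh_idx = nn.kneighbors(X, return_distance=False)[:, 1:]
        # 'all' strategy: keep a sample only if every neighbour shares its
        # label; minority samples are never removed
        agree = np.all(y[neigh_idx] == y[:, None], axis=1)
        keep = (y == minority_class) | agree
        X, y = X[keep], y[keep]
    return X, y


X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=1000, random_state=10)
X_res, y_res = allknn_sketch(X, y)
print(Counter(y))      # class balance before editing
print(Counter(y_res))  # majority class pruned, minority class untouched

As in the patched `_sample`, each pass re-applies the editing on the already reduced set, so later passes with larger neighbourhoods see a progressively cleaner decision boundary.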