From 53a6f05006b6889199162f47b28bacfe3ea86237 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 00:54:41 +0200 Subject: [PATCH 01/24] Update the changelog --- doc/whats_new.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index ba2a79064..0b87638eb 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -16,6 +16,10 @@ Changelog - Added doctest in the documentation. - Added AllKNN under sampling technique. +Enhancement +~~~~~~~~~~~ + +- Validate the type of target in binary samplers. A warning is raised for the moment. .. _changes_0_1: From 86f2c180e17eca1c65e333189a402a25c34f79a7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 00:54:54 +0200 Subject: [PATCH 02/24] Check multiclass for SMOTEENN --- imblearn/combine/smote_enn.py | 8 ++++++++ imblearn/combine/tests/test_smote_enn.py | 15 +++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py index 2f78666f2..257b79eed 100644 --- a/imblearn/combine/smote_enn.py +++ b/imblearn/combine/smote_enn.py @@ -2,6 +2,10 @@ from __future__ import print_function from __future__ import division +import warnings + +from sklearn.utils.multiclass import type_of_target + from ..over_sampling import SMOTE from ..under_sampling import EditedNearestNeighbours from ..base import SamplerMixin @@ -145,6 +149,10 @@ def fit(self, X, y): super(SMOTEENN, self).fit(X, y) + # Check that y is binary + if not type_of_target(y) == 'binary': + warnings.warn('The target type should be binary.') + # Fit using SMOTE self.sm.fit(X, y) diff --git a/imblearn/combine/tests/test_smote_enn.py b/imblearn/combine/tests/test_smote_enn.py index 9232b60a3..3102f9b31 100644 --- a/imblearn/combine/tests/test_smote_enn.py +++ b/imblearn/combine/tests/test_smote_enn.py @@ -131,3 +131,18 @@ def test_sample_wrong_X(): sm.fit(X, Y) assert_raises(RuntimeError, sm.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_senn_multiclass_error(): + """ Test either if an error is raised when the target are not binary + type. 
""" + + # continuous case + y = np.linspace(0, 1, 5000) + sm = SMOTEENN(random_state=RND_SEED) + assert_warns(UserWarning, sm.fit, X, y) + + # multiclass case + y = np.array([0] * 2000 + [1] * 2000 + [2] * 1000) + sm = SMOTEENN(random_state=RND_SEED) + assert_warns(UserWarning, sm.fit, X, y) From 447d9580f74c35a936c8a46326b93b64c3ca362b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 13:56:41 +0200 Subject: [PATCH 03/24] Check multiclass for SMOTETomek --- imblearn/combine/smote_tomek.py | 8 ++++++++ imblearn/combine/tests/test_smote_tomek.py | 15 +++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py index 9f0df0c46..591e87b5c 100644 --- a/imblearn/combine/smote_tomek.py +++ b/imblearn/combine/smote_tomek.py @@ -3,6 +3,10 @@ from __future__ import print_function from __future__ import division +import warnings + +from sklearn.utils.multiclass import type_of_target + from ..over_sampling import SMOTE from ..under_sampling import TomekLinks from ..base import SamplerMixin @@ -140,6 +144,10 @@ def fit(self, X, y): super(SMOTETomek, self).fit(X, y) + # Check that y is binary + if not type_of_target(y) == 'binary': + warnings.warn('The target type should be binary.') + # Fit using SMOTE self.sm.fit(X, y) diff --git a/imblearn/combine/tests/test_smote_tomek.py b/imblearn/combine/tests/test_smote_tomek.py index 252e510c8..010e3c712 100644 --- a/imblearn/combine/tests/test_smote_tomek.py +++ b/imblearn/combine/tests/test_smote_tomek.py @@ -131,3 +131,18 @@ def test_sample_wrong_X(): sm.fit(X, Y) assert_raises(RuntimeError, sm.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_multiclass_error(): + """ Test either if an error is raised when the target are not binary + type. """ + + # continuous case + y = np.linspace(0, 1, 5000) + sm = SMOTETomek(random_state=RND_SEED) + assert_warns(UserWarning, sm.fit, X, y) + + # multiclass case + y = np.array([0] * 2000 + [1] * 2000 + [2] * 1000) + sm = SMOTETomek(random_state=RND_SEED) + assert_warns(UserWarning, sm.fit, X, y) From 38290a1cef9d8bfd2b1a3fa5926727444e5aa9a2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 14:01:47 +0200 Subject: [PATCH 04/24] Check multiclass for BalanceCascade --- imblearn/ensemble/balance_cascade.py | 30 +++++++++++++++++++ .../ensemble/tests/test_balance_cascade.py | 15 ++++++++++ 2 files changed, 45 insertions(+) diff --git a/imblearn/ensemble/balance_cascade.py b/imblearn/ensemble/balance_cascade.py index 3bf2297d3..335d87589 100644 --- a/imblearn/ensemble/balance_cascade.py +++ b/imblearn/ensemble/balance_cascade.py @@ -1,8 +1,12 @@ """Class to perform under-sampling using balace cascade.""" from __future__ import print_function + +import warnings + import numpy as np +from sklearn.utils.multiclass import type_of_target from sklearn.utils import check_random_state from ..base import SamplerMixin @@ -111,6 +115,32 @@ def __init__(self, ratio='auto', return_indices=False, random_state=None, self.bootstrap = bootstrap self.kwargs = kwargs + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. 
+ + """ + + super(BalanceCascade, self).fit(X, y) + + # Check that y is binary + if not type_of_target(y) == 'binary': + warnings.warn('The target type should be binary.') + + return self + def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/ensemble/tests/test_balance_cascade.py b/imblearn/ensemble/tests/test_balance_cascade.py index c416ed3eb..12422b503 100644 --- a/imblearn/ensemble/tests/test_balance_cascade.py +++ b/imblearn/ensemble/tests/test_balance_cascade.py @@ -339,3 +339,18 @@ def test_sample_wrong_X(): bc.fit(X, Y) assert_raises(RuntimeError, bc.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_multiclass_error(): + """ Test either if an error is raised when the target are not binary + type. """ + + # continuous case + y = np.linspace(0, 1, 5000) + bc = BalanceCascade(random_state=RND_SEED) + assert_warns(UserWarning, bc.fit, X, y) + + # multiclass case + y = np.array([0] * 2000 + [1] * 2000 + [2] * 1000) + bc = BalanceCascade(random_state=RND_SEED) + assert_warns(UserWarning, bc.fit, X, y) From 9ceee1ac5d6e5ebfc7d60ce1971aba0a705641ae Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 14:04:56 +0200 Subject: [PATCH 05/24] Check multiclass for EasyEnsemble --- imblearn/ensemble/easy_ensemble.py | 30 +++++++++++++++++++ imblearn/ensemble/tests/test_easy_ensemble.py | 15 ++++++++++ 2 files changed, 45 insertions(+) diff --git a/imblearn/ensemble/easy_ensemble.py b/imblearn/ensemble/easy_ensemble.py index 6938d9a5d..fabe95241 100644 --- a/imblearn/ensemble/easy_ensemble.py +++ b/imblearn/ensemble/easy_ensemble.py @@ -1,8 +1,12 @@ """Class to perform under-sampling using easy ensemble.""" from __future__ import print_function +import warnings + import numpy as np +from sklearn.utils.multiclass import type_of_target + from ..base import SamplerMixin from ..under_sampling import RandomUnderSampler @@ -90,6 +94,32 @@ def __init__(self, ratio='auto', return_indices=False, self.replacement = replacement self.n_subsets = n_subsets + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + + super(EasyEnsemble, self).fit(X, y) + + # Check that y is binary + if not type_of_target(y) == 'binary': + warnings.warn('The target type should be binary.') + + return self + def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py b/imblearn/ensemble/tests/test_easy_ensemble.py index 763db4e65..e3cb0a9d8 100644 --- a/imblearn/ensemble/tests/test_easy_ensemble.py +++ b/imblearn/ensemble/tests/test_easy_ensemble.py @@ -170,3 +170,18 @@ def test_sample_wrong_X(): ee.fit(X, Y) assert_raises(RuntimeError, ee.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_multiclass_error(): + """ Test either if an error is raised when the target are not binary + type. 
""" + + # continuous case + y = np.linspace(0, 1, 5000) + ee = EasyEnsemble(random_state=RND_SEED) + assert_warns(UserWarning, ee.fit, X, y) + + # multiclass case + y = np.array([0] * 2000 + [1] * 2000 + [2] * 1000) + ee = EasyEnsemble(random_state=RND_SEED) + assert_warns(UserWarning, ee.fit, X, y) From 81c49783757e4bc33fb7a5c496fe8ee32363ce28 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 14:07:50 +0200 Subject: [PATCH 06/24] Check multiclass for ADASYN --- imblearn/ensemble/balance_cascade.py | 1 - imblearn/over_sampling/adasyn.py | 29 +++++++++++++++++++++ imblearn/over_sampling/tests/test_adasyn.py | 15 +++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/imblearn/ensemble/balance_cascade.py b/imblearn/ensemble/balance_cascade.py index 335d87589..b73bd40e2 100644 --- a/imblearn/ensemble/balance_cascade.py +++ b/imblearn/ensemble/balance_cascade.py @@ -1,7 +1,6 @@ """Class to perform under-sampling using balace cascade.""" from __future__ import print_function - import warnings import numpy as np diff --git a/imblearn/over_sampling/adasyn.py b/imblearn/over_sampling/adasyn.py index ac7b2f90c..ce9b14907 100644 --- a/imblearn/over_sampling/adasyn.py +++ b/imblearn/over_sampling/adasyn.py @@ -2,12 +2,15 @@ from __future__ import print_function from __future__ import division +import warnings + import numpy as np from collections import Counter from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state +from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -98,6 +101,32 @@ def __init__(self, self.nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1, n_jobs=self.n_jobs) + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + + super(ADASYN, self).fit(X, y) + + # Check that y is binary + if not type_of_target(y) == 'binary': + warnings.warn('The target type should be binary.') + + return self + def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py index 8976de402..0269f1b0b 100644 --- a/imblearn/over_sampling/tests/test_adasyn.py +++ b/imblearn/over_sampling/tests/test_adasyn.py @@ -146,3 +146,18 @@ def test_sample_wrong_X(): ada.fit(X, Y) assert_raises(RuntimeError, ada.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_multiclass_error(): + """ Test either if an error is raised when the target are not binary + type. 
""" + + # continuous case + y = np.linspace(0, 1, 5000) + ada = ADASYN(random_state=RND_SEED) + assert_warns(UserWarning, ada.fit, X, y) + + # multiclass case + y = np.array([0] * 2000 + [1] * 2000 + [2] * 1000) + ada = ADASYN(random_state=RND_SEED) + assert_warns(UserWarning, ada.fit, X, y) From c5d066ef5656c0ef05238946081707db6ea60ad1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 14:14:06 +0200 Subject: [PATCH 07/24] Check multiclass for SMOTE --- imblearn/over_sampling/smote.py | 29 ++++++++++++++++++++++ imblearn/over_sampling/tests/test_smote.py | 15 +++++++++++ 2 files changed, 44 insertions(+) diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index 5f9f12e85..57020fa3c 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -2,10 +2,13 @@ from __future__ import print_function from __future__ import division +import warnings + import numpy as np from sklearn.utils import check_array from sklearn.utils import check_random_state +from sklearn.utils.multiclass import type_of_target from sklearn.neighbors import NearestNeighbors from sklearn.svm import SVC @@ -124,6 +127,32 @@ def __init__(self, self.n_jobs = n_jobs self.kwargs = kwargs + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + + super(SMOTE, self).fit(X, y) + + # Check that y is binary + if not type_of_target(y) == 'binary': + warnings.warn('The target type should be binary.') + + return self + def _in_danger_noise(self, samples, y, kind='danger'): """Estimate if a set of sample are in danger or noise. diff --git a/imblearn/over_sampling/tests/test_smote.py b/imblearn/over_sampling/tests/test_smote.py index 7d79da459..505187f87 100644 --- a/imblearn/over_sampling/tests/test_smote.py +++ b/imblearn/over_sampling/tests/test_smote.py @@ -196,3 +196,18 @@ def test_sample_wrong_X(): sm.fit(X, Y) assert_raises(RuntimeError, sm.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_multiclass_error(): + """ Test either if an error is raised when the target are not binary + type. 
""" + + # continuous case + y = np.linspace(0, 1, 5000) + sm = SMOTE(random_state=RND_SEED) + assert_warns(UserWarning, sm.fit, X, y) + + # multiclass case + y = np.array([0] * 2000 + [1] * 2000 + [2] * 1000) + sm = SMOTE(random_state=RND_SEED) + assert_warns(UserWarning, sm.fit, X, y) From 28f1901c5a8b3ba4f9d622ecbc2cfec51e4e467e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 14:27:20 +0200 Subject: [PATCH 08/24] Check multiclass for RandomOverSampler --- imblearn/over_sampling/random_over_sampler.py | 30 +++++++++++++++++++ .../tests/test_random_over_sampler.py | 30 +++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/imblearn/over_sampling/random_over_sampler.py b/imblearn/over_sampling/random_over_sampler.py index 827a30755..eb998be87 100644 --- a/imblearn/over_sampling/random_over_sampler.py +++ b/imblearn/over_sampling/random_over_sampler.py @@ -2,11 +2,14 @@ from __future__ import print_function from __future__ import division +import warnings + import numpy as np from collections import Counter from sklearn.utils import check_random_state +from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -77,6 +80,33 @@ def __init__(self, super(RandomOverSampler, self).__init__(ratio=ratio) self.random_state = random_state + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + + super(RandomOverSampler, self).fit(X, y) + + # Check that y is binary + if not (type_of_target(y) == 'binary' or + type_of_target(y) == 'multiclass'): + warnings.warn('The target type should be binary or multiclass.') + + return self + def _sample(self, X, y): """Resample the dataset. 
diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index 0d2f0aac2..e56ce127f 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -12,6 +12,8 @@ from sklearn.datasets import make_classification from sklearn.utils.estimator_checks import check_estimator +from collections import Counter + from imblearn.over_sampling import RandomOverSampler # Generate a global dataset to use @@ -146,3 +148,31 @@ def test_sample_wrong_X(): ros.fit(X, Y) assert_raises(RuntimeError, ros.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_continuous_error(): + """Test either if an error is raised when the target are continuous + type""" + + # continuous case + y = np.linspace(0, 1, 5000) + ros = RandomOverSampler(random_state=RND_SEED) + assert_warns(UserWarning, ros.fit, X, y) + + +def test_multiclass_fit_sample(): + """Test fit sample method with multiclass target""" + + # Make y to be multiclass + y = Y.copy() + y[0:1000] = 2 + + # Resample the data + ros = RandomOverSampler(random_state=RND_SEED) + X_resampled, y_resampled = ros.fit_sample(X, y) + + # Check the size of y + count_y_res = Counter(y_resampled) + assert_equal(count_y_res[0], 3600) + assert_equal(count_y_res[1], 3600) + assert_equal(count_y_res[2], 3600) From 55d120933f47c4f7d3c570760ef70654abe30e17 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 14:32:37 +0200 Subject: [PATCH 09/24] Check multiclass for ClusteringCentroids --- imblearn/under_sampling/cluster_centroids.py | 30 ++++++++++++++ .../tests/test_cluster_centroids.py | 41 +++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/imblearn/under_sampling/cluster_centroids.py b/imblearn/under_sampling/cluster_centroids.py index 82b2fd1b8..d77132f75 100644 --- a/imblearn/under_sampling/cluster_centroids.py +++ b/imblearn/under_sampling/cluster_centroids.py @@ -3,12 +3,15 @@ from __future__ import print_function from __future__ import division +import warnings + import numpy as np from collections import Counter from sklearn.cluster import KMeans from sklearn.utils import check_random_state +from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -87,6 +90,33 @@ def __init__(self, ratio='auto', random_state=None, n_jobs=-1, **kwargs): self.n_jobs = n_jobs self.kwargs = kwargs + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + + super(ClusterCentroids, self).fit(X, y) + + # Check that y is binary + if not (type_of_target(y) == 'binary' or + type_of_target(y) == 'multiclass'): + warnings.warn('The target type should be binary or multiclass.') + + return self + def _sample(self, X, y): """Resample the dataset. 
diff --git a/imblearn/under_sampling/tests/test_cluster_centroids.py b/imblearn/under_sampling/tests/test_cluster_centroids.py index 44c4bf0dd..1b620d94a 100644 --- a/imblearn/under_sampling/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/tests/test_cluster_centroids.py @@ -12,6 +12,8 @@ from sklearn.datasets import make_classification from sklearn.utils.estimator_checks import check_estimator +from collections import Counter + from imblearn.under_sampling import ClusterCentroids # Generate a global dataset to use @@ -167,3 +169,42 @@ def test_fit_sample_half(): y_gt = np.load(os.path.join(currdir, 'data', 'cc_y_05.npy')) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) + + +def test_sample_wrong_X(): + """Test either if an error is raised when X is different at fitting + and sampling""" + + # Create the object + cc = ClusterCentroids(random_state=RND_SEED) + cc.fit(X, Y) + assert_raises(RuntimeError, cc.sample, np.random.random((100, 40)), + np.array([0] * 50 + [1] * 50)) + + +def test_continuous_error(): + """Test either if an error is raised when the target are continuous + type""" + + # continuous case + y = np.linspace(0, 1, 5000) + cc = ClusterCentroids(random_state=RND_SEED) + assert_warns(UserWarning, cc.fit, X, y) + + +def test_multiclass_fit_sample(): + """Test fit sample method with multiclass target""" + + # Make y to be multiclass + y = Y.copy() + y[0:1000] = 2 + + # Resample the data + cc = ClusterCentroids(random_state=RND_SEED) + X_resampled, y_resampled = cc.fit_sample(X, y) + + # Check the size of y + count_y_res = Counter(y_resampled) + assert_equal(count_y_res[0], 400) + assert_equal(count_y_res[1], 400) + assert_equal(count_y_res[2], 400) From 0b9b5ac8437e9f25f9aa5657e4b2163997789481 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 14:40:09 +0200 Subject: [PATCH 10/24] Check multiclass for CondensedNearestNeighbour --- .../condensed_nearest_neighbour.py | 30 +++++++++++++++++++ .../tests/test_condensed_nearest_neighbour.py | 30 +++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/imblearn/under_sampling/condensed_nearest_neighbour.py b/imblearn/under_sampling/condensed_nearest_neighbour.py index 3c9f19ea1..eaa171d6a 100644 --- a/imblearn/under_sampling/condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/condensed_nearest_neighbour.py @@ -3,11 +3,14 @@ from __future__ import print_function from __future__ import division +import warnings + import numpy as np from collections import Counter from sklearn.utils import check_random_state +from sklearn.utils.multiclass import type_of_target from sklearn.neighbors import KNeighborsClassifier from ..base import SamplerMixin @@ -98,6 +101,33 @@ def __init__(self, return_indices=False, random_state=None, self.n_jobs = n_jobs self.kwargs = kwargs + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + + super(CondensedNearestNeighbour, self).fit(X, y) + + # Check that y is binary + if not (type_of_target(y) == 'binary' or + type_of_target(y) == 'multiclass'): + warnings.warn('The target type should be binary or multiclass.') + + return self + def _sample(self, X, y): """Resample the dataset. 
diff --git a/imblearn/under_sampling/tests/test_condensed_nearest_neighbour.py b/imblearn/under_sampling/tests/test_condensed_nearest_neighbour.py index 3270148c3..887a47eda 100644 --- a/imblearn/under_sampling/tests/test_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/tests/test_condensed_nearest_neighbour.py @@ -12,6 +12,8 @@ from sklearn.datasets import make_classification from sklearn.utils.estimator_checks import check_estimator +from collections import Counter + from imblearn.under_sampling import CondensedNearestNeighbour # Generate a global dataset to use @@ -112,3 +114,31 @@ def test_cnn_sample_wrong_X(): cnn.fit(X, Y) assert_raises(RuntimeError, cnn.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_continuous_error(): + """Test either if an error is raised when the target are continuous + type""" + + # continuous case + y = np.linspace(0, 1, 5000) + cnn = CondensedNearestNeighbour(random_state=RND_SEED) + assert_warns(UserWarning, cnn.fit, X, y) + + +def test_multiclass_fit_sample(): + """Test fit sample method with multiclass target""" + + # Make y to be multiclass + y = Y.copy() + y[0:1000] = 2 + + # Resample the data + cnn = CondensedNearestNeighbour(random_state=RND_SEED) + X_resampled, y_resampled = cnn.fit_sample(X, y) + + # Check the size of y + count_y_res = Counter(y_resampled) + assert_equal(count_y_res[0], 400) + assert_equal(count_y_res[1], 113) + assert_equal(count_y_res[2], 147) From 57be3f006b7c7ff00ded4462f0d79f3b8d03d06b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 14:44:32 +0200 Subject: [PATCH 11/24] Check multiclass for EditedNearestNeighbours --- .../edited_nearest_neighbours.py | 30 +++++++++++++++++++ .../tests/test_edited_nearest_neighbours.py | 30 +++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/imblearn/under_sampling/edited_nearest_neighbours.py b/imblearn/under_sampling/edited_nearest_neighbours.py index 5dc97724f..04859e5a2 100644 --- a/imblearn/under_sampling/edited_nearest_neighbours.py +++ b/imblearn/under_sampling/edited_nearest_neighbours.py @@ -3,6 +3,8 @@ from __future__ import print_function from __future__ import division +import warnings + import numpy as np from collections import Counter @@ -10,6 +12,7 @@ from scipy.stats import mode from sklearn.neighbors import NearestNeighbors +from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -103,6 +106,33 @@ def __init__(self, return_indices=False, random_state=None, self.kind_sel = kind_sel self.n_jobs = n_jobs + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + + super(EditedNearestNeighbours, self).fit(X, y) + + # Check that y is binary + if not (type_of_target(y) == 'binary' or + type_of_target(y) == 'multiclass'): + warnings.warn('The target type should be binary or multiclass.') + + return self + def _sample(self, X, y): """Resample the dataset. 
diff --git a/imblearn/under_sampling/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/tests/test_edited_nearest_neighbours.py index 8f6af38d6..c2d88fcc8 100644 --- a/imblearn/under_sampling/tests/test_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/tests/test_edited_nearest_neighbours.py @@ -12,6 +12,8 @@ from sklearn.datasets import make_classification from sklearn.utils.estimator_checks import check_estimator +from collections import Counter + from imblearn.under_sampling import EditedNearestNeighbours # Generate a global dataset to use @@ -127,3 +129,31 @@ def test_enn_sample_wrong_X(): enn.fit(X, Y) assert_raises(RuntimeError, enn.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_continuous_error(): + """Test either if an error is raised when the target are continuous + type""" + + # continuous case + y = np.linspace(0, 1, 5000) + enn = EditedNearestNeighbours(random_state=RND_SEED) + assert_warns(UserWarning, enn.fit, X, y) + + +def test_multiclass_fit_sample(): + """Test fit sample method with multiclass target""" + + # Make y to be multiclass + y = Y.copy() + y[0:1000] = 2 + + # Resample the data + enn = EditedNearestNeighbours(random_state=RND_SEED) + X_resampled, y_resampled = enn.fit_sample(X, y) + + # Check the size of y + count_y_res = Counter(y_resampled) + assert_equal(count_y_res[0], 400) + assert_equal(count_y_res[1], 1836) + assert_equal(count_y_res[2], 5) From 1ef665606bd048321994acc9f7d38128bbdd25a9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 14:46:06 +0200 Subject: [PATCH 12/24] Check multiclass for RepeatedEditedNearestNeighbours --- ...test_repeated_edited_nearest_neighbours.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/imblearn/under_sampling/tests/test_repeated_edited_nearest_neighbours.py b/imblearn/under_sampling/tests/test_repeated_edited_nearest_neighbours.py index 2c959b09b..d4782fff1 100644 --- a/imblearn/under_sampling/tests/test_repeated_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/tests/test_repeated_edited_nearest_neighbours.py @@ -12,6 +12,8 @@ from sklearn.datasets import make_classification from sklearn.utils.estimator_checks import check_estimator +from collections import Counter + from imblearn.under_sampling import RepeatedEditedNearestNeighbours # Generate a global dataset to use @@ -140,3 +142,31 @@ def test_renn_sample_wrong_X(): renn.fit(X, Y) assert_raises(RuntimeError, renn.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_continuous_error(): + """Test either if an error is raised when the target are continuous + type""" + + # continuous case + y = np.linspace(0, 1, 5000) + enn = RepeatedEditedNearestNeighbours(random_state=RND_SEED) + assert_warns(UserWarning, enn.fit, X, y) + + +def test_multiclass_fit_sample(): + """Test fit sample method with multiclass target""" + + # Make y to be multiclass + y = Y.copy() + y[0:1000] = 2 + + # Resample the data + enn = RepeatedEditedNearestNeighbours(random_state=RND_SEED) + X_resampled, y_resampled = enn.fit_sample(X, y) + + # Check the size of y + count_y_res = Counter(y_resampled) + assert_equal(count_y_res[0], 378) + assert_equal(count_y_res[1], 1828) + assert_equal(count_y_res[2], 5) From 812d7f96a0a48e302147824e5272368631121fcd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 14:47:58 +0200 Subject: [PATCH 13/24] Check multiclass for AllKNN --- imblearn/under_sampling/tests/test_allknn.py | 30 ++++++++++++++++++++ 1 
file changed, 30 insertions(+) diff --git a/imblearn/under_sampling/tests/test_allknn.py b/imblearn/under_sampling/tests/test_allknn.py index 673b222ec..694df5e00 100644 --- a/imblearn/under_sampling/tests/test_allknn.py +++ b/imblearn/under_sampling/tests/test_allknn.py @@ -13,6 +13,8 @@ from sklearn.datasets import make_classification from sklearn.utils.estimator_checks import check_estimator +from collections import Counter + from imblearn.under_sampling import AllKNN # Generate a global dataset to use @@ -128,3 +130,31 @@ def test_allknn_sample_wrong_X(): allknn.fit(X, Y) assert_raises(RuntimeError, allknn.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_continuous_error(): + """Test either if an error is raised when the target are continuous + type""" + + # continuous case + y = np.linspace(0, 1, 5000) + ann = AllKNN(random_state=RND_SEED) + assert_warns(UserWarning, ann.fit, X, y) + + +def test_multiclass_fit_sample(): + """Test fit sample method with multiclass target""" + + # Make y to be multiclass + y = Y.copy() + y[0:1000] = 2 + + # Resample the data + ann = AllKNN(random_state=RND_SEED) + X_resampled, y_resampled = ann.fit_sample(X, y) + + # Check the size of y + count_y_res = Counter(y_resampled) + assert_equal(count_y_res[0], 341) + assert_equal(count_y_res[1], 2485) + assert_equal(count_y_res[2], 212) From 82cf6d67cd9425cc09f99d78ecf36d0cf75594e5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 14:50:53 +0200 Subject: [PATCH 14/24] Check multiclass for InstanceHardnessThreshold --- .../instance_hardness_threshold.py | 29 +++++++++++++++++++ .../tests/test_instance_hardness_threshold.py | 15 ++++++++++ 2 files changed, 44 insertions(+) diff --git a/imblearn/under_sampling/instance_hardness_threshold.py b/imblearn/under_sampling/instance_hardness_threshold.py index 208ca8492..6c258c753 100644 --- a/imblearn/under_sampling/instance_hardness_threshold.py +++ b/imblearn/under_sampling/instance_hardness_threshold.py @@ -3,11 +3,14 @@ from __future__ import print_function from __future__ import division +import warnings + import numpy as np from collections import Counter from sklearn.cross_validation import StratifiedKFold +from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -110,6 +113,32 @@ def __init__(self, estimator='linear-svm', ratio='auto', self.cv = cv self.n_jobs = n_jobs + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + + super(InstanceHardnessThreshold, self).fit(X, y) + + # Check that y is binary + if not type_of_target(y) == 'binary': + warnings.warn('The target type should be binary.') + + return self + def _sample(self, X, y): """Resample the dataset. 
diff --git a/imblearn/under_sampling/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/tests/test_instance_hardness_threshold.py index d4afbe227..ee4e2f9a3 100644 --- a/imblearn/under_sampling/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/tests/test_instance_hardness_threshold.py @@ -270,3 +270,18 @@ def test_iht_sample_wrong_X(): iht.fit(X, Y) assert_raises(RuntimeError, iht.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_multiclass_error(): + """ Test either if an error is raised when the target are not binary + type. """ + + # continuous case + y = np.linspace(0, 1, 5000) + iht = InstanceHardnessThreshold(random_state=RND_SEED) + assert_warns(UserWarning, iht.fit, X, y) + + # multiclass case + y = np.array([0] * 2000 + [1] * 2000 + [2] * 1000) + iht = InstanceHardnessThreshold(random_state=RND_SEED) + assert_warns(UserWarning, iht.fit, X, y) From 43a10ce6a0fadd0e7d5c7d83ecafd37c8ad1856b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 14:55:38 +0200 Subject: [PATCH 15/24] Check multiclass for NearMiss --- imblearn/under_sampling/nearmiss.py | 30 +++++++++++++++++++ .../neighbourhood_cleaning_rule.py | 30 +++++++++++++++++++ .../under_sampling/tests/test_nearmiss_1.py | 30 +++++++++++++++++++ .../under_sampling/tests/test_nearmiss_2.py | 30 +++++++++++++++++++ .../under_sampling/tests/test_nearmiss_3.py | 30 +++++++++++++++++++ 5 files changed, 150 insertions(+) diff --git a/imblearn/under_sampling/nearmiss.py b/imblearn/under_sampling/nearmiss.py index 29cc81221..31b1653c8 100644 --- a/imblearn/under_sampling/nearmiss.py +++ b/imblearn/under_sampling/nearmiss.py @@ -2,11 +2,14 @@ from __future__ import print_function from __future__ import division +import warnings + import numpy as np from collections import Counter from sklearn.neighbors import NearestNeighbors +from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -108,6 +111,33 @@ def __init__(self, ratio='auto', return_indices=False, random_state=None, self.n_jobs = n_jobs self.kwargs = kwargs + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + + super(NearMiss, self).fit(X, y) + + # Check that y is binary + if not (type_of_target(y) == 'binary' or + type_of_target(y) == 'multiclass'): + warnings.warn('The target type should be binary or multiclass.') + + return self + def _selection_dist_based(self, X, y, dist_vec, num_samples, key, sel_strategy='nearest'): """Select the appropriate samples depending of the strategy selected. 
diff --git a/imblearn/under_sampling/neighbourhood_cleaning_rule.py b/imblearn/under_sampling/neighbourhood_cleaning_rule.py index 8542cb621..3ca0b3174 100644 --- a/imblearn/under_sampling/neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/neighbourhood_cleaning_rule.py @@ -2,11 +2,14 @@ from __future__ import print_function from __future__ import division +import warnings + import numpy as np from collections import Counter from sklearn.neighbors import NearestNeighbors +from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -88,6 +91,33 @@ def __init__(self, return_indices=False, random_state=None, size_ngh=3, self.size_ngh = size_ngh self.n_jobs = n_jobs + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + + super(NeighbourhoodCleaningRule, self).fit(X, y) + + # Check that y is binary + if not (type_of_target(y) == 'binary' or + type_of_target(y) == 'multiclass'): + warnings.warn('The target type should be binary or multiclass.') + + return self + def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/under_sampling/tests/test_nearmiss_1.py b/imblearn/under_sampling/tests/test_nearmiss_1.py index dd11cdcd0..59c2523f5 100644 --- a/imblearn/under_sampling/tests/test_nearmiss_1.py +++ b/imblearn/under_sampling/tests/test_nearmiss_1.py @@ -12,6 +12,8 @@ from sklearn.datasets import make_classification from sklearn.utils.estimator_checks import check_estimator +from collections import Counter + from imblearn.under_sampling import NearMiss # Generate a global dataset to use @@ -204,3 +206,31 @@ def test_nm1_sample_wrong_X(): nm1.fit(X, Y) assert_raises(RuntimeError, nm1.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_continuous_error(): + """Test either if an error is raised when the target are continuous + type""" + + # continuous case + y = np.linspace(0, 1, 5000) + nm1 = NearMiss(random_state=RND_SEED, version=VERSION_NEARMISS) + assert_warns(UserWarning, nm1.fit, X, y) + + +def test_multiclass_fit_sample(): + """Test fit sample method with multiclass target""" + + # Make y to be multiclass + y = Y.copy() + y[0:1000] = 2 + + # Resample the data + nm1 = NearMiss(random_state=RND_SEED, version=VERSION_NEARMISS) + X_resampled, y_resampled = nm1.fit_sample(X, y) + + # Check the size of y + count_y_res = Counter(y_resampled) + assert_equal(count_y_res[0], 400) + assert_equal(count_y_res[1], 400) + assert_equal(count_y_res[2], 400) diff --git a/imblearn/under_sampling/tests/test_nearmiss_2.py b/imblearn/under_sampling/tests/test_nearmiss_2.py index 1213e5bd4..b520d3b41 100644 --- a/imblearn/under_sampling/tests/test_nearmiss_2.py +++ b/imblearn/under_sampling/tests/test_nearmiss_2.py @@ -12,6 +12,8 @@ from sklearn.datasets import make_classification from sklearn.utils.estimator_checks import check_estimator +from collections import Counter + from imblearn.under_sampling import NearMiss # Generate a global dataset to use @@ -204,3 +206,31 @@ def test_nm2_sample_wrong_X(): nm2.fit(X, Y) assert_raises(RuntimeError, nm2.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_continuous_error(): + """Test either if an error is raised when the target are continuous + type""" + + # 
continuous case + y = np.linspace(0, 1, 5000) + nm = NearMiss(random_state=RND_SEED, version=VERSION_NEARMISS) + assert_warns(UserWarning, nm.fit, X, y) + + +def test_multiclass_fit_sample(): + """Test fit sample method with multiclass target""" + + # Make y to be multiclass + y = Y.copy() + y[0:1000] = 2 + + # Resample the data + nm = NearMiss(random_state=RND_SEED, version=VERSION_NEARMISS) + X_resampled, y_resampled = nm.fit_sample(X, y) + + # Check the size of y + count_y_res = Counter(y_resampled) + assert_equal(count_y_res[0], 400) + assert_equal(count_y_res[1], 400) + assert_equal(count_y_res[2], 400) diff --git a/imblearn/under_sampling/tests/test_nearmiss_3.py b/imblearn/under_sampling/tests/test_nearmiss_3.py index c83ea50d7..825cd85cc 100644 --- a/imblearn/under_sampling/tests/test_nearmiss_3.py +++ b/imblearn/under_sampling/tests/test_nearmiss_3.py @@ -12,6 +12,8 @@ from sklearn.datasets import make_classification from sklearn.utils.estimator_checks import check_estimator +from collections import Counter + from imblearn.under_sampling import NearMiss # Generate a global dataset to use @@ -204,3 +206,31 @@ def test_nm3_sample_wrong_X(): nm3.fit(X, Y) assert_raises(RuntimeError, nm3.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_continuous_error(): + """Test either if an error is raised when the target are continuous + type""" + + # continuous case + y = np.linspace(0, 1, 5000) + nm = NearMiss(random_state=RND_SEED, version=VERSION_NEARMISS) + assert_warns(UserWarning, nm.fit, X, y) + + +def test_multiclass_fit_sample(): + """Test fit sample method with multiclass target""" + + # Make y to be multiclass + y = Y.copy() + y[0:1000] = 2 + + # Resample the data + nm = NearMiss(random_state=RND_SEED, version=VERSION_NEARMISS) + X_resampled, y_resampled = nm.fit_sample(X, y) + + # Check the size of y + count_y_res = Counter(y_resampled) + assert_equal(count_y_res[0], 400) + assert_equal(count_y_res[1], 166) + assert_equal(count_y_res[2], 144) From e1830db96927d4a517895bbb4079d6d4dad92bab Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 15:01:42 +0200 Subject: [PATCH 16/24] Check multiclass for OneSidedSelection --- .../under_sampling/one_sided_selection.py | 32 +++++++++++++++++++ .../tests/test_neighbourhood_cleaning_rule.py | 30 +++++++++++++++++ .../tests/test_one_sided_selection.py | 30 +++++++++++++++++ 3 files changed, 92 insertions(+) diff --git a/imblearn/under_sampling/one_sided_selection.py b/imblearn/under_sampling/one_sided_selection.py index ce6c3ced7..786fe7711 100644 --- a/imblearn/under_sampling/one_sided_selection.py +++ b/imblearn/under_sampling/one_sided_selection.py @@ -2,6 +2,8 @@ from __future__ import print_function from __future__ import division +import warnings + import numpy as np from collections import Counter @@ -9,6 +11,7 @@ from sklearn.neighbors import KNeighborsClassifier from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state +from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin from .tomek_links import TomekLinks @@ -61,6 +64,8 @@ class OneSidedSelection(SamplerMixin): ----- The method is based on [1]_. + This method support multiclass. + Examples -------- @@ -95,6 +100,33 @@ def __init__(self, return_indices=False, random_state=None, self.n_jobs = n_jobs self.kwargs = kwargs + def fit(self, X, y): + """Find the classes statistics before to perform sampling. 
+ + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + + super(OneSidedSelection, self).fit(X, y) + + # Check that y is binary + if not (type_of_target(y) == 'binary' or + type_of_target(y) == 'multiclass'): + warnings.warn('The target type should be binary or multiclass.') + + return self + def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/under_sampling/tests/test_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/tests/test_neighbourhood_cleaning_rule.py index 9945d2ad8..ea99cfcea 100644 --- a/imblearn/under_sampling/tests/test_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/tests/test_neighbourhood_cleaning_rule.py @@ -12,6 +12,8 @@ from sklearn.datasets import make_classification from sklearn.utils.estimator_checks import check_estimator +from collections import Counter + from imblearn.under_sampling import NeighbourhoodCleaningRule # Generate a global dataset to use @@ -112,3 +114,31 @@ def test_ncr_sample_wrong_X(): ncr.fit(X, Y) assert_raises(RuntimeError, ncr.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_continuous_error(): + """Test either if an error is raised when the target are continuous + type""" + + # continuous case + y = np.linspace(0, 1, 5000) + ncr = NeighbourhoodCleaningRule(random_state=RND_SEED) + assert_warns(UserWarning, ncr.fit, X, y) + + +def test_multiclass_fit_sample(): + """Test fit sample method with multiclass target""" + + # Make y to be multiclass + y = Y.copy() + y[0:1000] = 2 + + # Resample the data + ncr = NeighbourhoodCleaningRule(random_state=RND_SEED) + X_resampled, y_resampled = ncr.fit_sample(X, y) + + # Check the size of y + count_y_res = Counter(y_resampled) + assert_equal(count_y_res[0], 400) + assert_equal(count_y_res[1], 2268) + assert_equal(count_y_res[2], 42) diff --git a/imblearn/under_sampling/tests/test_one_sided_selection.py b/imblearn/under_sampling/tests/test_one_sided_selection.py index 7f3112233..32b363c87 100644 --- a/imblearn/under_sampling/tests/test_one_sided_selection.py +++ b/imblearn/under_sampling/tests/test_one_sided_selection.py @@ -12,6 +12,8 @@ from sklearn.datasets import make_classification from sklearn.utils.estimator_checks import check_estimator +from collections import Counter + from imblearn.under_sampling import OneSidedSelection # Generate a global dataset to use @@ -113,3 +115,31 @@ def test_oss_sample_wrong_X(): oss.fit(X, Y) assert_raises(RuntimeError, oss.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_continuous_error(): + """Test either if an error is raised when the target are continuous + type""" + + # continuous case + y = np.linspace(0, 1, 5000) + oss = OneSidedSelection(random_state=RND_SEED) + assert_warns(UserWarning, oss.fit, X, y) + + +def test_multiclass_fit_sample(): + """Test fit sample method with multiclass target""" + + # Make y to be multiclass + y = Y.copy() + y[0:1000] = 2 + + # Resample the data + oss = OneSidedSelection(random_state=RND_SEED) + X_resampled, y_resampled = oss.fit_sample(X, y) + + # Check the size of y + count_y_res = Counter(y_resampled) + assert_equal(count_y_res[0], 400) + assert_equal(count_y_res[1], 2410) + assert_equal(count_y_res[2], 715) From ad487e25687eebf8d6103f01f25266e97c37c44f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 
27 Jul 2016 15:05:30 +0200 Subject: [PATCH 17/24] Check multiclass for RandomOverSampler --- .../under_sampling/random_under_sampler.py | 30 +++++++++++++++++++ .../tests/test_random_under_sampler.py | 30 +++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/imblearn/under_sampling/random_under_sampler.py b/imblearn/under_sampling/random_under_sampler.py index c41d254bd..874dcca97 100644 --- a/imblearn/under_sampling/random_under_sampler.py +++ b/imblearn/under_sampling/random_under_sampler.py @@ -2,11 +2,14 @@ from __future__ import print_function from __future__ import division +import warnings + import numpy as np from collections import Counter from sklearn.utils import check_random_state +from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -82,6 +85,33 @@ def __init__(self, ratio='auto', return_indices=False, random_state=None, self.random_state = random_state self.replacement = replacement + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + + super(RandomUnderSampler, self).fit(X, y) + + # Check that y is binary + if not (type_of_target(y) == 'binary' or + type_of_target(y) == 'multiclass'): + warnings.warn('The target type should be binary or multiclass.') + + return self + def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/under_sampling/tests/test_random_under_sampler.py b/imblearn/under_sampling/tests/test_random_under_sampler.py index cc1e0fc73..2f2f7f5fb 100644 --- a/imblearn/under_sampling/tests/test_random_under_sampler.py +++ b/imblearn/under_sampling/tests/test_random_under_sampler.py @@ -12,6 +12,8 @@ from sklearn.datasets import make_classification from sklearn.utils.estimator_checks import check_estimator +from collections import Counter + from imblearn.under_sampling import RandomUnderSampler # Generate a global dataset to use @@ -162,3 +164,31 @@ def test_rus_sample_wrong_X(): rus.fit(X, Y) assert_raises(RuntimeError, rus.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_continuous_error(): + """Test either if an error is raised when the target are continuous + type""" + + # continuous case + y = np.linspace(0, 1, 5000) + rus = RandomUnderSampler(random_state=RND_SEED) + assert_warns(UserWarning, rus.fit, X, y) + + +def test_multiclass_fit_sample(): + """Test fit sample method with multiclass target""" + + # Make y to be multiclass + y = Y.copy() + y[0:1000] = 2 + + # Resample the data + rus = RandomUnderSampler(random_state=RND_SEED) + X_resampled, y_resampled = rus.fit_sample(X, y) + + # Check the size of y + count_y_res = Counter(y_resampled) + assert_equal(count_y_res[0], 400) + assert_equal(count_y_res[1], 400) + assert_equal(count_y_res[2], 400) From 2a243f9115683936a47ee21bc2827e384b86f05d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 15:07:35 +0200 Subject: [PATCH 18/24] Check multiclass for TomekLinks --- .../under_sampling/tests/test_tomek_links.py | 15 ++++++++++ imblearn/under_sampling/tomek_links.py | 29 +++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/imblearn/under_sampling/tests/test_tomek_links.py b/imblearn/under_sampling/tests/test_tomek_links.py index e440193e4..721b93fb5 100644 --- 
a/imblearn/under_sampling/tests/test_tomek_links.py +++ b/imblearn/under_sampling/tests/test_tomek_links.py @@ -111,3 +111,18 @@ def test_tl_sample_wrong_X(): tl.fit(X, Y) assert_raises(RuntimeError, tl.sample, np.random.random((100, 40)), np.array([0] * 50 + [1] * 50)) + + +def test_multiclass_error(): + """ Test either if an error is raised when the target are not binary + type. """ + + # continuous case + y = np.linspace(0, 1, 5000) + tl = TomekLinks(random_state=RND_SEED) + assert_warns(UserWarning, tl.fit, X, y) + + # multiclass case + y = np.array([0] * 2000 + [1] * 2000 + [2] * 1000) + tl = TomekLinks(random_state=RND_SEED) + assert_warns(UserWarning, tl.fit, X, y) diff --git a/imblearn/under_sampling/tomek_links.py b/imblearn/under_sampling/tomek_links.py index 4857f2cd0..95a038ff1 100644 --- a/imblearn/under_sampling/tomek_links.py +++ b/imblearn/under_sampling/tomek_links.py @@ -2,11 +2,14 @@ from __future__ import print_function from __future__ import division +import warnings + import numpy as np from collections import Counter from sklearn.neighbors import NearestNeighbors +from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -81,6 +84,32 @@ def __init__(self, return_indices=False, random_state=None, self.random_state = random_state self.n_jobs = n_jobs + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + + super(TomekLinks, self).fit(X, y) + + # Check that y is binary + if not type_of_target(y) == 'binary': + warnings.warn('The target type should be binary.') + + return self + @staticmethod def is_tomek(y, nn_index, class_type): """is_tomek uses the target vector and the first neighbour of every From ff52bfcb417368bf14f81620f021c56cdc32d097 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 15:22:55 +0200 Subject: [PATCH 19/24] Avoid testing CNN --- .../tests/test_condensed_nearest_neighbour.py | 70 +++++++++---------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/imblearn/under_sampling/tests/test_condensed_nearest_neighbour.py b/imblearn/under_sampling/tests/test_condensed_nearest_neighbour.py index 887a47eda..5c5af1896 100644 --- a/imblearn/under_sampling/tests/test_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/tests/test_condensed_nearest_neighbour.py @@ -75,34 +75,34 @@ def test_cnn_sample_wt_fit(): assert_raises(RuntimeError, cnn.sample, X, Y) -def test_cnn_fit_sample(): - """Test the fit sample routine""" +# def test_cnn_fit_sample(): +# """Test the fit sample routine""" - # Resample the data - cnn = CondensedNearestNeighbour(random_state=RND_SEED) - X_resampled, y_resampled = cnn.fit_sample(X, Y) +# # Resample the data +# cnn = CondensedNearestNeighbour(random_state=RND_SEED) +# X_resampled, y_resampled = cnn.fit_sample(X, Y) - currdir = os.path.dirname(os.path.abspath(__file__)) - X_gt = np.load(os.path.join(currdir, 'data', 'cnn_x.npy')) - y_gt = np.load(os.path.join(currdir, 'data', 'cnn_y.npy')) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) +# currdir = os.path.dirname(os.path.abspath(__file__)) +# X_gt = np.load(os.path.join(currdir, 'data', 'cnn_x.npy')) +# y_gt = np.load(os.path.join(currdir, 'data', 'cnn_y.npy')) +# 
assert_array_equal(X_resampled, X_gt) +# assert_array_equal(y_resampled, y_gt) -def test_cnn_fit_sample_with_indices(): - """Test the fit sample routine with indices support""" +# def test_cnn_fit_sample_with_indices(): +# """Test the fit sample routine with indices support""" - # Resample the data - cnn = CondensedNearestNeighbour(return_indices=True, random_state=RND_SEED) - X_resampled, y_resampled, idx_under = cnn.fit_sample(X, Y) +# # Resample the data +# cnn = CondensedNearestNeighbour(return_indices=True, random_state=RND_SEED) +# X_resampled, y_resampled, idx_under = cnn.fit_sample(X, Y) - currdir = os.path.dirname(os.path.abspath(__file__)) - X_gt = np.load(os.path.join(currdir, 'data', 'cnn_x.npy')) - y_gt = np.load(os.path.join(currdir, 'data', 'cnn_y.npy')) - idx_gt = np.load(os.path.join(currdir, 'data', 'cnn_idx.npy')) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - assert_array_equal(idx_under, idx_gt) +# currdir = os.path.dirname(os.path.abspath(__file__)) +# X_gt = np.load(os.path.join(currdir, 'data', 'cnn_x.npy')) +# y_gt = np.load(os.path.join(currdir, 'data', 'cnn_y.npy')) +# idx_gt = np.load(os.path.join(currdir, 'data', 'cnn_idx.npy')) +# assert_array_equal(X_resampled, X_gt) +# assert_array_equal(y_resampled, y_gt) +# assert_array_equal(idx_under, idx_gt) def test_cnn_sample_wrong_X(): @@ -126,19 +126,19 @@ def test_continuous_error(): assert_warns(UserWarning, cnn.fit, X, y) -def test_multiclass_fit_sample(): - """Test fit sample method with multiclass target""" +# def test_multiclass_fit_sample(): +# """Test fit sample method with multiclass target""" - # Make y to be multiclass - y = Y.copy() - y[0:1000] = 2 +# # Make y to be multiclass +# y = Y.copy() +# y[0:1000] = 2 - # Resample the data - cnn = CondensedNearestNeighbour(random_state=RND_SEED) - X_resampled, y_resampled = cnn.fit_sample(X, y) +# # Resample the data +# cnn = CondensedNearestNeighbour(random_state=RND_SEED) +# X_resampled, y_resampled = cnn.fit_sample(X, y) - # Check the size of y - count_y_res = Counter(y_resampled) - assert_equal(count_y_res[0], 400) - assert_equal(count_y_res[1], 113) - assert_equal(count_y_res[2], 147) +# # Check the size of y +# count_y_res = Counter(y_resampled) +# assert_equal(count_y_res[0], 400) +# assert_equal(count_y_res[1], 113) +# assert_equal(count_y_res[2], 147) From e7ac93f58f2154b6ea8b77b60a778eee717ad640 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 18:42:33 +0200 Subject: [PATCH 20/24] Add a dictionary for the properties of each sampler --- imblearn/__init__.py | 5 ++ imblearn/base.py | 12 +++- imblearn/combine/smote_enn.py | 10 +-- imblearn/combine/smote_tomek.py | 10 +-- imblearn/ensemble/balance_cascade.py | 32 +-------- imblearn/ensemble/easy_ensemble.py | 34 ++-------- imblearn/ensemble/tests/test_easy_ensemble.py | 27 ++++++-- imblearn/over_sampling/adasyn.py | 31 +-------- imblearn/over_sampling/random_over_sampler.py | 32 +-------- imblearn/over_sampling/smote.py | 28 +------- imblearn/setup.py | 2 + imblearn/under_sampling/cluster_centroids.py | 32 +-------- .../condensed_nearest_neighbour.py | 32 +-------- .../edited_nearest_neighbours.py | 36 ++-------- .../instance_hardness_threshold.py | 31 +-------- imblearn/under_sampling/nearmiss.py | 32 +-------- .../neighbourhood_cleaning_rule.py | 32 +-------- .../under_sampling/one_sided_selection.py | 32 +-------- .../under_sampling/random_under_sampler.py | 32 +-------- imblearn/under_sampling/tomek_links.py | 31 +-------- 
imblearn/utils/__init__.py | 7 ++ imblearn/utils/tests/test_validation.py | 67 +++++++++++++++++++ imblearn/utils/validation.py | 43 ++++++++++++ 23 files changed, 195 insertions(+), 435 deletions(-) create mode 100644 imblearn/utils/__init__.py create mode 100644 imblearn/utils/tests/test_validation.py create mode 100644 imblearn/utils/validation.py diff --git a/imblearn/__init__.py b/imblearn/__init__.py index 60649b506..abb8690ae 100644 --- a/imblearn/__init__.py +++ b/imblearn/__init__.py @@ -14,6 +14,10 @@ Module which provides methods to under-sample a dataset. under-sampling Module which provides methods to over-sample a dataset. +utils + Module which provides various utilities. +pipeline + Module which allowing to create pipeline with scikit-learn estimators. """ from .version import _check_module_dependencies, __version__ @@ -32,5 +36,6 @@ 'ensemble', 'over_sampling', 'under_sampling', + 'utils', 'pipeline', '__version__'] diff --git a/imblearn/base.py b/imblearn/base.py index c1e626492..634706760 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -18,6 +18,7 @@ from six import string_types +from .utils import check_target_type class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)): @@ -27,7 +28,7 @@ class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)): instead. """ - _estimator_type = "sampler" + _estimator_type = 'sampler' def __init__(self, ratio='auto'): """Initialize this object and its instance variables. @@ -70,6 +71,9 @@ def fit(self, X, y): # Check the consistency of X and y X, y = check_X_y(X, y) + # Check the target type consistency + check_target_type(self, y) + self.min_c_ = None self.maj_c_ = None self.stats_c_ = {} @@ -226,3 +230,9 @@ def __setstate__(self, dict): logger = logging.getLogger(__name__) self.__dict__.update(dict) self.logger = logger + + @classmethod + def get_properties(cls): + """Get the properties for this estimator.""" + + return cls._estimator_prop diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py index 257b79eed..8eefc193d 100644 --- a/imblearn/combine/smote_enn.py +++ b/imblearn/combine/smote_enn.py @@ -2,10 +2,6 @@ from __future__ import print_function from __future__ import division -import warnings - -from sklearn.utils.multiclass import type_of_target - from ..over_sampling import SMOTE from ..under_sampling import EditedNearestNeighbours from ..base import SamplerMixin @@ -106,6 +102,8 @@ class SMOTEENN(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': False} + def __init__(self, ratio='auto', random_state=None, k=5, m=10, out_step=0.5, kind_smote='regular', size_ngh=3, kind_enn='all', n_jobs=-1, **kwargs): @@ -149,10 +147,6 @@ def fit(self, X, y): super(SMOTEENN, self).fit(X, y) - # Check that y is binary - if not type_of_target(y) == 'binary': - warnings.warn('The target type should be binary.') - # Fit using SMOTE self.sm.fit(X, y) diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py index 591e87b5c..5b13dfd91 100644 --- a/imblearn/combine/smote_tomek.py +++ b/imblearn/combine/smote_tomek.py @@ -3,10 +3,6 @@ from __future__ import print_function from __future__ import division -import warnings - -from sklearn.utils.multiclass import type_of_target - from ..over_sampling import SMOTE from ..under_sampling import TomekLinks from ..base import SamplerMixin @@ -107,6 +103,8 @@ class SMOTETomek(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': False} + def __init__(self, ratio='auto', random_state=None, k=5, m=10, out_step=0.5, 
kind_smote='regular', n_jobs=-1, **kwargs): @@ -144,10 +142,6 @@ def fit(self, X, y): super(SMOTETomek, self).fit(X, y) - # Check that y is binary - if not type_of_target(y) == 'binary': - warnings.warn('The target type should be binary.') - # Fit using SMOTE self.sm.fit(X, y) diff --git a/imblearn/ensemble/balance_cascade.py b/imblearn/ensemble/balance_cascade.py index b73bd40e2..89c3e0ad6 100644 --- a/imblearn/ensemble/balance_cascade.py +++ b/imblearn/ensemble/balance_cascade.py @@ -1,11 +1,8 @@ """Class to perform under-sampling using balace cascade.""" from __future__ import print_function -import warnings - import numpy as np -from sklearn.utils.multiclass import type_of_target from sklearn.utils import check_random_state from ..base import SamplerMixin @@ -103,6 +100,9 @@ class BalanceCascade(SamplerMixin): April 2009. """ + + _estimator_prop = {'handles_multiclass': False} + def __init__(self, ratio='auto', return_indices=False, random_state=None, n_max_subset=None, classifier='knn', bootstrap=True, **kwargs): @@ -114,32 +114,6 @@ def __init__(self, ratio='auto', return_indices=False, random_state=None, self.bootstrap = bootstrap self.kwargs = kwargs - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(BalanceCascade, self).fit(X, y) - - # Check that y is binary - if not type_of_target(y) == 'binary': - warnings.warn('The target type should be binary.') - - return self - def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/ensemble/easy_ensemble.py b/imblearn/ensemble/easy_ensemble.py index fabe95241..a24e34ad5 100644 --- a/imblearn/ensemble/easy_ensemble.py +++ b/imblearn/ensemble/easy_ensemble.py @@ -1,12 +1,8 @@ """Class to perform under-sampling using easy ensemble.""" from __future__ import print_function -import warnings - import numpy as np -from sklearn.utils.multiclass import type_of_target - from ..base import SamplerMixin from ..under_sampling import RandomUnderSampler @@ -60,6 +56,8 @@ class EasyEnsemble(SamplerMixin): ----- The method is described in [1]_. + This method supports multiclass target type. + Examples -------- @@ -86,6 +84,8 @@ class EasyEnsemble(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': True} + def __init__(self, ratio='auto', return_indices=False, random_state=None, replacement=False, n_subsets=10): super(EasyEnsemble, self).__init__(ratio=ratio) @@ -94,32 +94,6 @@ def __init__(self, ratio='auto', return_indices=False, self.replacement = replacement self.n_subsets = n_subsets - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(EasyEnsemble, self).fit(X, y) - - # Check that y is binary - if not type_of_target(y) == 'binary': - warnings.warn('The target type should be binary.') - - return self - def _sample(self, X, y): """Resample the dataset. 
diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py b/imblearn/ensemble/tests/test_easy_ensemble.py index e3cb0a9d8..98fb1318a 100644 --- a/imblearn/ensemble/tests/test_easy_ensemble.py +++ b/imblearn/ensemble/tests/test_easy_ensemble.py @@ -12,6 +12,8 @@ from sklearn.datasets import make_classification from sklearn.utils.estimator_checks import check_estimator +from collections import Counter + from imblearn.ensemble import EasyEnsemble # Generate a global dataset to use @@ -172,16 +174,29 @@ def test_sample_wrong_X(): np.array([0] * 50 + [1] * 50)) -def test_multiclass_error(): - """ Test either if an error is raised when the target are not binary - type. """ +def test_continuous_error(): + """Test either if an error is raised when the target are continuous + type""" # continuous case y = np.linspace(0, 1, 5000) ee = EasyEnsemble(random_state=RND_SEED) assert_warns(UserWarning, ee.fit, X, y) - # multiclass case - y = np.array([0] * 2000 + [1] * 2000 + [2] * 1000) + +def test_multiclass_fit_sample(): + """Test fit sample method with multiclass target""" + + # Make y to be multiclass + y = Y.copy() + y[0:1000] = 2 + + # Resample the data ee = EasyEnsemble(random_state=RND_SEED) - assert_warns(UserWarning, ee.fit, X, y) + X_resampled, y_resampled = ee.fit_sample(X, y) + + # Check the size of y + count_y_res = Counter(y_resampled[0]) + assert_equal(count_y_res[0], 400) + assert_equal(count_y_res[1], 400) + assert_equal(count_y_res[2], 400) diff --git a/imblearn/over_sampling/adasyn.py b/imblearn/over_sampling/adasyn.py index ce9b14907..9326b3989 100644 --- a/imblearn/over_sampling/adasyn.py +++ b/imblearn/over_sampling/adasyn.py @@ -2,15 +2,12 @@ from __future__ import print_function from __future__ import division -import warnings - import numpy as np from collections import Counter from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state -from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -89,6 +86,8 @@ class ADASYN(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': False} + def __init__(self, ratio='auto', random_state=None, @@ -101,32 +100,6 @@ def __init__(self, self.nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1, n_jobs=self.n_jobs) - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(ADASYN, self).fit(X, y) - - # Check that y is binary - if not type_of_target(y) == 'binary': - warnings.warn('The target type should be binary.') - - return self - def _sample(self, X, y): """Resample the dataset. 
diff --git a/imblearn/over_sampling/random_over_sampler.py b/imblearn/over_sampling/random_over_sampler.py index eb998be87..81547d73c 100644 --- a/imblearn/over_sampling/random_over_sampler.py +++ b/imblearn/over_sampling/random_over_sampler.py @@ -2,14 +2,11 @@ from __future__ import print_function from __future__ import division -import warnings - import numpy as np from collections import Counter from sklearn.utils import check_random_state -from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -73,6 +70,8 @@ class RandomOverSampler(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': True} + def __init__(self, ratio='auto', random_state=None): @@ -80,33 +79,6 @@ def __init__(self, super(RandomOverSampler, self).__init__(ratio=ratio) self.random_state = random_state - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(RandomOverSampler, self).fit(X, y) - - # Check that y is binary - if not (type_of_target(y) == 'binary' or - type_of_target(y) == 'multiclass'): - warnings.warn('The target type should be binary or multiclass.') - - return self - def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index 57020fa3c..b34454336 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -109,6 +109,8 @@ class SMOTE(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': False} + def __init__(self, ratio='auto', random_state=None, @@ -127,32 +129,6 @@ def __init__(self, self.n_jobs = n_jobs self.kwargs = kwargs - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(SMOTE, self).fit(X, y) - - # Check that y is binary - if not type_of_target(y) == 'binary': - warnings.warn('The target type should be binary.') - - return self - def _in_danger_noise(self, samples, y, kind='danger'): """Estimate if a set of sample are in danger or noise. 
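The changes above tag each sampler with an `_estimator_prop` dictionary (e.g. `{'handles_multiclass': False}` for SMOTE) and move the target check into `SamplerMixin.fit` through the new `check_target_type` helper added later in this patch (`imblearn/utils/validation.py`). The stand-alone sketch below illustrates that dispatch logic only; `check_target_type_sketch` and its `handles_multiclass` argument are illustrative stand-ins rather than the library's API, and the only real dependency is `sklearn.utils.multiclass.type_of_target`.

import warnings

import numpy as np
from sklearn.utils.multiclass import type_of_target


def check_target_type_sketch(y, handles_multiclass):
    """Warn when the target type is not supported by the sampler (sketch only)."""
    allowed = ('binary', 'multiclass') if handles_multiclass else ('binary',)
    if type_of_target(y) not in allowed:
        warnings.warn('The target type should be %s.' % ' or '.join(allowed))


y_multi = np.array([0] * 10 + [1] * 10 + [2] * 5)
# A binary-only sampler such as SMOTE would warn on this multiclass target...
check_target_type_sketch(y_multi, handles_multiclass=False)
# ...whereas a multiclass-capable sampler such as RandomUnderSampler would not.
check_target_type_sketch(y_multi, handles_multiclass=True)
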
diff --git a/imblearn/setup.py b/imblearn/setup.py index 70f919303..9ae8d0771 100644 --- a/imblearn/setup.py +++ b/imblearn/setup.py @@ -11,6 +11,8 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('over_sampling/tests') config.add_subpackage('under_sampling') config.add_subpackage('under_sampling/tests') + config.add_subpackage('utils') + config.add_subpackage('utils/tests') config.add_subpackage('tests') diff --git a/imblearn/under_sampling/cluster_centroids.py b/imblearn/under_sampling/cluster_centroids.py index d77132f75..3757883a3 100644 --- a/imblearn/under_sampling/cluster_centroids.py +++ b/imblearn/under_sampling/cluster_centroids.py @@ -3,15 +3,12 @@ from __future__ import print_function from __future__ import division -import warnings - import numpy as np from collections import Counter from sklearn.cluster import KMeans from sklearn.utils import check_random_state -from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -84,39 +81,14 @@ class ClusterCentroids(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': True} + def __init__(self, ratio='auto', random_state=None, n_jobs=-1, **kwargs): super(ClusterCentroids, self).__init__(ratio=ratio) self.random_state = random_state self.n_jobs = n_jobs self.kwargs = kwargs - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(ClusterCentroids, self).fit(X, y) - - # Check that y is binary - if not (type_of_target(y) == 'binary' or - type_of_target(y) == 'multiclass'): - warnings.warn('The target type should be binary or multiclass.') - - return self - def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/under_sampling/condensed_nearest_neighbour.py b/imblearn/under_sampling/condensed_nearest_neighbour.py index eaa171d6a..1a8b43b2a 100644 --- a/imblearn/under_sampling/condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/condensed_nearest_neighbour.py @@ -3,14 +3,11 @@ from __future__ import print_function from __future__ import division -import warnings - import numpy as np from collections import Counter from sklearn.utils import check_random_state -from sklearn.utils.multiclass import type_of_target from sklearn.neighbors import KNeighborsClassifier from ..base import SamplerMixin @@ -90,6 +87,8 @@ class CondensedNearestNeighbour(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': True} + def __init__(self, return_indices=False, random_state=None, size_ngh=1, n_seeds_S=1, n_jobs=-1, **kwargs): super(CondensedNearestNeighbour, self).__init__() @@ -101,33 +100,6 @@ def __init__(self, return_indices=False, random_state=None, self.n_jobs = n_jobs self.kwargs = kwargs - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. 
- - """ - - super(CondensedNearestNeighbour, self).fit(X, y) - - # Check that y is binary - if not (type_of_target(y) == 'binary' or - type_of_target(y) == 'multiclass'): - warnings.warn('The target type should be binary or multiclass.') - - return self - def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/under_sampling/edited_nearest_neighbours.py b/imblearn/under_sampling/edited_nearest_neighbours.py index 04859e5a2..fe16c4d3b 100644 --- a/imblearn/under_sampling/edited_nearest_neighbours.py +++ b/imblearn/under_sampling/edited_nearest_neighbours.py @@ -3,8 +3,6 @@ from __future__ import print_function from __future__ import division -import warnings - import numpy as np from collections import Counter @@ -12,7 +10,6 @@ from scipy.stats import mode from sklearn.neighbors import NearestNeighbors -from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -97,6 +94,8 @@ class EditedNearestNeighbours(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': True} + def __init__(self, return_indices=False, random_state=None, size_ngh=3, kind_sel='all', n_jobs=-1): super(EditedNearestNeighbours, self).__init__() @@ -106,33 +105,6 @@ def __init__(self, return_indices=False, random_state=None, self.kind_sel = kind_sel self.n_jobs = n_jobs - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(EditedNearestNeighbours, self).fit(X, y) - - # Check that y is binary - if not (type_of_target(y) == 'binary' or - type_of_target(y) == 'multiclass'): - warnings.warn('The target type should be binary or multiclass.') - - return self - def _sample(self, X, y): """Resample the dataset. @@ -312,6 +284,8 @@ class RepeatedEditedNearestNeighbours(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': True} + def __init__(self, return_indices=False, random_state=None, size_ngh=3, max_iter=100, kind_sel='all', n_jobs=-1): super(RepeatedEditedNearestNeighbours, self).__init__() @@ -490,6 +464,8 @@ class AllKNN(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': True} + def __init__(self, return_indices=False, random_state=None, size_ngh=3, kind_sel='all', n_jobs=-1): super(AllKNN, self).__init__() diff --git a/imblearn/under_sampling/instance_hardness_threshold.py b/imblearn/under_sampling/instance_hardness_threshold.py index 6c258c753..9bb4bda43 100644 --- a/imblearn/under_sampling/instance_hardness_threshold.py +++ b/imblearn/under_sampling/instance_hardness_threshold.py @@ -3,14 +3,11 @@ from __future__ import print_function from __future__ import division -import warnings - import numpy as np from collections import Counter from sklearn.cross_validation import StratifiedKFold -from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -102,6 +99,8 @@ class InstanceHardnessThreshold(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': False} + def __init__(self, estimator='linear-svm', ratio='auto', return_indices=False, random_state=None, cv=5, n_jobs=-1, **kwargs): @@ -113,32 +112,6 @@ def __init__(self, estimator='linear-svm', ratio='auto', self.cv = cv self.n_jobs = n_jobs - def fit(self, X, y): - """Find the classes statistics before to perform sampling. 
- - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(InstanceHardnessThreshold, self).fit(X, y) - - # Check that y is binary - if not type_of_target(y) == 'binary': - warnings.warn('The target type should be binary.') - - return self - def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/under_sampling/nearmiss.py b/imblearn/under_sampling/nearmiss.py index 31b1653c8..a7618554f 100644 --- a/imblearn/under_sampling/nearmiss.py +++ b/imblearn/under_sampling/nearmiss.py @@ -2,14 +2,11 @@ from __future__ import print_function from __future__ import division -import warnings - import numpy as np from collections import Counter from sklearn.neighbors import NearestNeighbors -from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -100,6 +97,8 @@ class NearMiss(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': True} + def __init__(self, ratio='auto', return_indices=False, random_state=None, version=1, size_ngh=3, ver3_samp_ngh=3, n_jobs=-1, **kwargs): super(NearMiss, self).__init__(ratio=ratio) @@ -111,33 +110,6 @@ def __init__(self, ratio='auto', return_indices=False, random_state=None, self.n_jobs = n_jobs self.kwargs = kwargs - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(NearMiss, self).fit(X, y) - - # Check that y is binary - if not (type_of_target(y) == 'binary' or - type_of_target(y) == 'multiclass'): - warnings.warn('The target type should be binary or multiclass.') - - return self - def _selection_dist_based(self, X, y, dist_vec, num_samples, key, sel_strategy='nearest'): """Select the appropriate samples depending of the strategy selected. diff --git a/imblearn/under_sampling/neighbourhood_cleaning_rule.py b/imblearn/under_sampling/neighbourhood_cleaning_rule.py index 3ca0b3174..0b45a1318 100644 --- a/imblearn/under_sampling/neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/neighbourhood_cleaning_rule.py @@ -2,14 +2,11 @@ from __future__ import print_function from __future__ import division -import warnings - import numpy as np from collections import Counter from sklearn.neighbors import NearestNeighbors -from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -83,6 +80,8 @@ class NeighbourhoodCleaningRule(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': True} + def __init__(self, return_indices=False, random_state=None, size_ngh=3, n_jobs=-1): super(NeighbourhoodCleaningRule, self).__init__() @@ -91,33 +90,6 @@ def __init__(self, return_indices=False, random_state=None, size_ngh=3, self.size_ngh = size_ngh self.n_jobs = n_jobs - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. 
- - """ - - super(NeighbourhoodCleaningRule, self).fit(X, y) - - # Check that y is binary - if not (type_of_target(y) == 'binary' or - type_of_target(y) == 'multiclass'): - warnings.warn('The target type should be binary or multiclass.') - - return self - def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/under_sampling/one_sided_selection.py b/imblearn/under_sampling/one_sided_selection.py index 786fe7711..2ceabe97f 100644 --- a/imblearn/under_sampling/one_sided_selection.py +++ b/imblearn/under_sampling/one_sided_selection.py @@ -2,8 +2,6 @@ from __future__ import print_function from __future__ import division -import warnings - import numpy as np from collections import Counter @@ -11,7 +9,6 @@ from sklearn.neighbors import KNeighborsClassifier from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state -from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin from .tomek_links import TomekLinks @@ -90,6 +87,8 @@ class OneSidedSelection(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': True} + def __init__(self, return_indices=False, random_state=None, size_ngh=1, n_seeds_S=1, n_jobs=-1, **kwargs): super(OneSidedSelection, self).__init__() @@ -100,33 +99,6 @@ def __init__(self, return_indices=False, random_state=None, self.n_jobs = n_jobs self.kwargs = kwargs - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(OneSidedSelection, self).fit(X, y) - - # Check that y is binary - if not (type_of_target(y) == 'binary' or - type_of_target(y) == 'multiclass'): - warnings.warn('The target type should be binary or multiclass.') - - return self - def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/under_sampling/random_under_sampler.py b/imblearn/under_sampling/random_under_sampler.py index 874dcca97..a0a23915a 100644 --- a/imblearn/under_sampling/random_under_sampler.py +++ b/imblearn/under_sampling/random_under_sampler.py @@ -2,14 +2,11 @@ from __future__ import print_function from __future__ import division -import warnings - import numpy as np from collections import Counter from sklearn.utils import check_random_state -from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -78,6 +75,8 @@ class RandomUnderSampler(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': True} + def __init__(self, ratio='auto', return_indices=False, random_state=None, replacement=True): super(RandomUnderSampler, self).__init__(ratio=ratio) @@ -85,33 +84,6 @@ def __init__(self, ratio='auto', return_indices=False, random_state=None, self.random_state = random_state self.replacement = replacement - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. 
- - """ - - super(RandomUnderSampler, self).fit(X, y) - - # Check that y is binary - if not (type_of_target(y) == 'binary' or - type_of_target(y) == 'multiclass'): - warnings.warn('The target type should be binary or multiclass.') - - return self - def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/under_sampling/tomek_links.py b/imblearn/under_sampling/tomek_links.py index 95a038ff1..a07b44ce6 100644 --- a/imblearn/under_sampling/tomek_links.py +++ b/imblearn/under_sampling/tomek_links.py @@ -2,14 +2,11 @@ from __future__ import print_function from __future__ import division -import warnings - import numpy as np from collections import Counter from sklearn.neighbors import NearestNeighbors -from sklearn.utils.multiclass import type_of_target from ..base import SamplerMixin @@ -77,6 +74,8 @@ class TomekLinks(SamplerMixin): """ + _estimator_prop = {'handles_multiclass': False} + def __init__(self, return_indices=False, random_state=None, n_jobs=-1): super(TomekLinks, self).__init__() @@ -84,32 +83,6 @@ def __init__(self, return_indices=False, random_state=None, self.random_state = random_state self.n_jobs = n_jobs - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(TomekLinks, self).fit(X, y) - - # Check that y is binary - if not type_of_target(y) == 'binary': - warnings.warn('The target type should be binary.') - - return self - @staticmethod def is_tomek(y, nn_index, class_type): """is_tomek uses the target vector and the first neighbour of every diff --git a/imblearn/utils/__init__.py b/imblearn/utils/__init__.py new file mode 100644 index 000000000..f47c5235c --- /dev/null +++ b/imblearn/utils/__init__.py @@ -0,0 +1,7 @@ +""" +The :mod:`imblearn.utils` module includes various utilities. 
+""" + +from .validation import check_target_type + +__all__ = ['check_target_type'] diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py new file mode 100644 index 000000000..07f6afe08 --- /dev/null +++ b/imblearn/utils/tests/test_validation.py @@ -0,0 +1,67 @@ +"""Tests for input validation functions""" + +from collections import Counter + +import numpy as np +from numpy.testing import assert_raises +from numpy.testing import assert_warns +from numpy.testing import assert_equal + +from sklearn.datasets import make_classification +from sklearn.ensemble import AdaBoostClassifier + +from imblearn.under_sampling import RandomUnderSampler +from imblearn.over_sampling import SMOTE +from imblearn.utils import check_target_type + +# Generate a global dataset to use +RND_SEED = 0 +X, Y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], + n_informative=3, n_redundant=1, flip_y=0, + n_features=20, n_clusters_per_class=1, + n_samples=5000, random_state=RND_SEED) + + +def test_check_target_type(): + """Test to check the target type function""" + + # Check that an error is raised when non estimator are passed + assert_raises(TypeError, check_target_type, 'Something', np.ones((100, 1))) + + # Check that an error is raised when an estimator is passed but not a + # sampler + assert_raises(TypeError, check_target_type, AdaBoostClassifier(), + np.ones((100, 1))) + + # Binary sampler case + + # continuous case + y = np.linspace(0, 1, 5000) + sm = SMOTE(random_state=RND_SEED) + assert_warns(UserWarning, sm.fit, X, y) + + # multiclass case + y = np.array([0] * 2000 + [1] * 2000 + [2] * 1000) + sm = SMOTE(random_state=RND_SEED) + assert_warns(UserWarning, sm.fit, X, y) + + # Multiclass sampler case + + # continuous case + y = np.linspace(0, 1, 5000) + rus = RandomUnderSampler(random_state=RND_SEED) + assert_warns(UserWarning, rus.fit, X, y) + + # Make y to be multiclass + y = Y.copy() + y[0:1000] = 2 + + # Resample the data + rus = RandomUnderSampler(random_state=RND_SEED) + X_resampled, y_resampled = rus.fit_sample(X, y) + + # Check the size of y + count_y_res = Counter(y_resampled) + assert_equal(count_y_res[0], 400) + assert_equal(count_y_res[1], 400) + assert_equal(count_y_res[2], 400) diff --git a/imblearn/utils/validation.py b/imblearn/utils/validation.py new file mode 100644 index 000000000..5a3a638e2 --- /dev/null +++ b/imblearn/utils/validation.py @@ -0,0 +1,43 @@ +"""Utilities for input validation""" + +import warnings + +from sklearn.utils.multiclass import type_of_target + +def check_target_type(estimator, y): + """Check that the estimators handle the target type provided. + + Checks which type of target is provided and if the estimator can handle + such type. + + Parameters + ---------- + estimator : estimator instance. + Estimator instance for which the check is performed. + + y : ndarray, shape (n_samples, ) + Target vector which need to be checked. + + Returns + ------- + None + + """ + + if not hasattr(estimator, 'fit'): + raise TypeError("%s is not an estimator instance." % (estimator)) + + if not estimator._estimator_type == 'sampler': + raise TypeError("%s is not a sampler instance." 
% (estimator)) + + # In the case that the estimator should handle multiclass + if estimator.get_properties()['handles_multiclass']: + if not (type_of_target(y) == 'binary' or + type_of_target(y) == 'multiclass'): + warnings.warn('The target type should be binary or multiclass.') + # In the case that the estimator is only handling binary class + else: + if not type_of_target(y) == 'binary': + warnings.warn('The target type should be binary.') + + return None From 60d8dbcaafb69712716938f1858917db5bb9210a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 18:55:32 +0200 Subject: [PATCH 21/24] Update the doc --- doc/api.rst | 19 +++++++++++++++++++ doc/whats_new.rst | 17 +++++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index db705ebf6..073738aaa 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -117,3 +117,22 @@ Functions :toctree: generated/ pipeline.make_pipeline + + +.. _utils_ref: + +Utility +======= + +.. automodule:: imblearn.utils + :no-members: + :no-inherited-members: + +.. currentmodule:: imblearn + +Functions +--------- +.. autosummary:: + :toctree: generated/ + + utils.check_target_type diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 0b87638eb..35b549ac3 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -12,15 +12,28 @@ Version 0.2 Changelog --------- -- Added support for bumpversion. -- Added doctest in the documentation. +New features +~~~~~~~~~~~~ + - Added AllKNN under sampling technique. +API changes summary +~~~~~~~~~~~~~~~~~~~ + +- A module :mod:`utils` has been created. It provides input checking functions. +- The :class:`base.SamplerMixin` class has a new function `get_properties` in order to know the properties of each classifier. `_estimator_prop` is a dictionary is returned by this function. + Enhancement ~~~~~~~~~~~ +- Added support for bumpversion. - Validate the type of target in binary samplers. A warning is raised for the moment. +Documentation changes +~~~~~~~~~~~~~~~~~~~~~ + +- Added doctest in the documentation. + .. 
_changes_0_1: Version 0.1 From 5fdc123b78f8a8183f5ac0933b547b74a3a9ee40 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 20:20:32 +0200 Subject: [PATCH 22/24] Replace tags approach with base classes --- doc/api.rst | 18 --- doc/whats_new.rst | 3 +- imblearn/__init__.py | 3 - imblearn/base.py | 125 ++++++++++++++++-- imblearn/combine/smote_enn.py | 6 +- imblearn/combine/smote_tomek.py | 6 +- imblearn/ensemble/balance_cascade.py | 6 +- imblearn/ensemble/easy_ensemble.py | 6 +- imblearn/over_sampling/adasyn.py | 6 +- imblearn/over_sampling/random_over_sampler.py | 6 +- imblearn/over_sampling/smote.py | 6 +- imblearn/setup.py | 2 - imblearn/under_sampling/cluster_centroids.py | 6 +- .../condensed_nearest_neighbour.py | 6 +- .../edited_nearest_neighbours.py | 14 +- .../instance_hardness_threshold.py | 6 +- imblearn/under_sampling/nearmiss.py | 6 +- .../neighbourhood_cleaning_rule.py | 6 +- .../under_sampling/one_sided_selection.py | 6 +- .../under_sampling/random_under_sampler.py | 6 +- .../tests/test_one_sided_selection.py | 25 +--- imblearn/under_sampling/tomek_links.py | 6 +- imblearn/utils/__init__.py | 7 - imblearn/utils/tests/test_validation.py | 67 ---------- imblearn/utils/validation.py | 43 ------ 25 files changed, 157 insertions(+), 240 deletions(-) delete mode 100644 imblearn/utils/__init__.py delete mode 100644 imblearn/utils/tests/test_validation.py delete mode 100644 imblearn/utils/validation.py diff --git a/doc/api.rst b/doc/api.rst index 073738aaa..bd41a6d4c 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -118,21 +118,3 @@ Functions pipeline.make_pipeline - -.. _utils_ref: - -Utility -======= - -.. automodule:: imblearn.utils - :no-members: - :no-inherited-members: - -.. currentmodule:: imblearn - -Functions ---------- -.. autosummary:: - :toctree: generated/ - - utils.check_target_type diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 35b549ac3..c37c88964 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -20,8 +20,7 @@ New features API changes summary ~~~~~~~~~~~~~~~~~~~ -- A module :mod:`utils` has been created. It provides input checking functions. -- The :class:`base.SamplerMixin` class has a new function `get_properties` in order to know the properties of each classifier. `_estimator_prop` is a dictionary is returned by this function. +- Two base classes :class:`BaseBinaryclassSampler` and :class:`BaseMulticlassSampler` have been created to handle the target type and raise warning in case of abnormality. Enhancement ~~~~~~~~~~~ diff --git a/imblearn/__init__.py b/imblearn/__init__.py index abb8690ae..3a326fd15 100644 --- a/imblearn/__init__.py +++ b/imblearn/__init__.py @@ -14,8 +14,6 @@ Module which provides methods to under-sample a dataset. under-sampling Module which provides methods to over-sample a dataset. -utils - Module which provides various utilities. pipeline Module which allowing to create pipeline with scikit-learn estimators. """ @@ -36,6 +34,5 @@ 'ensemble', 'over_sampling', 'under_sampling', - 'utils', 'pipeline', '__version__'] diff --git a/imblearn/base.py b/imblearn/base.py index 634706760..46d435234 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -14,12 +14,11 @@ from sklearn.base import BaseEstimator from sklearn.utils import check_X_y +from sklearn.utils.multiclass import type_of_target from sklearn.externals import six from six import string_types -from .utils import check_target_type - class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)): """Mixin class for samplers with abstact method. 
@@ -71,9 +70,6 @@ def fit(self, X, y): # Check the consistency of X and y X, y = check_X_y(X, y) - # Check the target type consistency - check_target_type(self, y) - self.min_c_ = None self.maj_c_ = None self.stats_c_ = {} @@ -231,8 +227,119 @@ def __setstate__(self, dict): self.__dict__.update(dict) self.logger = logger - @classmethod - def get_properties(cls): - """Get the properties for this estimator.""" - return cls._estimator_prop +class BaseBinaryclassSampler(six.with_metaclass(ABCMeta, SamplerMixin)): + """Base class for all binary class sampler. + + Warning: This class should not be used directly. Use derived classes + instead. + + """ + + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + + super(BaseBinaryclassSampler, self).fit(X, y) + + # Check that the target type is binary + if not type_of_target(y) == 'binary': + warnings.warn('The target type should be binary.') + + return self + + + @abstractmethod + def _sample(self, X, y): + """Resample the dataset. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + X_resampled : ndarray, shape (n_samples_new, n_features) + The array containing the resampled data. + + y_resampled : ndarray, shape (n_samples_new) + The corresponding label of `X_resampled` + """ + pass + + +class BaseMulticlassSampler(six.with_metaclass(ABCMeta, SamplerMixin)): + """Base class for all multiclass sampler. + + Warning: This class should not be used directly. Use derived classes + instead. + + """ + + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + + super(BaseMulticlassSampler, self).fit(X, y) + + # Check that the target type is either binary or multiclass + if not (type_of_target(y) == 'binary' or + type_of_target(y) == 'multiclass'): + warnings.warn('The target type should be binary or multiclass.') + + return self + + + @abstractmethod + def _sample(self, X, y): + """Resample the dataset. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + X_resampled : ndarray, shape (n_samples_new, n_features) + The array containing the resampled data. 
+ + y_resampled : ndarray, shape (n_samples_new) + The corresponding label of `X_resampled` + """ + pass diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py index 8eefc193d..a72329efa 100644 --- a/imblearn/combine/smote_enn.py +++ b/imblearn/combine/smote_enn.py @@ -4,10 +4,10 @@ from ..over_sampling import SMOTE from ..under_sampling import EditedNearestNeighbours -from ..base import SamplerMixin +from ..base import BaseBinaryclassSampler -class SMOTEENN(SamplerMixin): +class SMOTEENN(BaseBinaryclassSampler): """Class to perform over-sampling using SMOTE and cleaning using ENN. Combine over- and under-sampling using SMOTE and Edited Nearest Neighbours. @@ -102,8 +102,6 @@ class SMOTEENN(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': False} - def __init__(self, ratio='auto', random_state=None, k=5, m=10, out_step=0.5, kind_smote='regular', size_ngh=3, kind_enn='all', n_jobs=-1, **kwargs): diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py index 5b13dfd91..d1061bc3f 100644 --- a/imblearn/combine/smote_tomek.py +++ b/imblearn/combine/smote_tomek.py @@ -5,10 +5,10 @@ from ..over_sampling import SMOTE from ..under_sampling import TomekLinks -from ..base import SamplerMixin +from ..base import BaseBinaryclassSampler -class SMOTETomek(SamplerMixin): +class SMOTETomek(BaseBinaryclassSampler): """Class to perform over-sampling using SMOTE and cleaning using Tomek links. @@ -103,8 +103,6 @@ class SMOTETomek(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': False} - def __init__(self, ratio='auto', random_state=None, k=5, m=10, out_step=0.5, kind_smote='regular', n_jobs=-1, **kwargs): diff --git a/imblearn/ensemble/balance_cascade.py b/imblearn/ensemble/balance_cascade.py index 89c3e0ad6..7770bae46 100644 --- a/imblearn/ensemble/balance_cascade.py +++ b/imblearn/ensemble/balance_cascade.py @@ -5,14 +5,14 @@ from sklearn.utils import check_random_state -from ..base import SamplerMixin +from ..base import BaseBinaryclassSampler ESTIMATOR_KIND = ('knn', 'decision-tree', 'random-forest', 'adaboost', 'gradient-boosting', 'linear-svm') -class BalanceCascade(SamplerMixin): +class BalanceCascade(BaseBinaryclassSampler): """Create an ensemble of balanced sets by iteratively under-sampling the imbalanced dataset using an estimator. @@ -101,8 +101,6 @@ class BalanceCascade(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': False} - def __init__(self, ratio='auto', return_indices=False, random_state=None, n_max_subset=None, classifier='knn', bootstrap=True, **kwargs): diff --git a/imblearn/ensemble/easy_ensemble.py b/imblearn/ensemble/easy_ensemble.py index a24e34ad5..dddd8aab9 100644 --- a/imblearn/ensemble/easy_ensemble.py +++ b/imblearn/ensemble/easy_ensemble.py @@ -3,11 +3,11 @@ import numpy as np -from ..base import SamplerMixin +from ..base import BaseMulticlassSampler from ..under_sampling import RandomUnderSampler -class EasyEnsemble(SamplerMixin): +class EasyEnsemble(BaseMulticlassSampler): """Create an ensemble sets by iteratively applying random under-sampling. 
This method iteratively select a random subset and make an ensemble of the @@ -84,8 +84,6 @@ class EasyEnsemble(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': True} - def __init__(self, ratio='auto', return_indices=False, random_state=None, replacement=False, n_subsets=10): super(EasyEnsemble, self).__init__(ratio=ratio) diff --git a/imblearn/over_sampling/adasyn.py b/imblearn/over_sampling/adasyn.py index 9326b3989..24ced028c 100644 --- a/imblearn/over_sampling/adasyn.py +++ b/imblearn/over_sampling/adasyn.py @@ -9,10 +9,10 @@ from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state -from ..base import SamplerMixin +from ..base import BaseBinaryclassSampler -class ADASYN(SamplerMixin): +class ADASYN(BaseBinaryclassSampler): """Perform over-sampling using ADASYN. @@ -86,8 +86,6 @@ class ADASYN(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': False} - def __init__(self, ratio='auto', random_state=None, diff --git a/imblearn/over_sampling/random_over_sampler.py b/imblearn/over_sampling/random_over_sampler.py index 81547d73c..e40490495 100644 --- a/imblearn/over_sampling/random_over_sampler.py +++ b/imblearn/over_sampling/random_over_sampler.py @@ -8,10 +8,10 @@ from sklearn.utils import check_random_state -from ..base import SamplerMixin +from ..base import BaseMulticlassSampler -class RandomOverSampler(SamplerMixin): +class RandomOverSampler(BaseMulticlassSampler): """Class to perform random over-sampling. @@ -70,8 +70,6 @@ class RandomOverSampler(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': True} - def __init__(self, ratio='auto', random_state=None): diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index b34454336..bbfaa6940 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -12,13 +12,13 @@ from sklearn.neighbors import NearestNeighbors from sklearn.svm import SVC -from ..base import SamplerMixin +from ..base import BaseBinaryclassSampler SMOTE_KIND = ('regular', 'borderline1', 'borderline2', 'svm') -class SMOTE(SamplerMixin): +class SMOTE(BaseBinaryclassSampler): """Class to perform over-sampling using SMOTE. @@ -109,8 +109,6 @@ class SMOTE(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': False} - def __init__(self, ratio='auto', random_state=None, diff --git a/imblearn/setup.py b/imblearn/setup.py index 9ae8d0771..70f919303 100644 --- a/imblearn/setup.py +++ b/imblearn/setup.py @@ -11,8 +11,6 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('over_sampling/tests') config.add_subpackage('under_sampling') config.add_subpackage('under_sampling/tests') - config.add_subpackage('utils') - config.add_subpackage('utils/tests') config.add_subpackage('tests') diff --git a/imblearn/under_sampling/cluster_centroids.py b/imblearn/under_sampling/cluster_centroids.py index 3757883a3..da973c925 100644 --- a/imblearn/under_sampling/cluster_centroids.py +++ b/imblearn/under_sampling/cluster_centroids.py @@ -10,10 +10,10 @@ from sklearn.cluster import KMeans from sklearn.utils import check_random_state -from ..base import SamplerMixin +from ..base import BaseMulticlassSampler -class ClusterCentroids(SamplerMixin): +class ClusterCentroids(BaseMulticlassSampler): """Perform under-sampling by generating centroids based on clustering methods. 
@@ -81,8 +81,6 @@ class ClusterCentroids(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': True} - def __init__(self, ratio='auto', random_state=None, n_jobs=-1, **kwargs): super(ClusterCentroids, self).__init__(ratio=ratio) self.random_state = random_state diff --git a/imblearn/under_sampling/condensed_nearest_neighbour.py b/imblearn/under_sampling/condensed_nearest_neighbour.py index 1a8b43b2a..a4d83ee67 100644 --- a/imblearn/under_sampling/condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/condensed_nearest_neighbour.py @@ -10,10 +10,10 @@ from sklearn.utils import check_random_state from sklearn.neighbors import KNeighborsClassifier -from ..base import SamplerMixin +from ..base import BaseMulticlassSampler -class CondensedNearestNeighbour(SamplerMixin): +class CondensedNearestNeighbour(BaseMulticlassSampler): """Class to perform under-sampling based on the condensed nearest neighbour method. @@ -87,8 +87,6 @@ class CondensedNearestNeighbour(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': True} - def __init__(self, return_indices=False, random_state=None, size_ngh=1, n_seeds_S=1, n_jobs=-1, **kwargs): super(CondensedNearestNeighbour, self).__init__() diff --git a/imblearn/under_sampling/edited_nearest_neighbours.py b/imblearn/under_sampling/edited_nearest_neighbours.py index fe16c4d3b..89179d350 100644 --- a/imblearn/under_sampling/edited_nearest_neighbours.py +++ b/imblearn/under_sampling/edited_nearest_neighbours.py @@ -11,13 +11,13 @@ from sklearn.neighbors import NearestNeighbors -from ..base import SamplerMixin +from ..base import BaseMulticlassSampler SEL_KIND = ('all', 'mode') -class EditedNearestNeighbours(SamplerMixin): +class EditedNearestNeighbours(BaseMulticlassSampler): """Class to perform under-sampling based on the edited nearest neighbour method. @@ -94,8 +94,6 @@ class EditedNearestNeighbours(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': True} - def __init__(self, return_indices=False, random_state=None, size_ngh=3, kind_sel='all', n_jobs=-1): super(EditedNearestNeighbours, self).__init__() @@ -203,7 +201,7 @@ def _sample(self, X, y): return X_resampled, y_resampled -class RepeatedEditedNearestNeighbours(SamplerMixin): +class RepeatedEditedNearestNeighbours(BaseMulticlassSampler): """Class to perform under-sampling based on the repeated edited nearest neighbour method. @@ -284,8 +282,6 @@ class RepeatedEditedNearestNeighbours(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': True} - def __init__(self, return_indices=False, random_state=None, size_ngh=3, max_iter=100, kind_sel='all', n_jobs=-1): super(RepeatedEditedNearestNeighbours, self).__init__() @@ -388,7 +384,7 @@ def _sample(self, X, y): return X_resampled, y_resampled -class AllKNN(SamplerMixin): +class AllKNN(BaseMulticlassSampler): """Class to perform under-sampling based on the AllKNN method. 
Parameters @@ -464,8 +460,6 @@ class AllKNN(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': True} - def __init__(self, return_indices=False, random_state=None, size_ngh=3, kind_sel='all', n_jobs=-1): super(AllKNN, self).__init__() diff --git a/imblearn/under_sampling/instance_hardness_threshold.py b/imblearn/under_sampling/instance_hardness_threshold.py index 9bb4bda43..416d088fc 100644 --- a/imblearn/under_sampling/instance_hardness_threshold.py +++ b/imblearn/under_sampling/instance_hardness_threshold.py @@ -9,14 +9,14 @@ from sklearn.cross_validation import StratifiedKFold -from ..base import SamplerMixin +from ..base import BaseBinaryclassSampler ESTIMATOR_KIND = ('knn', 'decision-tree', 'random-forest', 'adaboost', 'gradient-boosting', 'linear-svm') -class InstanceHardnessThreshold(SamplerMixin): +class InstanceHardnessThreshold(BaseBinaryclassSampler): """Class to perform under-sampling based on the instance hardness threshold. @@ -99,8 +99,6 @@ class InstanceHardnessThreshold(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': False} - def __init__(self, estimator='linear-svm', ratio='auto', return_indices=False, random_state=None, cv=5, n_jobs=-1, **kwargs): diff --git a/imblearn/under_sampling/nearmiss.py b/imblearn/under_sampling/nearmiss.py index a7618554f..0cadc2185 100644 --- a/imblearn/under_sampling/nearmiss.py +++ b/imblearn/under_sampling/nearmiss.py @@ -8,10 +8,10 @@ from sklearn.neighbors import NearestNeighbors -from ..base import SamplerMixin +from ..base import BaseMulticlassSampler -class NearMiss(SamplerMixin): +class NearMiss(BaseMulticlassSampler): """Class to perform under-sampling based on NearMiss methods. Parameters @@ -97,8 +97,6 @@ class NearMiss(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': True} - def __init__(self, ratio='auto', return_indices=False, random_state=None, version=1, size_ngh=3, ver3_samp_ngh=3, n_jobs=-1, **kwargs): super(NearMiss, self).__init__(ratio=ratio) diff --git a/imblearn/under_sampling/neighbourhood_cleaning_rule.py b/imblearn/under_sampling/neighbourhood_cleaning_rule.py index 0b45a1318..e80678e89 100644 --- a/imblearn/under_sampling/neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/neighbourhood_cleaning_rule.py @@ -8,10 +8,10 @@ from sklearn.neighbors import NearestNeighbors -from ..base import SamplerMixin +from ..base import BaseMulticlassSampler -class NeighbourhoodCleaningRule(SamplerMixin): +class NeighbourhoodCleaningRule(BaseMulticlassSampler): """Class performing under-sampling based on the neighbourhood cleaning rule. @@ -80,8 +80,6 @@ class NeighbourhoodCleaningRule(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': True} - def __init__(self, return_indices=False, random_state=None, size_ngh=3, n_jobs=-1): super(NeighbourhoodCleaningRule, self).__init__() diff --git a/imblearn/under_sampling/one_sided_selection.py b/imblearn/under_sampling/one_sided_selection.py index 2ceabe97f..bdca42130 100644 --- a/imblearn/under_sampling/one_sided_selection.py +++ b/imblearn/under_sampling/one_sided_selection.py @@ -10,11 +10,11 @@ from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state -from ..base import SamplerMixin +from ..base import BaseBinaryclassSampler from .tomek_links import TomekLinks -class OneSidedSelection(SamplerMixin): +class OneSidedSelection(BaseBinaryclassSampler): """Class to perform under-sampling based on one-sided selection method. 
Parameters @@ -87,8 +87,6 @@ class OneSidedSelection(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': True} - def __init__(self, return_indices=False, random_state=None, size_ngh=1, n_seeds_S=1, n_jobs=-1, **kwargs): super(OneSidedSelection, self).__init__() diff --git a/imblearn/under_sampling/random_under_sampler.py b/imblearn/under_sampling/random_under_sampler.py index a0a23915a..09ef5fca3 100644 --- a/imblearn/under_sampling/random_under_sampler.py +++ b/imblearn/under_sampling/random_under_sampler.py @@ -8,10 +8,10 @@ from sklearn.utils import check_random_state -from ..base import SamplerMixin +from ..base import BaseMulticlassSampler -class RandomUnderSampler(SamplerMixin): +class RandomUnderSampler(BaseMulticlassSampler): """Class to perform random under-sampling. Under-sample the majority class(es) by randomly picking samples @@ -75,8 +75,6 @@ class RandomUnderSampler(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': True} - def __init__(self, ratio='auto', return_indices=False, random_state=None, replacement=True): super(RandomUnderSampler, self).__init__(ratio=ratio) diff --git a/imblearn/under_sampling/tests/test_one_sided_selection.py b/imblearn/under_sampling/tests/test_one_sided_selection.py index 32b363c87..288d157be 100644 --- a/imblearn/under_sampling/tests/test_one_sided_selection.py +++ b/imblearn/under_sampling/tests/test_one_sided_selection.py @@ -117,29 +117,16 @@ def test_oss_sample_wrong_X(): np.array([0] * 50 + [1] * 50)) -def test_continuous_error(): - """Test either if an error is raised when the target are continuous - type""" +def test_multiclass_error(): + """ Test either if an error is raised when the target are not binary + type. """ # continuous case y = np.linspace(0, 1, 5000) oss = OneSidedSelection(random_state=RND_SEED) assert_warns(UserWarning, oss.fit, X, y) - -def test_multiclass_fit_sample(): - """Test fit sample method with multiclass target""" - - # Make y to be multiclass - y = Y.copy() - y[0:1000] = 2 - - # Resample the data + # multiclass case + y = np.array([0] * 2000 + [1] * 2000 + [2] * 1000) oss = OneSidedSelection(random_state=RND_SEED) - X_resampled, y_resampled = oss.fit_sample(X, y) - - # Check the size of y - count_y_res = Counter(y_resampled) - assert_equal(count_y_res[0], 400) - assert_equal(count_y_res[1], 2410) - assert_equal(count_y_res[2], 715) + assert_warns(UserWarning, oss.fit, X, y) diff --git a/imblearn/under_sampling/tomek_links.py b/imblearn/under_sampling/tomek_links.py index a07b44ce6..fda02c641 100644 --- a/imblearn/under_sampling/tomek_links.py +++ b/imblearn/under_sampling/tomek_links.py @@ -8,10 +8,10 @@ from sklearn.neighbors import NearestNeighbors -from ..base import SamplerMixin +from ..base import BaseBinaryclassSampler -class TomekLinks(SamplerMixin): +class TomekLinks(BaseBinaryclassSampler): """Class to perform under-sampling by removing Tomek's links. Parameters @@ -74,8 +74,6 @@ class TomekLinks(SamplerMixin): """ - _estimator_prop = {'handles_multiclass': False} - def __init__(self, return_indices=False, random_state=None, n_jobs=-1): super(TomekLinks, self).__init__() diff --git a/imblearn/utils/__init__.py b/imblearn/utils/__init__.py deleted file mode 100644 index f47c5235c..000000000 --- a/imblearn/utils/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -""" -The :mod:`imblearn.utils` module includes various utilities. 
-""" - -from .validation import check_target_type - -__all__ = ['check_target_type'] diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py deleted file mode 100644 index 07f6afe08..000000000 --- a/imblearn/utils/tests/test_validation.py +++ /dev/null @@ -1,67 +0,0 @@ -"""Tests for input validation functions""" - -from collections import Counter - -import numpy as np -from numpy.testing import assert_raises -from numpy.testing import assert_warns -from numpy.testing import assert_equal - -from sklearn.datasets import make_classification -from sklearn.ensemble import AdaBoostClassifier - -from imblearn.under_sampling import RandomUnderSampler -from imblearn.over_sampling import SMOTE -from imblearn.utils import check_target_type - -# Generate a global dataset to use -RND_SEED = 0 -X, Y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], - n_informative=3, n_redundant=1, flip_y=0, - n_features=20, n_clusters_per_class=1, - n_samples=5000, random_state=RND_SEED) - - -def test_check_target_type(): - """Test to check the target type function""" - - # Check that an error is raised when non estimator are passed - assert_raises(TypeError, check_target_type, 'Something', np.ones((100, 1))) - - # Check that an error is raised when an estimator is passed but not a - # sampler - assert_raises(TypeError, check_target_type, AdaBoostClassifier(), - np.ones((100, 1))) - - # Binary sampler case - - # continuous case - y = np.linspace(0, 1, 5000) - sm = SMOTE(random_state=RND_SEED) - assert_warns(UserWarning, sm.fit, X, y) - - # multiclass case - y = np.array([0] * 2000 + [1] * 2000 + [2] * 1000) - sm = SMOTE(random_state=RND_SEED) - assert_warns(UserWarning, sm.fit, X, y) - - # Multiclass sampler case - - # continuous case - y = np.linspace(0, 1, 5000) - rus = RandomUnderSampler(random_state=RND_SEED) - assert_warns(UserWarning, rus.fit, X, y) - - # Make y to be multiclass - y = Y.copy() - y[0:1000] = 2 - - # Resample the data - rus = RandomUnderSampler(random_state=RND_SEED) - X_resampled, y_resampled = rus.fit_sample(X, y) - - # Check the size of y - count_y_res = Counter(y_resampled) - assert_equal(count_y_res[0], 400) - assert_equal(count_y_res[1], 400) - assert_equal(count_y_res[2], 400) diff --git a/imblearn/utils/validation.py b/imblearn/utils/validation.py deleted file mode 100644 index 5a3a638e2..000000000 --- a/imblearn/utils/validation.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Utilities for input validation""" - -import warnings - -from sklearn.utils.multiclass import type_of_target - -def check_target_type(estimator, y): - """Check that the estimators handle the target type provided. - - Checks which type of target is provided and if the estimator can handle - such type. - - Parameters - ---------- - estimator : estimator instance. - Estimator instance for which the check is performed. - - y : ndarray, shape (n_samples, ) - Target vector which need to be checked. - - Returns - ------- - None - - """ - - if not hasattr(estimator, 'fit'): - raise TypeError("%s is not an estimator instance." % (estimator)) - - if not estimator._estimator_type == 'sampler': - raise TypeError("%s is not a sampler instance." 
% (estimator)) - - # In the case that the estimator should handle multiclass - if estimator.get_properties()['handles_multiclass']: - if not (type_of_target(y) == 'binary' or - type_of_target(y) == 'multiclass'): - warnings.warn('The target type should be binary or multiclass.') - # In the case that the estimator is only handling binary class - else: - if not type_of_target(y) == 'binary': - warnings.warn('The target type should be binary.') - - return None From 012da67ea75e9f9e9730349c010033ce7450b6d4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 20:23:17 +0200 Subject: [PATCH 23/24] PEP8 --- imblearn/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 46d435234..4a5578b75 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -19,6 +19,7 @@ from six import string_types + class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)): """Mixin class for samplers with abstact method. @@ -262,7 +263,6 @@ def fit(self, X, y): return self - @abstractmethod def _sample(self, X, y): """Resample the dataset. @@ -321,7 +321,6 @@ def fit(self, X, y): return self - @abstractmethod def _sample(self, X, y): """Resample the dataset. From e9d8f1a5d691ce48e73cfbf7bf749fc915043000 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Jul 2016 23:27:04 +0200 Subject: [PATCH 24/24] Fix the name of the base class --- imblearn/base.py | 48 +------------------ imblearn/combine/smote_enn.py | 4 +- imblearn/combine/smote_tomek.py | 4 +- imblearn/ensemble/balance_cascade.py | 4 +- imblearn/over_sampling/adasyn.py | 4 +- imblearn/over_sampling/smote.py | 4 +- .../instance_hardness_threshold.py | 4 +- .../under_sampling/one_sided_selection.py | 4 +- imblearn/under_sampling/tomek_links.py | 4 +- 9 files changed, 18 insertions(+), 62 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 4a5578b75..4ec8fe11d 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -229,7 +229,7 @@ def __setstate__(self, dict): self.logger = logger -class BaseBinaryclassSampler(six.with_metaclass(ABCMeta, SamplerMixin)): +class BaseBinarySampler(six.with_metaclass(ABCMeta, SamplerMixin)): """Base class for all binary class sampler. Warning: This class should not be used directly. Use derived classes @@ -255,7 +255,7 @@ def fit(self, X, y): """ - super(BaseBinaryclassSampler, self).fit(X, y) + super(BaseBinarySampler, self).fit(X, y) # Check that the target type is binary if not type_of_target(y) == 'binary': @@ -263,28 +263,6 @@ def fit(self, X, y): return self - @abstractmethod - def _sample(self, X, y): - """Resample the dataset. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - X_resampled : ndarray, shape (n_samples_new, n_features) - The array containing the resampled data. - - y_resampled : ndarray, shape (n_samples_new) - The corresponding label of `X_resampled` - """ - pass - class BaseMulticlassSampler(six.with_metaclass(ABCMeta, SamplerMixin)): """Base class for all multiclass sampler. @@ -320,25 +298,3 @@ def fit(self, X, y): warnings.warn('The target type should be binary or multiclass.') return self - - @abstractmethod - def _sample(self, X, y): - """Resample the dataset. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. 
- - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - X_resampled : ndarray, shape (n_samples_new, n_features) - The array containing the resampled data. - - y_resampled : ndarray, shape (n_samples_new) - The corresponding label of `X_resampled` - """ - pass diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py index a72329efa..baa2ceed9 100644 --- a/imblearn/combine/smote_enn.py +++ b/imblearn/combine/smote_enn.py @@ -4,10 +4,10 @@ from ..over_sampling import SMOTE from ..under_sampling import EditedNearestNeighbours -from ..base import BaseBinaryclassSampler +from ..base import BaseBinarySampler -class SMOTEENN(BaseBinaryclassSampler): +class SMOTEENN(BaseBinarySampler): """Class to perform over-sampling using SMOTE and cleaning using ENN. Combine over- and under-sampling using SMOTE and Edited Nearest Neighbours. diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py index d1061bc3f..96615d3bf 100644 --- a/imblearn/combine/smote_tomek.py +++ b/imblearn/combine/smote_tomek.py @@ -5,10 +5,10 @@ from ..over_sampling import SMOTE from ..under_sampling import TomekLinks -from ..base import BaseBinaryclassSampler +from ..base import BaseBinarySampler -class SMOTETomek(BaseBinaryclassSampler): +class SMOTETomek(BaseBinarySampler): """Class to perform over-sampling using SMOTE and cleaning using Tomek links. diff --git a/imblearn/ensemble/balance_cascade.py b/imblearn/ensemble/balance_cascade.py index 7770bae46..b0b1a0b62 100644 --- a/imblearn/ensemble/balance_cascade.py +++ b/imblearn/ensemble/balance_cascade.py @@ -5,14 +5,14 @@ from sklearn.utils import check_random_state -from ..base import BaseBinaryclassSampler +from ..base import BaseBinarySampler ESTIMATOR_KIND = ('knn', 'decision-tree', 'random-forest', 'adaboost', 'gradient-boosting', 'linear-svm') -class BalanceCascade(BaseBinaryclassSampler): +class BalanceCascade(BaseBinarySampler): """Create an ensemble of balanced sets by iteratively under-sampling the imbalanced dataset using an estimator. diff --git a/imblearn/over_sampling/adasyn.py b/imblearn/over_sampling/adasyn.py index 24ced028c..d55c22e7e 100644 --- a/imblearn/over_sampling/adasyn.py +++ b/imblearn/over_sampling/adasyn.py @@ -9,10 +9,10 @@ from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state -from ..base import BaseBinaryclassSampler +from ..base import BaseBinarySampler -class ADASYN(BaseBinaryclassSampler): +class ADASYN(BaseBinarySampler): """Perform over-sampling using ADASYN. diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index bbfaa6940..7b0736d77 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -12,13 +12,13 @@ from sklearn.neighbors import NearestNeighbors from sklearn.svm import SVC -from ..base import BaseBinaryclassSampler +from ..base import BaseBinarySampler SMOTE_KIND = ('regular', 'borderline1', 'borderline2', 'svm') -class SMOTE(BaseBinaryclassSampler): +class SMOTE(BaseBinarySampler): """Class to perform over-sampling using SMOTE. 
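For context, the binary-target check that the renamed BaseBinarySampler.fit performs (see the base.py hunk above) can be sketched standalone as follows. This is not part of the patches: the helper name check_binary_target and the toy target arrays are illustrative assumptions; only the warning message and the use of sklearn's type_of_target come from the diffs.

import warnings

import numpy as np
from sklearn.utils.multiclass import type_of_target


def check_binary_target(y):
    # Mirror the check in BaseBinarySampler.fit: a non-binary target only
    # triggers a UserWarning for the moment, it does not raise an error.
    if type_of_target(y) != 'binary':
        warnings.warn('The target type should be binary.')


# Binary target: no warning is expected.
check_binary_target(np.array([0] * 10 + [1] * 5))

# Continuous and multiclass targets: a UserWarning is emitted, which is what
# the updated tests assert via assert_warns(UserWarning, sampler.fit, X, y).
check_binary_target(np.linspace(0, 1, 5000))
check_binary_target(np.array([0] * 10 + [1] * 5 + [2] * 5))

The multiclass base class follows the same pattern but accepts both 'binary' and 'multiclass' targets, warning only on anything else (for example a continuous target).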
diff --git a/imblearn/under_sampling/instance_hardness_threshold.py b/imblearn/under_sampling/instance_hardness_threshold.py index 416d088fc..38e33666a 100644 --- a/imblearn/under_sampling/instance_hardness_threshold.py +++ b/imblearn/under_sampling/instance_hardness_threshold.py @@ -9,14 +9,14 @@ from sklearn.cross_validation import StratifiedKFold -from ..base import BaseBinaryclassSampler +from ..base import BaseBinarySampler ESTIMATOR_KIND = ('knn', 'decision-tree', 'random-forest', 'adaboost', 'gradient-boosting', 'linear-svm') -class InstanceHardnessThreshold(BaseBinaryclassSampler): +class InstanceHardnessThreshold(BaseBinarySampler): """Class to perform under-sampling based on the instance hardness threshold. diff --git a/imblearn/under_sampling/one_sided_selection.py b/imblearn/under_sampling/one_sided_selection.py index bdca42130..ddb52ebb1 100644 --- a/imblearn/under_sampling/one_sided_selection.py +++ b/imblearn/under_sampling/one_sided_selection.py @@ -10,11 +10,11 @@ from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state -from ..base import BaseBinaryclassSampler +from ..base import BaseBinarySampler from .tomek_links import TomekLinks -class OneSidedSelection(BaseBinaryclassSampler): +class OneSidedSelection(BaseBinarySampler): """Class to perform under-sampling based on one-sided selection method. Parameters diff --git a/imblearn/under_sampling/tomek_links.py b/imblearn/under_sampling/tomek_links.py index fda02c641..86700c6ec 100644 --- a/imblearn/under_sampling/tomek_links.py +++ b/imblearn/under_sampling/tomek_links.py @@ -8,10 +8,10 @@ from sklearn.neighbors import NearestNeighbors -from ..base import BaseBinaryclassSampler +from ..base import BaseBinarySampler -class TomekLinks(BaseBinaryclassSampler): +class TomekLinks(BaseBinarySampler): """Class to perform under-sampling by removing Tomek's links. Parameters