From 4961a585b36708e558912dddc941c1551e431f1b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 9 Jun 2020 09:44:48 +0200 Subject: [PATCH 1/3] FIX raise proper error message when only categorical passed to SMOTE-NC --- doc/over_sampling.rst | 3 +++ doc/whats_new/v0.7.rst | 4 ++++ imblearn/over_sampling/_smote.py | 11 ++++++++++- imblearn/over_sampling/tests/test_smote_nc.py | 14 ++++++++++++++ 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index b52de3436..58f0c2d58 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -230,6 +230,9 @@ something specific for the categorical features. In fact, the categories of a new generated sample are decided by picking the most frequent category of the nearest neighbors present during the generation. +.. warning:: + Be aware that SMOTE-NC is not designed to work with only categorical data. + The other SMOTE variants and ADASYN differ from each other by selecting the samples :math:`x_i` ahead of generating the new samples. diff --git a/doc/whats_new/v0.7.rst b/doc/whats_new/v0.7.rst index 804c884d0..fe114d48d 100644 --- a/doc/whats_new/v0.7.rst +++ b/doc/whats_new/v0.7.rst @@ -38,6 +38,10 @@ Bug fixes unusable. :pr:`710` by :user:`Guillaume Lemaitre `. +- Raise a proper error message when only numerical or categorical features + are given in :class:`imblearn.over_sampling.SMOTENC`. + :pr:`xxx` by :user:`Guillaume Lemaitre `. + Enhancements ............ diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index e9007a472..abe31c2fe 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -747,6 +747,7 @@ class SMOTENC(SMOTE): """Synthetic Minority Over-sampling Technique for Nominal and Continuous. Unlike :class:`SMOTE`, SMOTE-NC for dataset containing continuous and + categorical features. However, it is not designed to work with only categorical features. Read more in the :ref:`User Guide `. @@ -893,7 +894,9 @@ def _check_X_y(self, X, y): features. """ y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None) + X, y = self._validate_data( + X, y, reset=True, dtype=None, accept_sparse=["csr", "csc"] + ) return X, y, binarize_y def _validate_estimator(self): @@ -917,6 +920,12 @@ def _validate_estimator(self): np.arange(self.n_features_), self.categorical_features_ ) + if self.categorical_features_.size == self.n_features_in_: + raise ValueError( + "SMOTE-NC is not designed to work only with categorical " + "features. It requires some numerical features." + ) + def _fit_resample(self, X, y): self.n_features_ = X.shape[1] self._validate_estimator() diff --git a/imblearn/over_sampling/tests/test_smote_nc.py b/imblearn/over_sampling/tests/test_smote_nc.py index c7ba80caa..4d8bddf50 100644 --- a/imblearn/over_sampling/tests/test_smote_nc.py +++ b/imblearn/over_sampling/tests/test_smote_nc.py @@ -204,3 +204,17 @@ def test_smotenc_preserve_dtype(): X_res, y_res = smote.fit_resample(X, y) assert X.dtype == X_res.dtype, "X dtype is not preserved" assert y.dtype == y_res.dtype, "y dtype is not preserved" + + +@pytest.mark.parametrize( + "categorical_features", [[True, True, True], [0, 1, 2]] +) +def test_smotenc_raising_error_all_categorical(categorical_features): + X, y = make_classification( + n_features=3, n_informative=1, n_redundant=1, n_repeated=0, + n_clusters_per_class=1, + ) + smote = SMOTENC(categorical_features=categorical_features) + err_msg = "SMOTE-NC is not designed to work only with categorical features" + with pytest.raises(ValueError, match=err_msg): + smote.fit_resample(X, y) From ecc33d07bf4660af9b9ee3859eb3f1553593bd4a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 9 Jun 2020 09:49:46 +0200 Subject: [PATCH 2/3] update whats new --- doc/whats_new/v0.7.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.7.rst b/doc/whats_new/v0.7.rst index fe114d48d..6018cc5c4 100644 --- a/doc/whats_new/v0.7.rst +++ b/doc/whats_new/v0.7.rst @@ -40,7 +40,7 @@ Bug fixes - Raise a proper error message when only numerical or categorical features are given in :class:`imblearn.over_sampling.SMOTENC`. - :pr:`xxx` by :user:`Guillaume Lemaitre `. + :pr:`720` by :user:`Guillaume Lemaitre `. Enhancements ............ From f1355e29ba78d43160465d5556f9cd14d854fde2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 9 Jun 2020 10:21:56 +0200 Subject: [PATCH 3/3] remove unused import --- imblearn/over_sampling/_smote.py | 1 - 1 file changed, 1 deletion(-) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index abe31c2fe..a141d4b04 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -20,7 +20,6 @@ from sklearn.utils import check_random_state from sklearn.utils import _safe_indexing from sklearn.utils import check_array -from sklearn.utils import check_X_y from sklearn.utils.sparsefuncs_fast import csr_mean_variance_axis0 from sklearn.utils.sparsefuncs_fast import csc_mean_variance_axis0