diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index b52de3436..58f0c2d58 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -230,6 +230,9 @@ something specific for the categorical features. In fact, the categories of a new generated sample are decided by picking the most frequent category of the nearest neighbors present during the generation. +.. warning:: + Be aware that SMOTE-NC is not designed to work with only categorical data. + The other SMOTE variants and ADASYN differ from each other by selecting the samples :math:`x_i` ahead of generating the new samples. diff --git a/doc/whats_new/v0.7.rst b/doc/whats_new/v0.7.rst index 804c884d0..6018cc5c4 100644 --- a/doc/whats_new/v0.7.rst +++ b/doc/whats_new/v0.7.rst @@ -38,6 +38,10 @@ Bug fixes unusable. :pr:`710` by :user:`Guillaume Lemaitre <glemaitre>`. +- Raise a proper error message when only numerical or categorical features + are given in :class:`imblearn.over_sampling.SMOTENC`. + :pr:`720` by :user:`Guillaume Lemaitre <glemaitre>`. + Enhancements ............ diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index e9007a472..a141d4b04 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -20,7 +20,6 @@ from sklearn.utils import check_random_state from sklearn.utils import _safe_indexing from sklearn.utils import check_array -from sklearn.utils import check_X_y from sklearn.utils.sparsefuncs_fast import csr_mean_variance_axis0 from sklearn.utils.sparsefuncs_fast import csc_mean_variance_axis0 @@ -747,6 +746,7 @@ class SMOTENC(SMOTE): """Synthetic Minority Over-sampling Technique for Nominal and Continuous. Unlike :class:`SMOTE`, SMOTE-NC for dataset containing continuous and + categorical features. However, it is not designed to work with only categorical features. Read more in the :ref:`User Guide <smote_adasyn>`. @@ -893,7 +893,9 @@ def _check_X_y(self, X, y): features. 
""" y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None) + X, y = self._validate_data( + X, y, reset=True, dtype=None, accept_sparse=["csr", "csc"] + ) return X, y, binarize_y def _validate_estimator(self): @@ -917,6 +919,12 @@ def _validate_estimator(self): np.arange(self.n_features_), self.categorical_features_ ) + if self.categorical_features_.size == self.n_features_in_: + raise ValueError( + "SMOTE-NC is not designed to work only with categorical " + "features. It requires some numerical features." + ) + def _fit_resample(self, X, y): self.n_features_ = X.shape[1] self._validate_estimator() diff --git a/imblearn/over_sampling/tests/test_smote_nc.py b/imblearn/over_sampling/tests/test_smote_nc.py index c7ba80caa..4d8bddf50 100644 --- a/imblearn/over_sampling/tests/test_smote_nc.py +++ b/imblearn/over_sampling/tests/test_smote_nc.py @@ -204,3 +204,17 @@ def test_smotenc_preserve_dtype(): X_res, y_res = smote.fit_resample(X, y) assert X.dtype == X_res.dtype, "X dtype is not preserved" assert y.dtype == y_res.dtype, "y dtype is not preserved" + + +@pytest.mark.parametrize( + "categorical_features", [[True, True, True], [0, 1, 2]] +) +def test_smotenc_raising_error_all_categorical(categorical_features): + X, y = make_classification( + n_features=3, n_informative=1, n_redundant=1, n_repeated=0, + n_clusters_per_class=1, + ) + smote = SMOTENC(categorical_features=categorical_features) + err_msg = "SMOTE-NC is not designed to work only with categorical features" + with pytest.raises(ValueError, match=err_msg): + smote.fit_resample(X, y)