scikit-learn-contrib · glemaitre · Jun 9, 2020 · Jun 9, 2020 · Jun 9, 2020 · Jun 9, 2020
diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst
@@ -230,6 +230,9 @@ something specific for the categorical features. In fact, the categories of a
 new generated sample are decided by picking the most frequent category of the
 nearest neighbors present during the generation.
 
+.. warning::
+   Be aware that SMOTE-NC is not designed to work with only categorical data.
+
 The other SMOTE variants and ADASYN differ from each other by selecting the
 samples :math:`x_i` ahead of generating the new samples.
 

diff --git a/doc/whats_new/v0.7.rst b/doc/whats_new/v0.7.rst
@@ -38,6 +38,10 @@ Bug fixes
   unusable.
   :pr:`710` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+- Raise a proper error message when only categorical features are given in
+  :class:`imblearn.over_sampling.SMOTENC`.
+  :pr:`720` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Enhancements
 ............
 

diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py
@@ -20,7 +20,6 @@
 from sklearn.utils import check_random_state
 from sklearn.utils import _safe_indexing
 from sklearn.utils import check_array
-from sklearn.utils import check_X_y
 from sklearn.utils.sparsefuncs_fast import csr_mean_variance_axis0
 from sklearn.utils.sparsefuncs_fast import csc_mean_variance_axis0
 
@@ -747,6 +746,7 @@ class SMOTENC(SMOTE):
     """Synthetic Minority Over-sampling Technique for Nominal and Continuous.
 
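-    Unlike :class:`SMOTE`, SMOTE-NC for dataset containing continuous and
-    categorical features.
+    Unlike :class:`SMOTE`, SMOTE-NC handles datasets containing both
+    continuous and categorical features. However, it is not designed to work
+    with only categorical features.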
 
     Read more in the :ref:`User Guide <smote_adasyn>`.
@@ -893,7 +893,9 @@ def _check_X_y(self, X, y):
         features.
         """
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None)
+        X, y = self._validate_data(
+            X, y, reset=True, dtype=None, accept_sparse=["csr", "csc"]
+        )
         return X, y, binarize_y
 
     def _validate_estimator(self):
@@ -917,6 +919,12 @@ def _validate_estimator(self):
             np.arange(self.n_features_), self.categorical_features_
         )
 
+        if self.categorical_features_.size == self.n_features_in_:
+            raise ValueError(
+                "SMOTE-NC is not designed to work only with categorical "
+                "features. It requires some numerical features."
+            )
+
     def _fit_resample(self, X, y):
         self.n_features_ = X.shape[1]
         self._validate_estimator()

diff --git a/imblearn/over_sampling/tests/test_smote_nc.py b/imblearn/over_sampling/tests/test_smote_nc.py
@@ -204,3 +204,19 @@ def test_smotenc_preserve_dtype():
     X_res, y_res = smote.fit_resample(X, y)
     assert X.dtype == X_res.dtype, "X dtype is not preserved"
     assert y.dtype == y_res.dtype, "y dtype is not preserved"
+
+
+@pytest.mark.parametrize(
+    "categorical_features", [[True, True, True], [0, 1, 2]]
+)
+def test_smotenc_raising_error_all_categorical(categorical_features):
+    """Check that SMOTENC raises when every feature is flagged categorical."""
+    # Seed the data generation so that the test is reproducible.
+    X, y = make_classification(
+        n_features=3, n_informative=1, n_redundant=1, n_repeated=0,
+        n_clusters_per_class=1, random_state=0,
+    )
+    smote = SMOTENC(categorical_features=categorical_features)
+    err_msg = "SMOTE-NC is not designed to work only with categorical features"
+    with pytest.raises(ValueError, match=err_msg):
+        smote.fit_resample(X, y)