6 changes: 6 additions & 0 deletions imblearn/over_sampling/_random_over_sampler.py
@@ -119,3 +119,9 @@ def _fit_resample(self, X, y):
safe_indexing(y, sample_indices), sample_indices)
return (safe_indexing(X, sample_indices),
safe_indexing(y, sample_indices))

def _more_tags(self):
# TODO: remove the str tag once the following PR is merged:
# https://github.com/scikit-learn/scikit-learn/pull/14043
return {'X_types': ['2darray', 'str', 'string'],
'sample_indices': True}
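
For context, the sample_indices and X_types entries added here flow through scikit-learn's estimator-tag machinery: _get_tags() merges every _more_tags() override along the class hierarchy on top of the default tags. A minimal, self-contained sketch of that merging, with illustrative class names rather than imblearn's real base classes:

# Sketch of the tag-merging mechanism assumed by the _more_tags overrides
# above; class names are illustrative, not imblearn's actual hierarchy.
_DEFAULT_TAGS = {'X_types': ['2darray'], 'sample_indices': False}


class BaseSamplerSketch:
    def _more_tags(self):
        return {}

    def _get_tags(self):
        tags = dict(_DEFAULT_TAGS)
        # walk the MRO from the most generic class to the most specific one
        for klass in reversed(type(self).__mro__):
            if hasattr(klass, '_more_tags'):
                tags.update(klass._more_tags(self))
        return tags


class RandomOverSamplerSketch(BaseSamplerSketch):
    def _more_tags(self):
        return {'X_types': ['2darray', 'str', 'string'],
                'sample_indices': True}


print(RandomOverSamplerSketch()._get_tags())
# {'X_types': ['2darray', 'str', 'string'], 'sample_indices': True}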
@@ -170,3 +170,6 @@ def _fit_resample(self, X, y):
y_resampled = np.hstack(y_resampled)

return X_resampled, np.array(y_resampled, dtype=y.dtype)

def _more_tags(self):
return {'sample_indices': False}
@@ -220,3 +220,6 @@ def _fit_resample(self, X, y):
return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
idx_under)
return safe_indexing(X, idx_under), safe_indexing(y, idx_under)

def _more_tags(self):
return {'sample_indices': True}
@@ -186,6 +186,9 @@ def _fit_resample(self, X, y):
idx_under)
return safe_indexing(X, idx_under), safe_indexing(y, idx_under)

def _more_tags(self):
return {'sample_indices': True}


@Substitution(
sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring,
@@ -377,6 +380,9 @@ def _fit_resample(self, X, y):
return X_resampled, y_resampled, self.sample_indices_
return X_resampled, y_resampled

def _more_tags(self):
return {'sample_indices': True}


@Substitution(
sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring,
@@ -564,3 +570,6 @@ def _fit_resample(self, X, y):
if self.return_indices:
return X_resampled, y_resampled, self.sample_indices_
return X_resampled, y_resampled

def _more_tags(self):
return {'sample_indices': True}
@@ -187,3 +187,6 @@ def _fit_resample(self, X, y):
return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
idx_under)
return safe_indexing(X, idx_under), safe_indexing(y, idx_under)

def _more_tags(self):
return {'sample_indices': True}
3 changes: 3 additions & 0 deletions imblearn/under_sampling/_prototype_selection/_nearmiss.py
@@ -293,3 +293,6 @@ def _fit_resample(self, X, y):
return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
idx_under)
return safe_indexing(X, idx_under), safe_indexing(y, idx_under)

def _more_tags(self):
return {'sample_indices': True}
@@ -204,3 +204,6 @@ def _fit_resample(self, X, y):
self.sample_indices_)
return (safe_indexing(X, self.sample_indices_),
safe_indexing(y, self.sample_indices_))

def _more_tags(self):
return {'sample_indices': True}
@@ -189,3 +189,6 @@ def _fit_resample(self, X, y):
if self.return_indices:
return (X_cleaned, y_cleaned, self.sample_indices_)
return X_cleaned, y_cleaned

def _more_tags(self):
return {'sample_indices': True}
@@ -135,3 +135,9 @@ def _fit_resample(self, X, y):
return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
idx_under)
return safe_indexing(X, idx_under), safe_indexing(y, idx_under)

def _more_tags(self):
# TODO: remove the str tag once the following PR is merged:
# https://github.com/scikit-learn/scikit-learn/pull/14043
return {'X_types': ['2darray', 'str', 'string'],
'sample_indices': True}
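
The 'str'/'string' X_types entries above advertise that the random samplers only index rows and never do arithmetic on X, so string-valued feature matrices can pass through unchanged. A usage sketch under that assumption:

import numpy as np
from imblearn.under_sampling import RandomUnderSampler

# String features are only selected by row index, never cast to numbers.
X = np.array([['cat', 'small'], ['cat', 'big'], ['dog', 'small'],
              ['dog', 'big'], ['dog', 'medium']], dtype=object)
y = np.array([0, 0, 1, 1, 1])

X_res, y_res = RandomUnderSampler(random_state=0).fit_resample(X, y)
# The majority class (1) is undersampled to the minority count: 2 rows each.
print(X_res, y_res)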
3 changes: 3 additions & 0 deletions imblearn/under_sampling/_prototype_selection/_tomek_links.py
@@ -166,3 +166,6 @@ def _fit_resample(self, X, y):
self.sample_indices_)
return (safe_indexing(X, self.sample_indices_),
safe_indexing(y, self.sample_indices_))

def _more_tags(self):
return {'sample_indices': True}
37 changes: 3 additions & 34 deletions imblearn/utils/estimator_checks.py
@@ -34,40 +34,12 @@
from imblearn.under_sampling import NearMiss, ClusterCentroids

DONT_SUPPORT_RATIO = ['SVMSMOTE', 'BorderlineSMOTE']
SUPPORT_STRING = ['RandomUnderSampler', 'RandomOverSampler']
HAVE_SAMPLE_INDICES = [
'RandomOverSampler', 'RandomUnderSampler', 'InstanceHardnessThreshold',
'NearMiss', 'TomekLinks', 'EditedNearestNeighbours',
'RepeatedEditedNearestNeighbours', 'AllKNN', 'OneSidedSelection',
'CondensedNearestNeighbour', 'NeighbourhoodCleaningRule']
# FIXME: remove in 0.6
DONT_HAVE_RANDOM_STATE = ('NearMiss', 'EditedNearestNeighbours',
'RepeatedEditedNearestNeighbours', 'AllKNN',
'NeighbourhoodCleaningRule', 'TomekLinks')


def monkey_patch_check_dtype_object(name, estimator_orig):
# check that estimators treat dtype object as numeric if possible
rng = np.random.RandomState(0)
X = rng.rand(40, 10).astype(object)
y = np.array([0] * 10 + [1] * 30, dtype=np.int)
estimator = clone(estimator_orig)
estimator.fit(X, y)

try:
estimator.fit(X, y.astype(object))
except Exception as e:
if "Unknown label type" not in str(e):
raise

if name not in SUPPORT_STRING:
X[0, 0] = {'foo': 'bar'}
msg = "argument must be a string or a number"
assert_raises_regex(TypeError, msg, estimator.fit, X, y)
else:
estimator.fit(X, y)


def _yield_sampler_checks(name, Estimator):
yield check_target_type
yield check_samplers_one_label
@@ -106,10 +78,6 @@ def check_estimator(Estimator, run_sampler_tests=True):
Whether or not to run the sampler tests.
"""
name = Estimator.__name__
# monkey patch check_dtype_object for the sampler allowing strings
import sklearn.utils.estimator_checks
sklearn.utils.estimator_checks.check_dtype_object = \
monkey_patch_check_dtype_object
# scikit-learn common tests
sklearn_check_estimator(Estimator)
check_parameters_default_constructible(name, Estimator)
@@ -369,7 +337,8 @@ def check_samplers_sample_indices(name, Sampler):
weights=[0.2, 0.3, 0.5], random_state=0)
sampler = Sampler()
sampler.fit_resample(X, y)
if name in HAVE_SAMPLE_INDICES:
assert hasattr(sampler, 'sample_indices_')
sample_indices = sampler._get_tags().get('sample_indices', None)
if sample_indices:
assert hasattr(sampler, 'sample_indices_') is sample_indices
else:
assert not hasattr(sampler, 'sample_indices_')
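
The rewritten check derives its expectation from the estimator tags instead of the hard-coded HAVE_SAMPLE_INDICES list. A standalone sketch of the same logic, using toy stand-ins rather than real samplers:

import numpy as np


class WithIndices:
    def _get_tags(self):
        return {'sample_indices': True}

    def fit_resample(self, X, y):
        # samplers that expose indices set sample_indices_ during resampling
        self.sample_indices_ = np.arange(len(y))
        return X, y


class WithoutIndices:
    def _get_tags(self):
        return {'sample_indices': False}

    def fit_resample(self, X, y):
        return X, y


def check_sample_indices(sampler, X, y):
    sampler.fit_resample(X, y)
    if sampler._get_tags().get('sample_indices', False):
        assert hasattr(sampler, 'sample_indices_')
    else:
        assert not hasattr(sampler, 'sample_indices_')


X, y = np.ones((4, 2)), np.array([0, 0, 1, 1])
check_sample_indices(WithIndices(), X, y)
check_sample_indices(WithoutIndices(), X, y)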