Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Tests refactoring #242

Merged
merged 14 commits into from
Mar 19, 2017
17 changes: 17 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,20 @@ Functions
:toctree: generated/

datasets.make_imbalance


Utilities
=========

.. automodule:: imblearn.utils
:no-members:
:no-inherited-members:

.. currentmodule:: imblearn

Functions
---------
.. autosummary::
:toctree: generated/

utils.estimator_checks.check_estimator
37 changes: 30 additions & 7 deletions doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,36 @@
Release history
===============

.. _changes_0_3:

Changelog
---------

New features
~~~~~~~~~~~~

- Turn off steps in :class:`pipeline.Pipeline` using the `None`
object. By `Christos Aridas`_.

Enhancement
~~~~~~~~~~~

- All the unit tests have been factorized and a `check_estimator` has
been derived from scikit-learn. By `Guillaume Lemaitre`_.
- Script for automatic build of conda packages and uploading. By
`Guillaume Lemaitre`_.

API changes summary
~~~~~~~~~~~~~~~~~~~

- `__init__` has been removed from the :class:`base.SamplerMixin` to
create a real mixin class. By `Guillaume Lemaitre`_.
- creation of a module `exceptions` to handle consistent raising of
errors. By `Guillaume Lemaitre`_.
- creation of a module `utils.validation` to factor out checking of
recurrent patterns. By `Guillaume Lemaitre`_.


.. _changes_0_2:

Version 0.2
Expand Down Expand Up @@ -32,7 +62,6 @@ New features

- Added AllKNN under sampling technique. By `Dayvid Oliveira`_.
- Added a module `metrics` implementing some specific scoring function for the problem of balancing. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
- Turn off steps in :class:`pipeline.Pipeline` using the `None` object. By `Christos Aridas`_.

Enhancement
~~~~~~~~~~~
Expand All @@ -42,12 +71,6 @@ Enhancement
- Change from `cross_validation` module to `model_selection` module for
`sklearn` deprecation cycle. By `Dayvid Oliveira`_ and `Christos Aridas`_.

New features
~~~~~~~~~~~~

- Added AllKNN under sampling technique.
- Added support for bumpversion.

API changes summary
~~~~~~~~~~~~~~~~~~~

Expand Down
9 changes: 7 additions & 2 deletions imblearn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,18 @@
ensemble
Module which provides methods generating an ensemble of
under-sampled subsets.
exceptions
Module including custom warnings and error classes used across
imbalanced-learn.
metrics
Module which provides metrics to quantify the classification performance
with imbalanced dataset.
over_sampling
Module which provides methods to over-sample a dataset.
under_sampling
Module which provides methods to under-sample a dataset.
utils
Module including various utilities.
pipeline
Module which allows creating a pipeline with scikit-learn estimators.
"""
Expand All @@ -34,6 +39,6 @@

# list all submodules available in imblearn and version
__all__ = [
'combine', 'ensemble', 'metrics', 'over_sampling', 'under_sampling',
'pipeline', '__version__'
'combine', 'ensemble', 'exceptions', 'metrics', 'over_sampling',
'under_sampling', 'utils', 'pipeline', '__version__'
]
47 changes: 26 additions & 21 deletions imblearn/base.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""Base class for sampling"""

from __future__ import division, print_function
from __future__ import division

import logging
import warnings
from numbers import Real
from abc import ABCMeta, abstractmethod
from collections import Counter

Expand All @@ -12,6 +13,7 @@
from sklearn.externals import six
from sklearn.utils import check_X_y
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import check_is_fitted


class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)):
Expand Down Expand Up @@ -62,16 +64,9 @@ def fit(self, X, y):
self.logger.info('Compute classes statistics ...')

# Raise an error if there is only one class
# if uniques.size == 1:
# raise RuntimeError("Only one class detected, aborting...")
# Raise a warning for the moment to be compatible with BaseEstimator
self.logger.debug('The number of classes is %s', np.unique(y).size)
self.logger.debug('Shall we raise a warning: %s',
np.unique(y).size == 1)
if np.unique(y).size == 1:
warnings.simplefilter('always', UserWarning)
warnings.warn('Only one class detected, something will get wrong')
self.logger.debug('The warning should has been raised.')
if np.unique(y).size <= 1:
raise ValueError("Sampler can't balance when only one class is"
" present.")

# Store the size of X to check at sampling time if we have the
# same data
Expand All @@ -88,12 +83,16 @@ def fit(self, X, y):
np.unique(y).size, self.stats_c_)

# Check if the ratio provided at initialisation make sense
if isinstance(self.ratio, float):
if isinstance(self.ratio, Real):
if self.ratio < (self.stats_c_[self.min_c_] /
self.stats_c_[self.maj_c_]):
raise RuntimeError('The ratio requested at initialisation'
' should be greater or equal than the'
' balancing ratio of the current data.')
' balancing ratio of the current data.'
' Got {} < {}.'.format(
self.ratio,
self.stats_c_[self.min_c_] /
self.stats_c_[self.maj_c_]))

return self

Expand Down Expand Up @@ -122,14 +121,14 @@ def sample(self, X, y):
X, y = check_X_y(X, y)

# Check that the data have been fitted
if not hasattr(self, 'stats_c_'):
raise RuntimeError('You need to fit the data, first!!!')
check_is_fitted(self, 'stats_c_')

# Check if the size of the data is identical than at fitting
if X.shape != self.X_shape_:
raise RuntimeError('The data that you attempt to resample do not'
' seem to be the one earlier fitted. Use the'
' fitted data.')
' fitted data. Shape of data is {}, got {}'
' instead.'.format(X.shape, self.X_shape_))

if hasattr(self, 'ratio'):
self._validate_ratio()
Expand Down Expand Up @@ -170,17 +169,23 @@ def _validate_ratio(self):
# The ratio correspond to the number of samples in the minority class
# over the number of samples in the majority class. Thus, the ratio
# cannot be greater than 1.0
if isinstance(self.ratio, float):
if isinstance(self.ratio, Real):
if self.ratio > 1:
raise ValueError('Ration cannot be greater than one.')
raise ValueError('Ratio cannot be greater than one.'
' Got {}.'.format(self.ratio))
elif self.ratio <= 0:
raise ValueError('Ratio cannot be negative.')
raise ValueError('Ratio cannot be negative.'
' Got {}.'.format(self.ratio))

elif isinstance(self.ratio, six.string_types):
if self.ratio != 'auto':
raise ValueError('Unknown string for the parameter ratio.')
raise ValueError("Unknown string for the parameter ratio."
" Got {} instead of 'auto'".format(
self.ratio))
else:
raise ValueError('Unknown parameter type for ratio.')
raise ValueError('Unknown parameter type for ratio.'
' Got {} instead of float or str'.format(
type(self.ratio)))

def _validate_size_ngh_deprecation(self):
"Private function to warn about the deprecation about size_ngh."
Expand Down
6 changes: 4 additions & 2 deletions imblearn/combine/smote_enn.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,8 @@ def _validate_estimator(self):
if isinstance(self.smote, SMOTE):
self.smote_ = self.smote
else:
raise ValueError('smote needs to be a SMOTE object.')
raise ValueError('smote needs to be a SMOTE object.'
'Got {} instead.'.format(type(self.smote)))
# Otherwise create a default SMOTE
else:
self.smote_ = SMOTE(
Expand Down Expand Up @@ -234,7 +235,8 @@ def _validate_estimator(self):
if isinstance(self.enn, EditedNearestNeighbours):
self.enn_ = self.enn
else:
raise ValueError('enn needs to be an EditedNearestNeighbours.')
raise ValueError('enn needs to be an EditedNearestNeighbours.'
' Got {} instead.'.format(type(self.enn)))
# Otherwise create a default EditedNearestNeighbours
else:
self.enn_ = EditedNearestNeighbours(random_state=self.random_state)
Expand Down
8 changes: 5 additions & 3 deletions imblearn/combine/smote_tomek.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,8 @@ def _validate_estimator(self):
if isinstance(self.smote, SMOTE):
self.smote_ = self.smote
else:
raise ValueError('smote needs to be a SMOTE object.')
raise ValueError('smote needs to be a SMOTE object.'
'Got {} instead.'.format(type(self.smote)))
# Otherwise create a default SMOTE
else:
self.smote_ = SMOTE(
Expand All @@ -192,8 +193,9 @@ def _validate_estimator(self):
if isinstance(self.tomek, TomekLinks):
self.tomek_ = self.tomek
else:
raise ValueError('tomek needs to be a TomekLinks object.')
# Otherwise create a default EditedNearestNeighbours
raise ValueError('tomek needs to be a TomekLinks object.'
'Got {} instead.'.format(type(self.tomek)))
# Otherwise create a default TomekLinks
else:
self.tomek_ = TomekLinks(random_state=self.random_state)

Expand Down
Loading