Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Tests refactoring #242

Merged
merged 14 commits into from
Mar 19, 2017
17 changes: 17 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,20 @@ Functions
:toctree: generated/

datasets.make_imbalance


Utilities
=========

.. automodule:: imblearn.utils
:no-members:
:no-inherited-members:

.. currentmodule:: imblearn

Functions
---------
.. autosummary::
:toctree: generated/

utils.estimator_checks.check_estimator
37 changes: 30 additions & 7 deletions doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,36 @@
Release history
===============

.. _changes_0_3:

Changelog
---------

New features
~~~~~~~~~~~~

- Turn off steps in :class:`pipeline.Pipeline` using the `None`
object. By `Christos Aridas`_.

Enhancement
~~~~~~~~~~~

- All the unit tests have been factorized and a `check_estimator` has
been derived from scikit-learn. By `Guillaume Lemaitre`_.
- Script for automatic build of conda packages and uploading. By
`Guillaume Lemaitre`_.

API changes summary
~~~~~~~~~~~~~~~~~~~

- `__init__` has been removed from the :class:`base.SamplerMixin` to
create a real mixin class. By `Guillaume Lemaitre`_.
- creation of a module `exceptions` to handle consistent raising of
errors. By `Guillaume Lemaitre`_.
- creation of a module `utils.validation` to factor out checking of
recurrent patterns. By `Guillaume Lemaitre`_.


.. _changes_0_2:

Version 0.2
Expand Down Expand Up @@ -32,7 +62,6 @@ New features

- Added AllKNN under sampling technique. By `Dayvid Oliveira`_.
- Added a module `metrics` implementing some specific scoring function for the problem of balancing. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
- Turn off steps in :class:`pipeline.Pipeline` using the `None` object. By `Christos Aridas`_.

Enhancement
~~~~~~~~~~~
Expand All @@ -42,12 +71,6 @@ Enhancement
- Change from `cross_validation` module to `model_selection` module for
`sklearn` deprecation cycle. By `Dayvid Oliveira`_ and `Christos Aridas`_.

New features
~~~~~~~~~~~~

- Added AllKNN under sampling technique.
- Added support for bumpversion.

API changes summary
~~~~~~~~~~~~~~~~~~~

Expand Down
9 changes: 7 additions & 2 deletions imblearn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,18 @@
ensemble
Module which provides methods generating an ensemble of
under-sampled subsets.
exceptions
Module including custom warnings and error classes used across
imbalanced-learn.
metrics
Module which provides metrics to quantify the classification performance
with imbalanced dataset.
over_sampling
Module which provides methods to over-sample a dataset.
under_sampling
Module which provides methods to under-sample a dataset.
utils
Module including various utilities.
pipeline
Module which allows creating a pipeline with scikit-learn estimators.
"""
Expand All @@ -34,6 +39,6 @@

# list all submodules available in imblearn and version
__all__ = [
'combine', 'ensemble', 'metrics', 'over_sampling', 'under_sampling',
'pipeline', '__version__'
'combine', 'ensemble', 'exceptions', 'metrics', 'over_sampling',
'under_sampling', 'utils', 'pipeline', '__version__'
]
47 changes: 26 additions & 21 deletions imblearn/base.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""Base class for sampling"""

from __future__ import division, print_function
from __future__ import division

import logging
import warnings
from numbers import Real
from abc import ABCMeta, abstractmethod
from collections import Counter

Expand All @@ -12,6 +13,7 @@
from sklearn.externals import six
from sklearn.utils import check_X_y
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import check_is_fitted


class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)):
Expand Down Expand Up @@ -62,16 +64,9 @@ def fit(self, X, y):
self.logger.info('Compute classes statistics ...')

# Raise an error if there is only one class
# if uniques.size == 1:
# raise RuntimeError("Only one class detected, aborting...")
# Raise a warning for the moment to be compatible with BaseEstimator
self.logger.debug('The number of classes is %s', np.unique(y).size)
self.logger.debug('Shall we raise a warning: %s',
np.unique(y).size == 1)
if np.unique(y).size == 1:
warnings.simplefilter('always', UserWarning)
warnings.warn('Only one class detected, something will get wrong')
self.logger.debug('The warning should has been raised.')
if np.unique(y).size <= 1:
raise ValueError("Sampler can't balance when only one class is"
" present.")

# Store the size of X to check at sampling time if we have the
# same data
Expand All @@ -88,12 +83,16 @@ def fit(self, X, y):
np.unique(y).size, self.stats_c_)

# Check if the ratio provided at initialisation make sense
if isinstance(self.ratio, float):
if isinstance(self.ratio, Real):
if self.ratio < (self.stats_c_[self.min_c_] /
self.stats_c_[self.maj_c_]):
raise RuntimeError('The ratio requested at initialisation'
' should be greater or equal than the'
' balancing ratio of the current data.')
' balancing ratio of the current data.'
' Got {} < {}.'.format(
self.ratio,
self.stats_c_[self.min_c_] /
self.stats_c_[self.maj_c_]))

return self

Expand Down Expand Up @@ -122,14 +121,14 @@ def sample(self, X, y):
X, y = check_X_y(X, y)

# Check that the data have been fitted
if not hasattr(self, 'stats_c_'):
raise RuntimeError('You need to fit the data, first!!!')
check_is_fitted(self, 'stats_c_')

# Check if the size of the data is identical than at fitting
if X.shape != self.X_shape_:
raise RuntimeError('The data that you attempt to resample do not'
' seem to be the one earlier fitted. Use the'
' fitted data.')
' fitted data. Shape of data is {}, got {}'
' instead.'.format(X.shape, self.X_shape_))

if hasattr(self, 'ratio'):
self._validate_ratio()
Expand Down Expand Up @@ -170,17 +169,23 @@ def _validate_ratio(self):
# The ratio correspond to the number of samples in the minority class
# over the number of samples in the majority class. Thus, the ratio
# cannot be greater than 1.0
if isinstance(self.ratio, float):
if isinstance(self.ratio, Real):
if self.ratio > 1:
raise ValueError('Ration cannot be greater than one.')
raise ValueError('Ratio cannot be greater than one.'
' Got {}.'.format(self.ratio))
elif self.ratio <= 0:
raise ValueError('Ratio cannot be negative.')
raise ValueError('Ratio cannot be negative.'
' Got {}.'.format(self.ratio))

elif isinstance(self.ratio, six.string_types):
if self.ratio != 'auto':
raise ValueError('Unknown string for the parameter ratio.')
raise ValueError("Unknown string for the parameter ratio."
" Got {} instead of 'auto'".format(
self.ratio))
else:
raise ValueError('Unknown parameter type for ratio.')
raise ValueError('Unknown parameter type for ratio.'
' Got {} instead of float or str'.format(
type(self.ratio)))

def _validate_size_ngh_deprecation(self):
"Private function to warn about the deprecation about size_ngh."
Expand Down
6 changes: 4 additions & 2 deletions imblearn/combine/smote_enn.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,8 @@ def _validate_estimator(self):
if isinstance(self.smote, SMOTE):
self.smote_ = self.smote
else:
raise ValueError('smote needs to be a SMOTE object.')
raise ValueError('smote needs to be a SMOTE object.'
'Got {} instead.'.format(type(self.smote)))
# Otherwise create a default SMOTE
else:
self.smote_ = SMOTE(
Expand Down Expand Up @@ -234,7 +235,8 @@ def _validate_estimator(self):
if isinstance(self.enn, EditedNearestNeighbours):
self.enn_ = self.enn
else:
raise ValueError('enn needs to be an EditedNearestNeighbours.')
raise ValueError('enn needs to be an EditedNearestNeighbours.'
' Got {} instead.'.format(type(self.enn)))
# Otherwise create a default EditedNearestNeighbours
else:
self.enn_ = EditedNearestNeighbours(random_state=self.random_state)
Expand Down
8 changes: 5 additions & 3 deletions imblearn/combine/smote_tomek.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,8 @@ def _validate_estimator(self):
if isinstance(self.smote, SMOTE):
self.smote_ = self.smote
else:
raise ValueError('smote needs to be a SMOTE object.')
raise ValueError('smote needs to be a SMOTE object.'
'Got {} instead.'.format(type(self.smote)))
# Otherwise create a default SMOTE
else:
self.smote_ = SMOTE(
Expand All @@ -192,8 +193,9 @@ def _validate_estimator(self):
if isinstance(self.tomek, TomekLinks):
self.tomek_ = self.tomek
else:
raise ValueError('tomek needs to be a TomekLinks object.')
# Otherwise create a default EditedNearestNeighbours
raise ValueError('tomek needs to be a TomekLinks object.'
'Got {} instead.'.format(type(self.tomek)))
# Otherwise create a default TomekLinks
else:
self.tomek_ = TomekLinks(random_state=self.random_state)

Expand Down
Loading