Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,21 @@ Functions

pipeline.make_pipeline


.. _datasets_ref:

Datasets
========

.. automodule:: imblearn.datasets
:no-members:
:no-inherited-members:

.. currentmodule:: imblearn

Functions
---------
.. autosummary::
:toctree: generated/

datasets.make_imbalance
24 changes: 18 additions & 6 deletions imblearn/datasets/imbalance.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
"""Transform a dataset into an imbalanced dataset."""

import logging

import numpy as np

from collections import Counter

from sklearn.utils import check_X_y
from sklearn.utils import check_random_state

LOGGER = logging.getLogger(__name__)


def make_imbalance(X, y, ratio, min_c_=None, random_state=None):
"""Turns a dataset into an imbalanced dataset at specific ratio.
A simple toy dataset to visualize clustering and classification
Expand All @@ -20,10 +25,10 @@ def make_imbalance(X, y, ratio, min_c_=None, random_state=None):
y : ndarray, shape (n_samples, )
Corresponding label for each sample in X.

ratio : float,
The desired ratio given by the number of samples in
the minority class over the the number of samples in
the majority class.
ratio : float,
The desired ratio given by the number of samples in
the minority class over the number of samples in
the majority class. Thus the ratio must lie strictly inside the open interval (0., 1.).

min_c_ : str or int, optional (default=None)
The identifier of the class to be the minority class.
Expand All @@ -42,6 +47,7 @@ def make_imbalance(X, y, ratio, min_c_=None, random_state=None):

y_resampled : ndarray, shape (n_samples_new)
The corresponding label of `X_resampled`

"""
if ratio <= 0.0 or ratio >= 1.0:
raise ValueError('ratio value must be such that 0.0 < ratio < 1.0')
Expand All @@ -52,12 +58,16 @@ def make_imbalance(X, y, ratio, min_c_=None, random_state=None):

stats_c_ = Counter(y)

LOGGER.info('The original target distribution in the dataset is: %s',
stats_c_)

if min_c_ is None:
min_c_ = min(stats_c_, key=stats_c_.get)

n_min_samples = int(np.count_nonzero(y != min_c_) * ratio)
if n_min_samples > stats_c_[min_c_]:
raise ValueError('Current imbalance ratio of data is lower than desired ratio!')
raise ValueError('Current imbalance ratio of data is lower than'
' desired ratio!')
if n_min_samples == 0:
raise ValueError('Not enough samples for desired ratio!')

Expand All @@ -68,7 +78,9 @@ def make_imbalance(X, y, ratio, min_c_=None, random_state=None):
idx_min = random_state.choice(idx_min, size=n_min_samples, replace=False)
idx = np.concatenate((idx_min, idx_maj), axis=0)

X_resampled, y_resampled = X[idx,:], y[idx]
X_resampled, y_resampled = X[idx, :], y[idx]

LOGGER.info('Make the dataset imbalanced: %s', Counter(y_resampled))

return X_resampled, y_resampled