Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] EHN add parameter axis in safe_indexing to slice rows and columns #14035

Merged
merged 18 commits into from Jul 2, 2019
Merged
9 changes: 8 additions & 1 deletion doc/whats_new/v0.22.rst
Expand Up @@ -121,7 +121,14 @@ Changelog
remove constant features due to numerical instability, by using range
rather than variance in this case.
:pr:`13704` by `Roddy MacSween <rlms>`.


:mod:`sklearn.utils`
....................

- |Enhancement| :func:`utils.safe_indexing` accepts an ``axis`` parameter to
  index an array-like across rows and columns. The column indexing can be done
  on NumPy arrays, SciPy sparse matrices, and pandas DataFrames.
  :pr:`14035` by `Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.neural_network`
.............................
Expand Down
130 changes: 3 additions & 127 deletions sklearn/compose/_column_transformer.py
Expand Up @@ -17,6 +17,8 @@
from ..pipeline import _fit_transform_one, _transform_one, _name_estimators
from ..preprocessing import FunctionTransformer
from ..utils import Bunch
from ..utils import safe_indexing
from ..utils import _get_column_indices
from ..utils.metaestimators import _BaseComposition
from ..utils.validation import check_array, check_is_fitted

Expand Down Expand Up @@ -402,7 +404,7 @@ def _fit_transform(self, X, y, func, fitted=False):
return Parallel(n_jobs=self.n_jobs)(
delayed(func)(
transformer=clone(trans) if not fitted else trans,
X=_get_column(X, column),
X=safe_indexing(X, column, axis=1),
y=y,
weight=weight,
message_clsname='ColumnTransformer',
Expand Down Expand Up @@ -553,132 +555,6 @@ def _check_X(X):
return check_array(X, force_all_finite='allow-nan', dtype=np.object)


def _check_key_type(key, superclass):
"""
Check that scalar, list or slice is of a certain type.

This is only used in _get_column and _get_column_indices to check
if the `key` (column specification) is fully integer or fully string-like.

Parameters
----------
key : scalar, list, slice, array-like
The column specification to check
superclass : int or str
The type for which to check the `key`

"""
if isinstance(key, superclass):
return True
if isinstance(key, slice):
return (isinstance(key.start, (superclass, type(None))) and
isinstance(key.stop, (superclass, type(None))))
if isinstance(key, list):
return all(isinstance(x, superclass) for x in key)
if hasattr(key, 'dtype'):
if superclass is int:
return key.dtype.kind == 'i'
else:
# superclass = str
return key.dtype.kind in ('O', 'U', 'S')
return False


def _get_column(X, key):
    """
    Get feature column(s) from input data X.

    Supported input types (X): numpy arrays, sparse arrays and DataFrames

    Supported key types (key):
    - scalar: output is 1D
    - lists, slices, boolean masks: output is 2D
    - callable that returns any of the above

    Supported key data types:

    - integer or boolean mask (positional):
        - supported for arrays, sparse matrices and dataframes
    - string (key-based):
        - only supported for dataframes
        - So no keys other than strings are allowed (while in principle you
          can use any hashable object as key).

    """
    # Decide between positional and label-based selection. The integer
    # check must come first: an empty list or slice(None) matches both the
    # int and the str check and has historically taken the positional path.
    if _check_key_type(key, int):
        use_labels = False
    elif _check_key_type(key, str):
        use_labels = True
    elif hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_):
        # boolean mask: pandas boolean masks don't work with iloc, so take
        # the loc path for dataframes.
        use_labels = hasattr(X, 'loc')
    else:
        raise ValueError("No valid specification of the columns. Only a "
                         "scalar, list or slice of all integers or all "
                         "strings, or boolean mask is allowed")

    if use_labels:
        if not hasattr(X, 'loc'):
            raise ValueError("Specifying the columns using strings is only "
                             "supported for pandas DataFrames")
        # pandas dataframes
        return X.loc[:, key]
    if hasattr(X, 'iloc'):
        # pandas dataframes
        return X.iloc[:, key]
    # numpy arrays, sparse arrays
    return X[:, key]


def _get_column_indices(X, key):
    """
    Get feature column indices for input data X and key.

    For accepted values of `key`, see the docstring of _get_column

    """
    n_columns = X.shape[1]

    if (_check_key_type(key, int)
            or hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_)):
        # Convert key into positive indexes
        try:
            idx = np.arange(n_columns)[key]
        except IndexError as e:
            # An out-of-bounds integer or a wrongly-sized boolean mask would
            # otherwise surface as a raw IndexError; report an informative
            # ValueError instead (matches sklearn.utils._get_column_indices).
            raise ValueError(
                'all features must be in [0, %d]' % (n_columns - 1)
            ) from e
        return np.atleast_1d(idx).tolist()
    elif _check_key_type(key, str):
        try:
            all_columns = list(X.columns)
        except AttributeError:
            raise ValueError("Specifying the columns using strings is only "
                             "supported for pandas DataFrames")
        if isinstance(key, str):
            columns = [key]
        elif isinstance(key, slice):
            start, stop = key.start, key.stop
            if start is not None:
                start = all_columns.index(start)
            if stop is not None:
                # pandas indexing with strings is endpoint included
                stop = all_columns.index(stop) + 1
            else:
                stop = n_columns + 1
            return list(range(n_columns)[slice(start, stop)])
        else:
            columns = list(key)

        try:
            return [all_columns.index(col) for col in columns]
        except ValueError as e:
            if 'not in list' in str(e):
                # A requested column name is absent from the dataframe; the
                # bare list.index message ("'x' is not in list") is cryptic.
                raise ValueError(
                    "A given column is not a column of the dataframe"
                ) from e
            raise
    else:
        raise ValueError("No valid specification of the columns. Only a "
                         "scalar, list or slice of all integers or all "
                         "strings, or boolean mask is allowed")


def _is_empty_column_selection(column):
"""
Return True if the column selection is empty (empty list or all-False
Expand Down
196 changes: 193 additions & 3 deletions sklearn/utils/__init__.py
Expand Up @@ -178,22 +178,68 @@ def axis0_safe_slice(X, mask, len_mask):
return np.zeros(shape=(0, X.shape[1]))


def safe_indexing(X, indices, axis=0):
    """Return rows, items or columns of X using indices.

    Parameters
    ----------
    X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series
        Data from which to sample rows, items or columns.
    indices : array-like
        - When ``axis=0``, indices need to be an array of integer.
        - When ``axis=1``, indices can be one of:
            - scalar: output is 1D, unless `X` is sparse.
              Supported data types for scalars:
                - integer: supported for arrays, sparse matrices and
                  dataframes.
                - string (key-based): only supported for dataframes.
            - container: lists, slices, boolean masks: output is 2D.
              Supported data types for containers:
                - integer or boolean (positional): supported for
                  arrays, sparse matrices and dataframes.
                - string (key-based): only supported for dataframes. No keys
                  other than strings are allowed.
    axis : int, default=0
        The axis along which `X` will be subsampled. ``axis=0`` will select
        rows while ``axis=1`` will select columns.

    Returns
    -------
    subset
        Subset of X on axis 0 or 1.

    Notes
    -----
    CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
    not supported.
    """
    if axis == 0:
        return _safe_indexing_row(X, indices)
    elif axis == 1:
        return _safe_indexing_column(X, indices)
    else:
        # Fixed message: the concatenated literals previously produced a
        # double space and the singular "column".
        raise ValueError(
            "'axis' should be either 0 (to index rows) or 1 (to index "
            "columns). Got {} instead.".format(axis)
        )


def _safe_indexing_row(X, indices):
"""Return items or rows from X using indices.

Allows simple indexing of lists or arrays.

Parameters
----------
X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series.
X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series
Data from which to sample rows or items.
indices : array-like of int
Indices according to which X will be subsampled.

Returns
-------
subset
Subset of X on first axis
Subset of X on first axis.

Notes
-----
Expand All @@ -202,6 +248,7 @@ def safe_indexing(X, indices):
"""
if hasattr(X, "iloc"):
# Work-around for indexing with read-only indices in pandas
indices = np.asarray(indices)
indices = indices if indices.flags.writeable else indices.copy()
# Pandas Dataframes and Series
try:
Expand All @@ -223,6 +270,149 @@ def safe_indexing(X, indices):
return [X[idx] for idx in indices]


def _check_key_type(key, superclass):
"""Check that scalar, list or slice is of a certain type.

This is only used in _safe_indexing_column and _get_column_indices to check
if the ``key`` (column specification) is fully integer or fully
string-like.

Parameters
----------
key : scalar, list, slice, array-like
The column specification to check.
superclass : int or str
The type for which to check the `key`.
"""
if isinstance(key, superclass):
return True
if isinstance(key, slice):
return (isinstance(key.start, (superclass, type(None))) and
isinstance(key.stop, (superclass, type(None))))
if isinstance(key, list):
return all(isinstance(x, superclass) for x in key)
if hasattr(key, 'dtype'):
if superclass is int:
return key.dtype.kind == 'i'
elif superclass is bool:
return key.dtype.kind == 'b'
else:
# superclass = str
return key.dtype.kind in ('O', 'U', 'S')
return False


def _safe_indexing_column(X, key):
    """Get feature column(s) from input data X.

    Supported input types (X): numpy arrays, sparse arrays and DataFrames.

    Supported key types (key):
    - scalar: output is 1D;
    - lists, slices, boolean masks: output is 2D.

    Supported key data types:
    - integer or boolean mask (positional):
        - supported for arrays, sparse matrices and dataframes.
    - string (key-based):
        - only supported for dataframes;
        - So no keys other than strings are allowed (while in principle you
          can use any hashable object as key).
    """
    # check that X is a 2D structure
    if X.ndim != 2:
        raise ValueError(
            "'X' should be a 2D NumPy array, 2D sparse matrix or pandas "
            "dataframe when indexing the columns (i.e. 'axis=1'). "
            "Got {} instead with {} dimension(s).".format(type(X), X.ndim)
        )
    # Decide between positional and label-based selection. The integer
    # check must come first: an empty list or slice(None) matches both the
    # int and the str check and must take the positional path.
    if _check_key_type(key, int):
        use_labels = False
    elif _check_key_type(key, str):
        use_labels = True
    elif hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_):
        # boolean mask: pandas boolean masks don't work with iloc, so take
        # the loc path for dataframes.
        use_labels = hasattr(X, 'loc')
    else:
        raise ValueError("No valid specification of the columns. Only a "
                         "scalar, list or slice of all integers or all "
                         "strings, or boolean mask is allowed")

    if use_labels:
        if not hasattr(X, 'loc'):
            raise ValueError("Specifying the columns using strings is only "
                             "supported for pandas DataFrames")
        # pandas dataframes
        return X.loc[:, key]
    if hasattr(X, 'iloc'):
        # pandas dataframes
        return X.iloc[:, key]
    # numpy arrays, sparse arrays
    return X[:, key]


def _get_column_indices(X, key):
    """Get feature column indices for input data X and key.

    For accepted values of `key`, see the docstring of
    :func:`_safe_indexing_column`.
    """
    n_columns = X.shape[1]

    # Positional selection: integers, or a boolean mask carried by a dtype.
    # This check must precede the string check: an empty list or slice(None)
    # matches both and must take the positional path.
    key_is_positional = (
        _check_key_type(key, int)
        or hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_)
    )
    if key_is_positional:
        # Convert key into positive indexes
        try:
            selected = np.arange(n_columns)[key]
        except IndexError as e:
            raise ValueError(
                'all features must be in [0, %d]' % (n_columns - 1)
            ) from e
        return np.atleast_1d(selected).tolist()

    if not _check_key_type(key, str):
        raise ValueError("No valid specification of the columns. Only a "
                         "scalar, list or slice of all integers or all "
                         "strings, or boolean mask is allowed")

    # Label-based selection: requires a pandas dataframe.
    try:
        all_columns = list(X.columns)
    except AttributeError:
        raise ValueError("Specifying the columns using strings is only "
                         "supported for pandas DataFrames")

    if isinstance(key, slice):
        start, stop = key.start, key.stop
        if start is not None:
            start = all_columns.index(start)
        if stop is not None:
            # pandas indexing with strings is endpoint included
            stop = all_columns.index(stop) + 1
        else:
            stop = n_columns + 1
        return list(range(n_columns)[slice(start, stop)])

    columns = [key] if isinstance(key, str) else list(key)

    try:
        return [all_columns.index(col) for col in columns]
    except ValueError as e:
        if 'not in list' in str(e):
            raise ValueError(
                "A given column is not a column of the dataframe"
            ) from e
        raise


def resample(*arrays, **options):
"""Resample arrays or sparse matrices in a consistent way

Expand Down