Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature names - NamedArray #14315

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion sklearn/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@
check_random_state, column_or_1d, check_array,
check_consistent_length, check_X_y, indexable,
check_symmetric, check_scalar)
from ._namedarray import NamedArray, make_namedarray
from ._namedarray import (SparseNamedArrayCSR,
SparseNamedArrayCSC,
SparseNamedArrayBSR,
SparseNamedArrayLIL,
SparseNamedArrayDOK,
SparseNamedArrayDIA,
SparseNamedArrayCOO)
from .. import get_config


Expand All @@ -51,7 +59,10 @@
"check_symmetric", "indices_to_mask", "deprecated",
"parallel_backend", "register_parallel_backend",
"resample", "shuffle", "check_matplotlib_support", "all_estimators",
]
"NamedArray", "make_namedarray", "SparseNamedArrayCSR",
"SparseNamedArrayCSC", "SparseNamedArrayBSR",
"SparseNamedArrayLIL", "SparseNamedArrayDOK",
"SparseNamedArrayDIA", "SparseNamedArrayCOO"]

# True when the running interpreter identifies itself as PyPy.
IS_PYPY = platform.python_implementation() == 'PyPy'
# True when the native pointer size (struct format "P"), converted from bytes
# to bits, is 32.
_IS_32BIT = 8 * struct.calcsize("P") == 32
Expand Down
164 changes: 164 additions & 0 deletions sklearn/utils/_namedarray.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
# Authors: Adrin Jalali <adrin.jalali@gmail.com>
#
# License: BSD 3 clause

import numpy as np
import scipy as sp

# NDArrayOperatorsMixin was added in numpy 1.13.
# TODO: drop this guard once the minimum supported numpy is 1.13+.
try:
    from numpy.lib.mixins import NDArrayOperatorsMixin
except ImportError:
    # The class defined in this module cannot be built without the mixin, so
    # fail with an actionable message instead of a bare ImportError.
    raise NotImplementedError("In order to use NamedArray, please upgrade your"
                              " numpy to 1.13+!")

from .validation import check_array, column_or_1d


class FeatureNamesMixin:
    """Mixin keeping a validated ``feature_names`` attribute.

    The host class must set ``self._data`` before assigning
    ``feature_names``: the setter reads it to check that one name is given
    per column.
    """

    def _col_count(self, value):
        """Return the number of columns of ``value``.

        A one dimensional ``value`` counts as a single column; otherwise the
        second entry of ``value.shape`` is returned.
        """
        # NOTE(review): a reviewer flagged the 1-d branch as "not tested".
        return 1 if value.ndim == 1 else value.shape[1]

    @property
    def feature_names(self):
        """Stored feature names, or ``None`` when no names are set."""
        return self._feature_names

    @feature_names.setter
    def feature_names(self, value):
        if value is not None:
            # A single scalar name is treated as a one-element list.
            names = [value] if np.isscalar(value) else value
            names = column_or_1d(names)
            expected = self._col_count(self._data)
            if len(names) != expected:
                raise ValueError("{} column names provided, but data has {} "
                                 "columns".format(len(names), expected))
            value = names

        self._feature_names = value


class NamedArray(FeatureNamesMixin, NDArrayOperatorsMixin):
    """A wrapper to a numpy ndarray holding some metadata about the data.

    Instances of this object behave like a numpy array, and lose all metadata
    information in numerical operations.

    Parameters
    ----------
    data: array-like
        A one or two dimensional array like data. When it exposes a
        ``columns`` attribute and no ``feature_names`` are given, those
        columns are used as feature names.

    feature_names: list or array of strings, or None, default=None
        Feature names associated with the columns of the data. The number of
        feature names must equal the number of columns of the data (a one
        dimensional array counts as one column); otherwise a ``ValueError``
        is raised.
    """

    def __init__(self, data, feature_names=None):
        # Read the column labels before the data goes through check_array.
        if hasattr(data, 'columns') and feature_names is None:
            feature_names = list(data.columns)
        data = check_array(data, ensure_2d=False)
        # ``_data`` has to exist first: the ``feature_names`` setter reads it
        # to validate the number of names.
        self._data = data
        self.feature_names = feature_names

    def __getattr__(self, name):
        # __getattr__ only runs when the normal lookup failed. If ``_data``
        # itself is the missing attribute (e.g. on a bare instance created by
        # copy/pickle before its state is restored), delegating to
        # ``self._data`` would re-enter this method forever, so fail with a
        # regular AttributeError instead of a RecursionError.
        if name == '_data':
            raise AttributeError(
                "{!r} object has no attribute '_data'".format(
                    type(self).__name__))
        return getattr(self._data, name)

    def __dir__(self):
        # Advertise both the wrapper's own attributes and the forwarded ones.
        return list(set(dir(NamedArray)).union(set(dir(self._data))))

    def __getitem__(self, slice):
        # Indexing returns whatever the wrapped data returns; no feature
        # names are attached to the result.
        return self._data[slice]

    def __repr__(self):
        prefix = self.__class__.__name__ + '('
        # NOTE(review): a reviewer flagged this method as "not tested".
        base_repr = np.array2string(self._data,
                                    prefix=prefix)
        return (prefix + base_repr
                + ',\n feature_names={})'.format(
                    str(self.feature_names)))

    def todataframe(self):
        """Returns a `pandas.DataFrame` with set column names."""
        import pandas as pd
        return pd.DataFrame(self._data, columns=self.feature_names)


class SparseNamedArrayMixin(FeatureNamesMixin):
    """Feature-name support for scipy sparse matrix subclasses.

    Intended to be listed before a scipy sparse matrix class in the bases so
    that the ``super()`` calls below reach the sparse implementation.
    """

    def __init__(self, *args, feature_names=None, **kwargs):
        # Everything except ``feature_names`` is forwarded to the next class
        # in the MRO.
        super().__init__(*args, **kwargs)
        # The matrix is its own data container; the ``feature_names`` setter
        # reads ``self._data`` to obtain the column count.
        self._data = self
        self.feature_names = feature_names

    def __repr__(self):
        base = super().__repr__()
        return base + "\nfeature names: %s" % repr(self._feature_names)

    def todataframe(self):
        """Returns a `pandas.DataFrame` with set column names."""
        import pandas
        build_frame = pandas.DataFrame.sparse.from_spmatrix
        return build_frame(self, columns=self.feature_names)


# We need a class per sparse matrix type, hence the following 7 classes.
class SparseNamedArrayCSR(SparseNamedArrayMixin, sp.sparse.csr_matrix):
    """``scipy.sparse.csr_matrix`` that also carries feature names."""


class SparseNamedArrayCSC(SparseNamedArrayMixin, sp.sparse.csc_matrix):
    """``scipy.sparse.csc_matrix`` that also carries feature names."""


class SparseNamedArrayBSR(SparseNamedArrayMixin, sp.sparse.bsr_matrix):
    """``scipy.sparse.bsr_matrix`` that also carries feature names."""


class SparseNamedArrayLIL(SparseNamedArrayMixin, sp.sparse.lil_matrix):
    """``scipy.sparse.lil_matrix`` that also carries feature names."""


class SparseNamedArrayDOK(SparseNamedArrayMixin, sp.sparse.dok_matrix):
    """``scipy.sparse.dok_matrix`` that also carries feature names."""


class SparseNamedArrayDIA(SparseNamedArrayMixin, sp.sparse.dia_matrix):
    """``scipy.sparse.dia_matrix`` that also carries feature names."""


class SparseNamedArrayCOO(SparseNamedArrayMixin, sp.sparse.coo_matrix):
    """``scipy.sparse.coo_matrix`` that also carries feature names."""


def make_namedarray(X, feature_names=None, force_sparse=None):
    """Wrap ``X`` in the named-array class matching its storage.

    Parameters
    ----------
    X : array-like or scipy sparse matrix
        Data to wrap. When it exposes a ``columns`` attribute and no
        ``feature_names`` are given, those columns are used as names.

    feature_names : list or array of strings, or None, default=None
        Names forwarded to the constructed object.

    force_sparse : str or None, default=None
        Sparse format key (``'csr'``, ``'csc'``, ``'bsr'``, ``'lil'``,
        ``'dok'``, ``'dia'`` or ``'coo'``) used when ``X`` is not already
        sparse. It is not consulted when ``X`` is sparse.

    Returns
    -------
    A ``SparseNamedArray*`` instance when a sparse format applies, otherwise
    a ``NamedArray``.
    """
    sparse_classes = {
        'csr': SparseNamedArrayCSR,
        'csc': SparseNamedArrayCSC,
        'bsr': SparseNamedArrayBSR,
        'lil': SparseNamedArrayLIL,
        'dok': SparseNamedArrayDOK,
        'dia': SparseNamedArrayDIA,
        'coo': SparseNamedArrayCOO,
    }
    if feature_names is None and hasattr(X, 'columns'):
        feature_names = list(X.columns)

    # An already sparse input keeps its own format; otherwise fall back to
    # the requested one, if any.
    if sp.sparse.issparse(X):
        sparse_format = X.format
    elif force_sparse:
        sparse_format = force_sparse
    else:
        sparse_format = None

    if not sparse_format:
        return NamedArray(X, feature_names=feature_names)

    named_cls = sparse_classes[sparse_format]
    return named_cls(X, feature_names=feature_names, copy=False)
56 changes: 56 additions & 0 deletions sklearn/utils/tests/test_namedarray.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import pytest
import numpy as np

from sklearn.utils.testing import assert_array_equal
from sklearn.utils import NamedArray


def test_basics():
    """Names are stored; arithmetic and indexing give plain results."""
    names = ['a', 'b', 'c']
    x = NamedArray(np.random.rand(5, 3), feature_names=names)
    assert_array_equal(x.feature_names, ['a', 'b', 'c'])

    # Numerical operations must not return a NamedArray.
    for result in (x + 1, x + x, x + np.ones(shape=(5, 3))):
        assert not isinstance(result, NamedArray)

    # Indexing is forwarded to the wrapped array.
    first_row = x[0, :]
    first_col = x[:, 0]
    head = x[0:2]
    assert first_row.shape == (3,)
    assert first_col.shape == (5,)
    assert head.shape == (2, 3)


def test_validation():
    """A wrong number of names is rejected; no names at all is accepted."""
    data = np.ones(shape=(3, 3))
    with pytest.raises(ValueError, match="column names provided"):
        NamedArray(data, feature_names=[1])

    # allow None as feature_names
    NamedArray(np.ones(shape=(3, 3)))


def test_getattr():
    """Attributes not defined on NamedArray come from the wrapped array."""
    x = NamedArray(np.random.rand(5, 3), feature_names=['a', 'b', 'c'])
    # These would fail if __getattr__ doesn't work; checking the values also
    # makes sure they really come from the wrapped (5, 3) array.
    assert x.ndim == 2
    assert x.shape == (5, 3)


def test_pandas():
    """``todataframe`` uses the feature names as column labels."""
    pytest.importorskip("pandas")
    names = ['a', 'b', 'c']
    x = NamedArray(np.random.rand(5, 3), feature_names=names)
    frame = x.todataframe()
    assert all(frame.columns == ['a', 'b', 'c'])


def test_1d():
    """A one dimensional array takes exactly one feature name."""
    x = NamedArray(np.array([1, 2]), feature_names=['a'])
    # ``feature_names`` is array-like, so compare it with the array helper
    # (as test_basics does) rather than asserting on an elementwise ``==``.
    assert_array_equal(x.feature_names, ['a'])


def test_repr():
    """``repr`` shows the class name, the data and the feature names."""
    x = NamedArray([[1, 2], [3, 4]], feature_names=['a', 'b'])
    expected = ("NamedArray([[1 2]\n"
                " [3 4]],\n"
                " feature_names=['a' 'b'])")
    assert repr(x) == expected


def test_numpy_attrs():
    """``dir`` of the wrapper is a strict superset of the ndarray's."""
    a = np.ones(shape=(1))
    x = NamedArray(a, feature_names='a')
    ndarray_attrs = set(dir(a))
    wrapper_attrs = set(dir(x))
    assert ndarray_attrs < wrapper_attrs