scikit-learn · adrinjalali · Jul 29, 2019 · Jul 29, 2019 · Jul 29, 2019 · Jul 29, 2019
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
@@ -30,6 +30,14 @@
                          check_random_state, column_or_1d, check_array,
                          check_consistent_length, check_X_y, indexable,
                          check_symmetric, check_scalar)
+from ._namedarray import NamedArray, make_namedarray
+from ._namedarray import (SparseNamedArrayCSR,
+                          SparseNamedArrayCSC,
+                          SparseNamedArrayBSR,
+                          SparseNamedArrayLIL,
+                          SparseNamedArrayDOK,
+                          SparseNamedArrayDIA,
+                          SparseNamedArrayCOO)
 from .. import get_config
 
 
@@ -51,7 +59,10 @@
            "check_symmetric", "indices_to_mask", "deprecated",
            "parallel_backend", "register_parallel_backend",
            "resample", "shuffle", "check_matplotlib_support", "all_estimators",
-           ]
+           "NamedArray", "make_namedarray", "SparseNamedArrayCSR",
+           "SparseNamedArrayCSC", "SparseNamedArrayBSR",
+           "SparseNamedArrayLIL", "SparseNamedArrayDOK",
+           "SparseNamedArrayDIA", "SparseNamedArrayCOO"]
 
 IS_PYPY = platform.python_implementation() == 'PyPy'
 _IS_32BIT = 8 * struct.calcsize("P") == 32

diff --git a/sklearn/utils/_namedarray.py b/sklearn/utils/_namedarray.py
@@ -0,0 +1,164 @@
+# -*- coding: utf-8 -*-
+# Authors: Adrin Jalali <adrin.jalali@gmail.com>
+#
+# License: BSD 3 clause
+
+import numpy as np
+import scipy as sp
+# ``import scipy`` alone does not guarantee that the ``sparse`` subpackage is
+# loaded; import it explicitly so that ``sp.sparse`` below always resolves.
+import scipy.sparse  # noqa: F401
+
+# NDArrayOperatorsMixin was added in numpy 1.13
+# TODO: remove this guard once the minimum supported numpy is 1.13+
+try:
+    from numpy.lib.mixins import NDArrayOperatorsMixin
+except ImportError:
+    raise NotImplementedError("In order to use NamedArray, please upgrade your"
+                              " numpy to 1.13+!")
+
+from .validation import check_array, column_or_1d
+
+
+class FeatureNamesMixin:
+    """Mixin providing a validated ``feature_names`` property.
+
+    The host object must expose the wrapped data as ``self._data`` (an object
+    with ``ndim`` and ``shape``) before ``feature_names`` is set to anything
+    other than ``None``.
+    """
+
+    def _col_count(self, value):
+        """Return the column count of ``value`` (1 for 1-d data)."""
+        return 1 if value.ndim == 1 else value.shape[1]
+
+    @property
+    def feature_names(self):
+        """Stored feature names, or ``None`` when no names are attached."""
+        return self._feature_names
+
+    @feature_names.setter
+    def feature_names(self, value):
+        # ``None`` clears the names and bypasses all validation.
+        if value is not None:
+            # A lone scalar is treated as a single name.
+            names = [value] if np.isscalar(value) else value
+            value = column_or_1d(names)
+            n_columns = self._col_count(self._data)
+            n_names = len(value)
+            if n_names != n_columns:
+                raise ValueError("{} column names provided, but data has {} "
+                                 "columns".format(n_names, n_columns))
+
+        self._feature_names = value
+
+
+class NamedArray(FeatureNamesMixin, NDArrayOperatorsMixin):
+    """A wrapper around a numpy ndarray holding feature names for its columns.
+
+    Instances of this object behave like a numpy array, and lose all metadata
+    information in numerical operations. Attribute access that is not
+    resolved on the wrapper itself is forwarded to the wrapped array, and
+    indexing returns whatever indexing the wrapped array returns.
+
+    Parameters
+    ----------
+    data : array-like
+        A one or two dimensional array like data. It is validated with
+        ``check_array(data, ensure_2d=False)``. If it has a ``columns``
+        attribute and ``feature_names`` is None, ``list(data.columns)`` is
+        used as feature names.
+
+    feature_names : list or array of strings, or None, default=None
+        Feature names associated with the columns of the data. A
+        ``ValueError`` is raised if the number of names differs from the
+        number of columns; one dimensional data counts as a single column.
+    """
+
+    def __init__(self, data, feature_names=None):
+        # Pick up column names from DataFrame-like inputs before the data is
+        # passed through ``check_array``.
+        if hasattr(data, 'columns') and feature_names is None:
+            feature_names = list(data.columns)
+        data = check_array(data, ensure_2d=False)
+        self._data = data
+        # Goes through the validating ``feature_names`` setter, which reads
+        # ``self._data`` and therefore must run after the assignment above.
+        self.feature_names = feature_names
+
+    def __getattr__(self, name):
+        # Only called when normal lookup fails. If ``_data`` itself is missing
+        # (e.g. on the bare instance that ``copy`` or ``pickle`` creates
+        # before restoring state), delegating would look up ``self._data``
+        # again and recurse until RecursionError; fail cleanly instead.
+        if name == '_data':
+            raise AttributeError(
+                "{!r} object has no attribute {!r}".format(
+                    type(self).__name__, name))
+        return getattr(self._data, name)
+
+    def __dir__(self):
+        # Also advertise the attributes reachable through ``__getattr__``.
+        return list(set(dir(NamedArray)).union(set(dir(self._data))))
+
+    def __getitem__(self, slice):
+        # Indexing is delegated to the wrapped array as is.
+        return self._data[slice]
+
+    def __repr__(self):
+        prefix = self.__class__.__name__ + '('
+        # ``prefix`` lets numpy indent continuation lines under the opening
+        # bracket.
+        base_repr = np.array2string(self._data,
+                                    prefix=prefix)
+        return (prefix + base_repr
+                + ',\n           feature_names={})'.format(
+                    str(self.feature_names)))
+
+    def todataframe(self):
+        """Returns a `pandas.DataFrame` with set column names."""
+        # Imported lazily so that pandas is only needed when this is called.
+        import pandas as pd
+        return pd.DataFrame(self._data, columns=self.feature_names)
+
+
+class SparseNamedArrayMixin(FeatureNamesMixin):
+    """Mixin adding ``feature_names`` to a sparse matrix class.
+
+    It is meant to be listed before the sparse matrix base class, so that the
+    ``super()`` calls below reach that base class.
+    """
+
+    def __init__(self, *args, feature_names=None, **kwargs):
+        # Everything except ``feature_names`` is forwarded to the next
+        # ``__init__`` in the MRO.
+        super().__init__(*args, **kwargs)
+        # The ``feature_names`` setter reads the column count from
+        # ``self._data``; here the matrix is its own data.
+        # NOTE(review): this stores a reference from the object to itself.
+        self._data = self
+        self.feature_names = feature_names
+
+    def __repr__(self):
+        names_line = "\nfeature names: %s" % repr(self._feature_names)
+        return super().__repr__() + names_line
+
+    def todataframe(self):
+        """Returns a `pandas.DataFrame` with set column names."""
+        import pandas as pd
+        build_frame = pd.DataFrame.sparse.from_spmatrix
+        return build_frame(self, columns=self.feature_names)
+
+
+# One subclass per scipy sparse matrix format, hence the following 7 classes.
+class SparseNamedArrayCSR(SparseNamedArrayMixin, sp.sparse.csr_matrix):
+    """``csr_matrix`` with ``feature_names``."""
+
+
+class SparseNamedArrayCSC(SparseNamedArrayMixin, sp.sparse.csc_matrix):
+    """``csc_matrix`` with ``feature_names``."""
+
+
+class SparseNamedArrayBSR(SparseNamedArrayMixin, sp.sparse.bsr_matrix):
+    """``bsr_matrix`` with ``feature_names``."""
+
+
+class SparseNamedArrayLIL(SparseNamedArrayMixin, sp.sparse.lil_matrix):
+    """``lil_matrix`` with ``feature_names``."""
+
+
+class SparseNamedArrayDOK(SparseNamedArrayMixin, sp.sparse.dok_matrix):
+    """``dok_matrix`` with ``feature_names``."""
+
+
+class SparseNamedArrayDIA(SparseNamedArrayMixin, sp.sparse.dia_matrix):
+    """``dia_matrix`` with ``feature_names``."""
+
+
+class SparseNamedArrayCOO(SparseNamedArrayMixin, sp.sparse.coo_matrix):
+    """``coo_matrix`` with ``feature_names``."""
+
+
+def make_namedarray(X, feature_names=None, force_sparse=None):
+    """Wrap ``X`` in the named-array class matching its storage.
+
+    Parameters
+    ----------
+    X : array-like or sparse matrix
+        Data to wrap. If it has a ``columns`` attribute and ``feature_names``
+        is None, ``list(X.columns)`` is used as feature names.
+
+    feature_names : list or array of strings, or None, default=None
+        Feature names associated with the columns of ``X``.
+
+    force_sparse : str or None, default=None
+        Sparse format ('csr', 'csc', 'bsr', 'lil', 'dok', 'dia' or 'coo') to
+        use when ``X`` is not already sparse. It is not consulted when ``X``
+        is sparse; the format of ``X`` is used in that case.
+
+    Returns
+    -------
+    named : NamedArray or one of the SparseNamedArray classes
+        A sparse named array when ``X`` is sparse or ``force_sparse`` is
+        given, a ``NamedArray`` otherwise.
+
+    Raises
+    ------
+    ValueError
+        If the selected sparse format is not one of the supported formats.
+    """
+    types = {'csr': SparseNamedArrayCSR,
+             'csc': SparseNamedArrayCSC,
+             'bsr': SparseNamedArrayBSR,
+             'lil': SparseNamedArrayLIL,
+             'dok': SparseNamedArrayDOK,
+             'dia': SparseNamedArrayDIA,
+             'coo': SparseNamedArrayCOO}
+    if hasattr(X, 'columns') and feature_names is None:
+        feature_names = list(X.columns)
+
+    # Named ``fmt`` rather than ``format`` to avoid shadowing the builtin.
+    fmt = X.format if sp.sparse.issparse(X) else force_sparse
+    if not fmt:
+        return NamedArray(X, feature_names=feature_names)
+
+    try:
+        sparse_cls = types[fmt]
+    except (KeyError, TypeError):
+        # Unknown name, or an unhashable / non-string ``force_sparse``.
+        raise ValueError("Unsupported sparse format {!r}; expected one of "
+                         "{}.".format(fmt, sorted(types)))
+    return sparse_cls(X, feature_names=feature_names, copy=False)
diff --git a/sklearn/utils/tests/test_namedarray.py b/sklearn/utils/tests/test_namedarray.py
@@ -0,0 +1,56 @@
+import pytest
+import numpy as np
+
+from sklearn.utils.testing import assert_array_equal
+from sklearn.utils import NamedArray
+
+
+def test_basics():
+    """Check stored names, arithmetic result types and indexing shapes."""
+    names = ['a', 'b', 'c']
+    x = NamedArray(np.random.rand(5, 3), feature_names=names)
+    assert_array_equal(x.feature_names, names)
+
+    # Numerical operations must not return a NamedArray.
+    for other in (1, x, np.ones(shape=(5, 3))):
+        assert not isinstance(x + other, NamedArray)
+
+    # Row, column and row-range indexing.
+    for index, expected_shape in [((0, slice(None)), (3,)),
+                                  ((slice(None), 0), (5,)),
+                                  (slice(0, 2), (2, 3))]:
+        assert x[index].shape == expected_shape
+
+
+def test_validation():
+    """A name/column count mismatch raises; omitting names is accepted."""
+    data = np.ones(shape=(3, 3))
+    with pytest.raises(ValueError, match="column names provided"):
+        NamedArray(data, feature_names=[1])
+
+    # allow None as feature_names
+    NamedArray(np.ones(shape=(3, 3)))
+
+
+def test_getattr():
+    """Attributes of the wrapped array are reachable through the wrapper."""
+    x = NamedArray(np.random.rand(5, 3), feature_names=['a', 'b', 'c'])
+    # These raise AttributeError if __getattr__ does not delegate; asserting
+    # the values also checks that the right attributes are returned.
+    assert x.ndim == 2
+    assert x.shape == (5, 3)
+
+
+def test_pandas():
+    """``todataframe`` uses the feature names as column labels."""
+    pytest.importorskip("pandas")
+    names = ['a', 'b', 'c']
+    x = NamedArray(np.random.rand(5, 3), feature_names=names)
+    frame = x.todataframe()
+    assert all(frame.columns == names)
+
+
+def test_1d():
+    """One dimensional data accepts exactly one feature name."""
+    x = NamedArray(np.array([1, 2]), feature_names=['a'])
+    # Compare element-wise instead of relying on the truthiness of the array
+    # produced by ``==``.
+    assert_array_equal(x.feature_names, ['a'])
+
+
+def test_repr():
+    """The repr shows the data followed by the feature names."""
+    x = NamedArray([[1, 2], [3, 4]], feature_names=['a', 'b'])
+    expected = ("NamedArray([[1 2]\n"
+                "            [3 4]],\n"
+                "           feature_names=['a' 'b'])")
+    assert repr(x) == expected
+
+
+def test_numpy_attrs():
+    """``dir`` of the wrapper is a strict superset of ``dir`` of the array."""
+    a = np.ones(shape=(1))
+    x = NamedArray(a, feature_names='a')
+    array_attrs, wrapper_attrs = set(dir(a)), set(dir(x))
+    assert array_attrs < wrapper_attrs