MRG added OneHotEncoder #1279

Merged 23 commits (+267 −3) on Nov 14, 2012
@@ -846,6 +846,7 @@ Pairwise metrics
preprocessing.LabelEncoder
preprocessing.MinMaxScaler
preprocessing.Normalizer
+ preprocessing.OneHotEncoder
preprocessing.StandardScaler
.. autosummary::
@@ -288,6 +288,48 @@ to be used when the transformer API is not necessary.
To avoid unnecessary memory copies, it is recommended to choose the CSR
representation upstream.
+
+Encoding categorical features
+=============================
+Often features are given not as continuous values but as categories.
+For example, a person could be described by the features
+``["male", "female"]``, ``["from Europe", "from US", "from Asia"]`` and
+``["uses Firefox", "uses Chrome", "uses Safari", "uses Internet Explorer"]``,
+taking one value from each set. Such features can be efficiently coded as
+integers: for instance, ``["male", "from US", "uses Internet Explorer"]``
+could be expressed as ``[0, 1, 3]``, while ``["female", "from Asia",
+"uses Chrome"]`` would be ``[1, 2, 1]``.
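+
+A minimal sketch of such an integer coding, using plain Python dicts (the
+particular mappings below are arbitrary choices, not fixed by scikit-learn)::
+
+ >>> genders = {"male": 0, "female": 1}
+ >>> continents = {"from Europe": 0, "from US": 1, "from Asia": 2}
+ >>> browsers = {"uses Firefox": 0, "uses Chrome": 1,
+ ... "uses Safari": 2, "uses Internet Explorer": 3}
+ >>> [genders["male"], continents["from US"],
+ ... browsers["uses Internet Explorer"]]
+ [0, 1, 3]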
+
+Such an integer representation cannot be used directly with scikit-learn
+estimators, as these expect continuous input and would interpret the
+categories as being ordered, which is often not desired (here, the set of
+browsers was ordered arbitrarily).
+
+One possibility to convert categorical features to features that can be used
+with scikit-learn estimators is to use a one-of-K or one-hot encoding, which is
+implemented in :class:`OneHotEncoder`. This estimator transforms each
+categorical feature with ``m`` possible values into ``m`` binary features, with
+only one active.
+
+Continuing the example above::
+
+ >>> enc = preprocessing.OneHotEncoder()
+ >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
+ OneHotEncoder(dtype=<type 'float'>, n_values='auto')
+ >>> enc.transform([[0, 1, 3]]).toarray()
+ array([[ 1., 0., 0., 1., 0., 0., 0., 0., 1.]])
+
+By default, how many values each feature can take is inferred automatically
+from the dataset; it can also be specified explicitly using the parameter
+``n_values``. In our dataset there are two genders, three possible continents
+and four web browsers. We fit the estimator on the four training points above
+and then transform a data point. In the result, the first two numbers encode
+the gender, the next three the continent and the last four the web browser.
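+
+For instance, the same encoding could be requested explicitly through
+``n_values`` (a sketch based on the parameter introduced in this pull
+request; the exact repr line shown is indicative)::
+
+ >>> enc = preprocessing.OneHotEncoder(n_values=[2, 3, 4])
+ >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
+ OneHotEncoder(dtype=<type 'float'>, n_values=[2, 3, 4])
+ >>> enc.transform([[0, 1, 3]]).toarray()
+ array([[ 1., 0., 0., 1., 0., 0., 0., 0., 1.]])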
+
+See :ref:`dict_feature_extraction` for categorical features that are represented
+as a dict, not as integers.
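+
+As a brief illustration (not part of this diff; exact output formatting may
+vary), :class:`~sklearn.feature_extraction.DictVectorizer` performs a similar
+one-hot encoding directly on dicts of strings::
+
+ >>> from sklearn.feature_extraction import DictVectorizer
+ >>> vec = DictVectorizer()
+ >>> measurements = [{'gender': 'male', 'browser': 'uses Chrome'},
+ ... {'gender': 'female', 'browser': 'uses Safari'}]
+ >>> vec.fit_transform(measurements).toarray()
+ array([[ 1., 0., 0., 1.],
+ [ 0., 1., 1., 0.]])
+ >>> vec.get_feature_names()
+ ['browser=uses Chrome', 'browser=uses Safari', 'gender=female', 'gender=male']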
+
+
Label preprocessing
===================
@@ -41,7 +41,7 @@ Changelog
of several transformers by `Andreas Müller`_.
- Faster and more robust :func:`metrics.confusion_matrix` and
- :ref:`clustering_evaluation`_ by Wei Li.
+ :ref:`clustering_evaluation` by Wei Li.
- New estimator :class:`decomposition.FactorAnalysis` by
`Christian Osendorfer`_ and `Alexandre Gramfort`_
@@ -55,6 +55,9 @@ Changelog
regressors too correlated as well as to stop the path when
numerical noise becomes predominant, by `Gael Varoquaux`_.
+ - New estimator :class:`preprocessing.OneHotEncoder` to compute
+ binary encodings of categorical features by `Andreas Müller`_.
@ogrisel (Member) commented on Nov 10, 2012:
This item is there twice.

+
API changes summary
-------------------
@@ -1,9 +1,12 @@
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Mathieu Blondel <mathieu@mblondel.org>
# Olivier Grisel <olivier.grisel@ensta.org>
+# Andreas Mueller <amueller@ais.uni-bonn.de>
# License: BSD
+
from collections import Sequence
import warnings
+import numbers
import numpy as np
import scipy.sparse as sp
@@ -602,6 +605,167 @@ def _is_multilabel(y):
or _is_label_indicator_matrix(y)
+class OneHotEncoder(BaseEstimator, TransformerMixin):
+ """Encode categorical integer features using a one-hot aka one-of-K scheme.
+
+ The input to this transformer should be a matrix of integers, denoting
+ the values taken on by categorical (discrete) features. The output will be
+ a sparse matrix where each column corresponds to one possible value of one
+ feature. It is assumed that input features take on values in the range
+ [0, n_values).
+
+ This encoding is needed for feeding categorical data to scikit-learn
+ estimators.
+
+ Parameters
+ ----------
+ n_values : 'auto', int or array of int
+ Number of values per feature.
+ 'auto' : determine value range from training data.
+ int : maximum value for all features.
+ array : maximum value per feature.
+
+ dtype : number type, default=np.float
+ Desired dtype of output.
+
+ Attributes
+ ----------
+ `active_features_` : array
+ Indices for active features, meaning values that actually occur in the
+ training set. Only available when n_values is ``'auto'``.
+ `feature_indices_` : array of shape (n_features,)
+ Indices to feature ranges. Feature ``i`` in the original data is mapped
+ to features ``feature_indices_[i]`` to ``feature_indices_[i+1]``
+ (and potentially masked by `active_features_` afterwards).
+ `n_values_` : array of shape (n_features,)
+ Maximum number of values per feature.
+
+ Examples
+ --------
+ Given a dataset with three features and four samples, we let the encoder
+ find the maximum value per feature and transform the data to a binary
+ one-hot encoding.
+
+ >>> from sklearn.preprocessing import OneHotEncoder
+ >>> enc = OneHotEncoder()
+ >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
+ OneHotEncoder(dtype=<type 'float'>, n_values='auto')
+ >>> enc.n_values_
+ array([2, 3, 4])
+ >>> enc.feature_indices_
+ array([0, 2, 5, 9])
+ >>> enc.transform([[0, 1, 1]]).toarray()
+ array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0.]])
+
+ See also
+ --------
+ LabelEncoder : encodes class labels with values between 0 and n_classes-1.
+ sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
+ dictionary items (also handles string-valued features).
+ """
+ def __init__(self, n_values="auto", dtype=np.float):
+ self.n_values = n_values
+ self.dtype = dtype
+
+ def fit(self, X, y=None):
+ """Fit OneHotEncoder to X.
+
+ Parameters
+ ----------
+ X : array-like, shape=(n_samples, n_features)
+ Input array of type int.
+
+ Returns
+ -------
+ self
+ """
+ self.fit_transform(X)
+ return self
+
+ def fit_transform(self, X, y=None):
+ """Fit OneHotEncoder to X, then transform X.
+
+ Equivalent to self.fit(X).transform(X), but more convenient and more
+ efficient. See fit for the parameters, transform for the return value.
+ """
+ X = check_arrays(X, sparse_format='dense', dtype=np.int)[0]
+ if np.any(X < 0):
+ raise ValueError("X needs to contain only non-negative integers.")
+ n_samples, n_features = X.shape
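+ # Determine the number of values per feature: inferred from the data
+ # ('auto'), one int shared by all features, or an array with one int
+ # per feature.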
+ if self.n_values == 'auto':
+ n_values = np.max(X, axis=0) + 1
+ elif isinstance(self.n_values, numbers.Integral):
+ n_values = np.empty(n_features, dtype=np.int)
+ n_values.fill(self.n_values)
+ else:
+ try:
+ n_values = np.asarray(self.n_values, dtype=int)
+ except (ValueError, TypeError):
+ raise TypeError("Wrong type for parameter `n_values`."
+ " Expected 'auto', int or array of ints, got %r"
+ % type(self.n_values))
+ if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]:
+ raise ValueError("Shape mismatch: if n_values is "
+ "an array, it has to be of shape (n_features,).")
+ self.n_values_ = n_values
+ n_values = np.hstack([[0], n_values])
+ indices = np.cumsum(n_values)
+ self.feature_indices_ = indices
+
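+ # Offset each feature's integer value by the start of that feature's
+ # block of columns, then set one nonzero entry per (sample, feature)
+ # pair via a COO matrix.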
+ column_indices = (X + indices[:-1]).ravel()
+ row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
+ n_features)
+ data = np.ones(n_samples * n_features)
+ out = sp.coo_matrix((data, (row_indices, column_indices)),
+ shape=(n_samples, indices[-1]), dtype=self.dtype).tocsr()
+
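+ # In 'auto' mode, keep only the columns for values that actually
+ # occur in the training data.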
+ if self.n_values == 'auto':
+ mask = np.array(out.sum(axis=0)).ravel() != 0
+ active_features = np.where(mask)[0]
+ out = out[:, active_features]
+ self.active_features_ = active_features
+
+ return out
+
+ def transform(self, X):
+ """Transform X using one-hot encoding.
+
+ Parameters
+ ----------
+ X : array-like, shape=(n_samples, n_features)
+ Input array of type int.
+
+ Returns
+ -------
+ X_out : sparse matrix
+ Transformed input, with dtype given by the ``dtype`` parameter.
+ """
+ X = check_arrays(X, sparse_format='dense', dtype=np.int)[0]
+ if np.any(X < 0):
+ raise ValueError("X needs to contain only non-negative integers.")
+ n_samples, n_features = X.shape
+
+ indices = self.feature_indices_
+ if n_features != indices.shape[0] - 1:
+ raise ValueError("X has different shape than during fitting."
+ " Expected %d, got %d."
+ % (indices.shape[0] - 1, n_features))
+
+ n_values_check = np.max(X, axis=0) + 1
+ if (n_values_check > self.n_values_).any():
+ raise ValueError("Feature out of bounds. Try setting n_values.")
+
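+ # Same sparse construction as in fit_transform, reusing the column
+ # offsets computed during fit.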
+ column_indices = (X + indices[:-1]).ravel()
+ row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
+ n_features)
+ data = np.ones(n_samples * n_features)
+ out = sp.coo_matrix((data, (row_indices, column_indices)),
+ shape=(n_samples, indices[-1]), dtype=self.dtype).tocsr()
+ if self.n_values == 'auto':
+ out = out[:, self.active_features_]
+ return out
+
+
class LabelEncoder(BaseEstimator, TransformerMixin):
"""Encode labels with value between 0 and n_classes-1.
@@ -43,14 +43,14 @@
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.kernel_approximation import AdditiveChi2Sampler
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, Binarizer, \
- Normalizer
+ Normalizer, OneHotEncoder
from sklearn.cluster import WardAgglomeration, AffinityPropagation, \
SpectralClustering
from sklearn.linear_model import IsotonicRegression
dont_test = [Pipeline, FeatureUnion, GridSearchCV, SparseCoder,
EllipticEnvelope, EllipticEnvelop, DictVectorizer, LabelBinarizer,
- LabelEncoder, TfidfTransformer, IsotonicRegression]
+ LabelEncoder, TfidfTransformer, IsotonicRegression, OneHotEncoder]
meta_estimators = [BaseEnsemble, OneVsOneClassifier, OutputCodeClassifier,
OneVsRestClassifier, RFE, RFECV]
@@ -14,6 +14,7 @@
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import KernelCenterer
from sklearn.preprocessing import LabelBinarizer
+from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import normalize
@@ -448,6 +449,59 @@ def test_label_binarizer_errors():
assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=2)
+def test_one_hot_encoder():
+ """Test OneHotEncoder's fit and transform."""
+ X = [[3, 2, 1], [0, 1, 1]]
+ enc = OneHotEncoder()
+ # discover max values automatically
+ X_trans = enc.fit_transform(X).toarray()
+ assert_equal(X_trans.shape, (2, 5))
+ assert_array_equal(enc.active_features_,
+ np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
+ assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])
+
+ # check outcome
+ assert_array_equal(X_trans,
+ [[0., 1., 0., 1., 1.],
+ [1., 0., 1., 0., 1.]])
+
+ # number of values given explicitly as 4 per feature (max value 3)
+ enc = OneHotEncoder(n_values=4)
+ X_trans = enc.fit_transform(X)
+ assert_equal(X_trans.shape, (2, 4 * 3))
+ assert_array_equal(enc.feature_indices_, [0, 4, 8, 12])
+
+ # max value given per feature
+ enc = OneHotEncoder(n_values=[3, 2, 2])
+ X = [[1, 0, 1], [0, 1, 1]]
+ X_trans = enc.fit_transform(X)
+ assert_equal(X_trans.shape, (2, 3 + 2 + 2))
+ assert_array_equal(enc.n_values_, [3, 2, 2])
+ # check that transform accepts values unseen during fit but within n_values:
+ X = np.array([[2, 0, 1], [0, 1, 1]])
+ enc.transform(X)
+
+ # test that an error is raised when out of bounds:
+ X_too_large = [[0, 2, 1], [0, 1, 1]]
+ assert_raises(ValueError, enc.transform, X_too_large)
+
+ # test that error is raised when wrong number of features
+ assert_raises(ValueError, enc.transform, X[:, :-1])
+ # test that error is raised when wrong number of features in fit
+ # with prespecified n_values
+ assert_raises(ValueError, enc.fit, X[:, :-1])
+ # test exception on wrong init param
+ assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X)
+
+ enc = OneHotEncoder()
+ # test negative input to fit
+ assert_raises(ValueError, enc.fit, [[0], [-1]])
+
+ # test negative input to transform
+ enc.fit([[0], [1]])
+ assert_raises(ValueError, enc.transform, [[0], [-1]])
+
+
def test_label_encoder():
"""Test LabelEncoder's transform and inverse_transform methods"""
le = LabelEncoder()