scikit-learn · jnothman · Aug 1, 2020 · Apr 16, 2017 · Apr 16, 2017 · Apr 16, 2017
diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst
@@ -56,6 +56,27 @@ is a traditional numerical feature::
   >>> vec.get_feature_names()
   ['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']
 
+:class:`DictVectorizer` accepts multiple string values for one
+feature, e.g., multiple categories for a movie.
+
+Assume a database classifies each movie using some categories (not mandatory)
+and its year of release.
+
+    >>> movie_entry = [{'category': ['thriller', 'drama'], 'year': 2003},
+    ...                {'category': ['animation', 'family'], 'year': 2011},
+    ...                {'year': 1974}]
+    >>> vec.fit_transform(movie_entry).toarray()
+    array([[0.000e+00, 1.000e+00, 0.000e+00, 1.000e+00, 2.003e+03],
+           [1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 2.011e+03],
+           [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.974e+03]])
+    >>> vec.get_feature_names() == ['category=animation', 'category=drama',
+    ...                             'category=family', 'category=thriller',
+    ...                             'year']
+    True
+    >>> vec.transform({'category': ['thriller'],
+    ...                'unseen_feature': '3'}).toarray()
+    array([[0., 0., 0., 1., 0.]])
+
 :class:`DictVectorizer` is also a useful representation transformation
 for training sequence classifiers in Natural Language Processing models
 that typically work by extracting feature windows around a particular

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
@@ -162,6 +162,13 @@ Changelog
   :class:`exceptions.NonBLASDotWarning` are deprecated and will be removed in
   v0.26, :pr:`17804` by `Adrin Jalali`_.
 
+:mod:`sklearn.feature_extraction`
+.................................
+
+- |Enhancement| :class:`feature_extraction.DictVectorizer` accepts multiple
+  values for one categorical feature. :pr:`17367` by :user:`Peng Yu <yupbank>`
+  and :user:`Chiara Marmo <cmarmo>`.
+
 :mod:`sklearn.feature_selection`
 ................................
 

diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py
@@ -3,8 +3,9 @@
 # License: BSD 3 clause
 
 from array import array
-from collections.abc import Mapping
+from collections.abc import Mapping, Iterable
 from operator import itemgetter
+from numbers import Number
 
 import numpy as np
 import scipy.sparse as sp
@@ -35,10 +36,15 @@ class DictVectorizer(TransformerMixin, BaseEstimator):
     a feature "f" that can take on the values "ham" and "spam" will become two
     features in the output, one signifying "f=ham", the other "f=spam".
 
+    If a feature value is a sequence or set of strings, this transformer
+    will iterate over the values and will count the occurrences of each string
+    value.
+
     However, note that this transformer will only do a binary one-hot encoding
     when feature values are of type string. If categorical features are
-    represented as numeric values such as int, the DictVectorizer can be
-    followed by :class:`~sklearn.preprocessing.OneHotEncoder` to complete
+    represented as numeric values such as int or iterables of strings, the
+    DictVectorizer can be followed by
+    :class:`~sklearn.preprocessing.OneHotEncoder` to complete
     binary one-hot encoding.
 
     Features that do not occur in a sample (mapping) will have a zero value
@@ -78,8 +84,8 @@ class DictVectorizer(TransformerMixin, BaseEstimator):
     >>> X
     array([[2., 0., 1.],
            [0., 1., 3.]])
-    >>> v.inverse_transform(X) == \
-        [{'bar': 2.0, 'foo': 1.0}, {'baz': 1.0, 'foo': 3.0}]
+    >>> v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0},
+    ...                            {'baz': 1.0, 'foo': 3.0}]
     True
     >>> v.transform({'foo': 4, 'unseen_feature': 3})
     array([[0., 0., 4.]])
@@ -98,6 +104,28 @@ def __init__(self, *, dtype=np.float64, separator="=", sparse=True,
         self.sparse = sparse
         self.sort = sort
 
+    def _add_iterable_element(self, f, v, feature_names, vocab, *,
+                              fitting=True, transforming=False,
+                              indices=None, values=None):
+        """Add feature names for iterable of strings"""
+        for vv in v:
+            if isinstance(vv, str):
+                feature_name = "%s%s%s" % (f, self.separator, vv)
+                vv = 1
+            else:
+                raise TypeError(f'Unsupported type {type(vv)} in iterable '
+                                'value. Only iterables of string are '
+                                'supported.')
+            if fitting and feature_name not in vocab:
+                vocab[feature_name] = len(feature_names)
+                feature_names.append(feature_name)
+
+            if transforming and feature_name in vocab:
+                indices.append(vocab[feature_name])
+                values.append(self.dtype(vv))
+
+        return
+
     def fit(self, X, y=None):
         """Learn a list of feature name -> indices mappings.
 
@@ -106,6 +134,10 @@ def fit(self, X, y=None):
         X : Mapping or iterable over Mappings
             Dict(s) or Mapping(s) from feature names (arbitrary Python
             objects) to feature values (strings or convertible to dtype).
+
+            .. versionchanged:: 0.24
+               Accepts multiple string values for one categorical feature.
+
         y : (ignored)
 
         Returns
@@ -118,10 +150,22 @@ def fit(self, X, y=None):
         for x in X:
             for f, v in x.items():
                 if isinstance(v, str):
-                    f = "%s%s%s" % (f, self.separator, v)
-                if f not in vocab:
-                    feature_names.append(f)
-                    vocab[f] = len(vocab)
+                    feature_name = "%s%s%s" % (f, self.separator, v)
+                    v = 1
+                elif isinstance(v, Number) or (v is None):
+                    feature_name = f
+                elif isinstance(v, Mapping):
+                    raise TypeError(f'Unsupported value type {type(v)} '
+                                    f'for {f}: {v}.\n'
+                                    'Mapping objects are not supported.')
+                elif isinstance(v, Iterable):
+                    feature_name = None
+                    self._add_iterable_element(f, v, feature_names, vocab)
+
+                if feature_name is not None:
+                    if feature_name not in vocab:
+                        vocab[feature_name] = len(feature_names)
+                        feature_names.append(feature_name)
 
         if self.sort:
             feature_names.sort()
@@ -150,6 +194,8 @@ def _transform(self, X, fitting):
             feature_names = self.feature_names_
             vocab = self.vocabulary_
 
+        transforming = True
+
         # Process everything as sparse regardless of setting
         X = [X] if isinstance(X, Mapping) else X
 
@@ -164,17 +210,29 @@ def _transform(self, X, fitting):
         for x in X:
             for f, v in x.items():
                 if isinstance(v, str):
-                    f = "%s%s%s" % (f, self.separator, v)
+                    feature_name = "%s%s%s" % (f, self.separator, v)
                     v = 1
-                if f in vocab:
-                    indices.append(vocab[f])
-                    values.append(dtype(v))
-                else:
-                    if fitting:
-                        feature_names.append(f)
-                        vocab[f] = len(vocab)
-                        indices.append(vocab[f])
-                        values.append(dtype(v))
+                elif isinstance(v, Number) or (v is None):
+                    feature_name = f
+                elif isinstance(v, Mapping):
+                    raise TypeError(f'Unsupported value Type {type(v)} '
+                                    f'for {f}: {v}.\n'
+                                    'Mapping objects are not supported.')
+                elif isinstance(v, Iterable):
+                    feature_name = None
+                    self._add_iterable_element(f, v, feature_names, vocab,
+                                               fitting=fitting,
+                                               transforming=transforming,
+                                               indices=indices, values=values)
+
+                if feature_name is not None:
+                    if fitting and feature_name not in vocab:
+                        vocab[feature_name] = len(feature_names)
+                        feature_names.append(feature_name)
+
+                    if feature_name in vocab:
+                        indices.append(vocab[feature_name])
+                        values.append(self.dtype(v))
 
             indptr.append(len(indices))
 
@@ -218,6 +276,10 @@ def fit_transform(self, X, y=None):
         X : Mapping or iterable over Mappings
             Dict(s) or Mapping(s) from feature names (arbitrary Python
             objects) to feature values (strings or convertible to dtype).
+
+            .. versionchanged:: 0.24
+               Accepts multiple string values for one categorical feature.
+
         y : (ignored)
 
         Returns

diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py
@@ -76,6 +76,52 @@ def test_one_of_k():
     assert "version" not in names
 
 
+def test_iterable_value():
+    D_names = ['ham', 'spam', 'version=1', 'version=2', 'version=3']
+    X_expected = [[2.0, 0.0, 2.0, 1.0, 0.0],
+                  [0.0, 0.3, 0.0, 1.0, 0.0],
+                  [0.0, -1.0, 0.0, 0.0, 1.0]]
+    D_in = [{"version": ["1", "2", "1"], "ham": 2},
+            {"version": "2", "spam": .3},
+            {"version=3": True, "spam": -1}]
+    v = DictVectorizer()
+    X = v.fit_transform(D_in)
+    X = X.toarray()
+    assert_array_equal(X, X_expected)
+
+    D_out = v.inverse_transform(X)
+    assert D_out[0] == {"version=1": 2, "version=2": 1, "ham": 2}
+
+    names = v.get_feature_names()
+
+    assert names == D_names
+
+
+def test_iterable_not_string_error():
+    error_value = ("Unsupported type <class 'int'> in iterable value. "
+                   "Only iterables of string are supported.")
+    D2 = [{'foo': '1', 'bar': '2'},
+          {'foo': '3', 'baz': '1'},
+          {'foo': [1, 'three']}]
+    v = DictVectorizer(sparse=False)
+    with pytest.raises(TypeError) as error:
+        v.fit(D2)
+    assert str(error.value) == error_value
+
+
+def test_mapping_error():
+    error_value = ("Unsupported value type <class 'dict'> "
+                   "for foo: {'one': 1, 'three': 3}.\n"
+                   "Mapping objects are not supported.")
+    D2 = [{'foo': '1', 'bar': '2'},
+          {'foo': '3', 'baz': '1'},
+          {'foo': {'one': 1, 'three': 3}}]
+    v = DictVectorizer(sparse=False)
+    with pytest.raises(TypeError) as error:
+        v.fit(D2)
+    assert str(error.value) == error_value
+
+
 def test_unseen_or_no_features():
     D = [{"camelot": 0, "spamalot": 1}]
     for sparse in [True, False]: