scikit-learn · jnothman · May 29, 2018 · Jun 5, 2015 · Jun 6, 2017 · Jun 6, 2017
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
@@ -1381,6 +1381,30 @@ Low-level methods
    utils.validation.column_or_1d
    utils.validation.has_fit_parameter
 
+
+.. _experimental_ref:
+
+:mod:`sklearn.experimental`: Experimental functionality
+=======================================================
+
+.. automodule:: sklearn.experimental
+   :no-members:
+   :no-inherited-members:
+
+.. currentmodule:: sklearn
+
+.. autosummary::
+   :toctree: generated/
+   :template: class.rst
+
+   experimental.ColumnTransformer
+
+.. autosummary::
+   :toctree: generated/
+   :template: function.rst
+
+   experimental.make_column_transformer
+
 Recently deprecated
 ===================
 

diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst
@@ -101,6 +101,91 @@ memory the ``DictVectorizer`` class uses a ``scipy.sparse`` matrix by
 default instead of a ``numpy.ndarray``.
 
 
+.. _column_transformer:
+
+ColumnTransformer for heterogeneous data
+========================================
+
+.. warning::
+
+    The :class:`experimental.ColumnTransformer <sklearn.experimental.ColumnTransformer>`
+    class is experimental and the API is subject to change.
+
+Many datasets contain features of different types, say text, floats, and dates,
+where each type of feature requires separate preprocessing or feature
+extraction steps.  Often it is easiest to preprocess data before applying
+scikit-learn methods, for example using `pandas <http://pandas.pydata.org/>`__.
+Processing your data before passing it to scikit-learn might be problematic for
+one of the following reasons:
+
+1. Incorporating statistics from test data into the preprocessors makes
+   cross-validation scores unreliable (known as *data leakage*).
+2. You may want to include the parameters of the preprocessors in a
+   :ref:`parameter search <grid_search>`.
+
+:class:`~sklearn.experimental.ColumnTransformer` helps perform different
+transformations for different columns of the data, within a
+:class:`~sklearn.pipeline.Pipeline` that is safe from data leakage and that can
+be parametrized. :class:`~sklearn.experimental.ColumnTransformer` works on
+arrays, sparse matrices, and
+`pandas DataFrames <http://pandas.pydata.org/pandas-docs/stable/>`__.
+
+To each column, a different transformation can be applied, such as
+preprocessing or a specific feature extraction method::
+
+  >>> import pandas as pd
+  >>> X = pd.DataFrame(
+  ...     {'city': ['London', 'London', 'Paris', 'Sallisaw'],
+  ...      'title': ["His Last Bow", "How Watson Learned the Trick",
+  ...                "A Moveable Feast", "The Grapes of Wrath"]})
+
+For this data, we might want to encode the ``'city'`` column as a categorical
+variable, but apply a :class:`feature_extraction.text.CountVectorizer
+<sklearn.feature_extraction.text.CountVectorizer>` to the ``'title'`` column.
+As we might use multiple feature extraction methods on the same column, we give
+each transformer a unique name, say ``'city_category'`` and ``'title_bow'``::
+
+  >>> from sklearn.experimental import ColumnTransformer
+  >>> from sklearn.feature_extraction.text import CountVectorizer
+  >>> column_trans = ColumnTransformer(
+  ...     [('city_category', CountVectorizer(analyzer=lambda x: [x]), 'city'),
+  ...      ('title_bow', CountVectorizer(), 'title')])
+
+  >>> column_trans.fit(X) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
+  ColumnTransformer(n_jobs=1, transformer_weights=None,
+      transformers=...)
+
+  >>> column_trans.get_feature_names()
+  ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
+  ['city_category__London', 'city_category__Paris', 'city_category__Sallisaw',
+  'title_bow__bow', 'title_bow__feast', 'title_bow__grapes', 'title_bow__his',
+  'title_bow__how', 'title_bow__last', 'title_bow__learned', 'title_bow__moveable',
+  'title_bow__of', 'title_bow__the', 'title_bow__trick', 'title_bow__watson',
+  'title_bow__wrath']
+
+  >>> column_trans.transform(X).toarray()
+  ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
+  array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
+         [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0],
+         [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
+         [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1]]...)
+
+In the above example, the
+:class:`~sklearn.feature_extraction.text.CountVectorizer` expects a 1D array as
+input and therefore the columns were specified as a string (``'city'``).
+However, other transformers generally expect 2D data, and in that case you need
+to specify the column as a list of strings (``['city']``).
+
+Apart from a scalar or a single item list, the column selection can be specified
+as a list of multiple items, an integer array, a slice, or a boolean mask.
+Strings can reference columns if the input is a DataFrame; integers are always
+interpreted as positional columns.
+
+.. topic:: Examples:
+
+ * :ref:`sphx_glr_auto_examples_column_transformer.py`
+
+
 .. _feature_hashing:
 
 Feature hashing
@@ -916,7 +1001,7 @@ Some tips and tricks:
     (Note that this will not filter out punctuation.)
 
 
-    The following example will, for instance, transform some British spelling 
+    The following example will, for instance, transform some British spelling
     to American spelling::
 
         >>> import re

diff --git a/doc/modules/feature_extraction_fixture.py b/doc/modules/feature_extraction_fixture.py
@@ -0,0 +1,12 @@
+"""Fixture module to skip the feature_extraction docs when pandas is not
+installed
+
+"""
+from sklearn.utils.testing import SkipTest
+
+
+def setup(module):
+    try:
+        import pandas  # noqa
+    except ImportError:
+        raise SkipTest("pandas not installed")
diff --git a/doc/modules/pipeline.rst b/doc/modules/pipeline.rst
@@ -220,9 +220,13 @@ FeatureUnion: composite feature spaces
 :class:`FeatureUnion` combines several transformer objects into a new
 transformer that combines their output. A :class:`FeatureUnion` takes
 a list of transformer objects. During fitting, each of these
-is fit to the data independently. For transforming data, the
-transformers are applied in parallel, and the sample vectors they output
-are concatenated end-to-end into larger vectors.
+is fit to the data independently. The transformers are applied in parallel,
+and the feature matrices they output are concatenated side-by-side into a
+larger matrix.
+
+To apply different transformations to each field of the data, use the
+related class :class:`sklearn.experimental.ColumnTransformer`
+(see the :ref:`user guide <column_transformer>`).
 
 :class:`FeatureUnion` serves the same purposes as :class:`Pipeline` -
 convenience and joint parameter estimation and validation.
@@ -272,5 +276,4 @@ and ignored by setting to ``None``::
 
 .. topic:: Examples:
 
- * :ref:`sphx_glr_auto_examples_plot_feature_stacker.py`
- * :ref:`sphx_glr_auto_examples_hetero_feature_union.py`
+ * :ref:`sphx_glr_auto_examples_feature_stacker.py`
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -158,6 +158,36 @@ Other estimators
 
 Model selection and evaluation
 
+   - :class:`model_selection.GridSearchCV` and
+     :class:`model_selection.RandomizedSearchCV` now support simultaneous
+     evaluation of multiple metrics. Refer to the
+     :ref:`multimetric_grid_search` section of the user guide for more
+     information. :issue:`7388` by `Raghav RV`_
+
+   - Added the :func:`model_selection.cross_validate` which allows evaluation
+     of multiple metrics. This function returns a dict with more useful
+     information from cross-validation such as the train scores, fit times and
+     score times.
+     Refer to the :ref:`multimetric_cross_validation` section of the user guide
+     for more information. :issue:`7388` by `Raghav RV`_
+
+   - Added :func:`metrics.mean_squared_log_error`, which computes
+     the mean square error of the logarithmic transformation of targets,
+     particularly useful for targets with an exponential trend.
+     :issue:`7655` by :user:`Karan Desai <karandesai-96>`.
+
+   - Added :class:`experimental.ColumnTransformer`, which allows applying
+     different transformers to different columns of arrays or pandas
+     DataFrames. By `Andreas Müller`_ and `Joris Van den Bossche`_.
+
+   - Added :func:`metrics.dcg_score` and :func:`metrics.ndcg_score`, which
+     compute Discounted cumulative gain (DCG) and Normalized discounted
+     cumulative gain (NDCG).
+     :issue:`7739` by :user:`David Gasquez <davidgasquez>`.
+
+   - Added the :class:`model_selection.RepeatedKFold` and
+     :class:`model_selection.RepeatedStratifiedKFold`.
+     :issue:`8120` by `Neeraj Gangwar`_.
 - :class:`model_selection.GridSearchCV` and
   :class:`model_selection.RandomizedSearchCV` now support simultaneous
   evaluation of multiple metrics. Refer to the
@@ -5741,7 +5771,11 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Vincent Pham: https://github.com/vincentpham1991
 
 .. _Denis Engemann: http://denis-engemann.de
+
 .. _Anish Shah: https://github.com/AnishShah
 
 .. _Neeraj Gangwar: http://neerajgangwar.in
+
 .. _Arthur Mensch: https://amensch.fr
+
+.. _Joris Van den Bossche: https://github.com/jorisvandenbossche
diff --git a/examples/hetero_feature_union.py → examples/column_transformer.py b/examples/hetero_feature_union.py → examples/column_transformer.py
@@ -12,12 +12,12 @@
    require different processing pipelines.
 
 This example demonstrates how to use
-:class:`sklearn.feature_extraction.FeatureUnion` on a dataset containing
+:class:`sklearn.experimental.ColumnTransformer` on a dataset containing
 different types of features.  We use the 20-newsgroups dataset and compute
 standard bag-of-words features for the subject line and body in separate
 pipelines as well as ad hoc features on the body. We combine them (with
-weights) using a FeatureUnion and finally train a classifier on the combined
-set of features.
+weights) using a ColumnTransformer and finally train a classifier on the
+combined set of features.
 
 The choice of features is not particularly helpful, but serves to illustrate
 the technique.
@@ -38,50 +38,11 @@
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics import classification_report
-from sklearn.pipeline import FeatureUnion
 from sklearn.pipeline import Pipeline
+from sklearn.experimental import ColumnTransformer
 from sklearn.svm import SVC
 
 
-class ItemSelector(BaseEstimator, TransformerMixin):
-    """For data grouped by feature, select subset of data at a provided key.
-
-    The data is expected to be stored in a 2D data structure, where the first
-    index is over features and the second is over samples.  i.e.
-
-    >> len(data[key]) == n_samples
-
-    Please note that this is the opposite convention to scikit-learn feature
-    matrixes (where the first index corresponds to sample).
-
-    ItemSelector only requires that the collection implement getitem
-    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
-    DataFrame, numpy record array, etc.
-
-    >> data = {'a': [1, 5, 2, 5, 2, 8],
-               'b': [9, 4, 1, 4, 1, 3]}
-    >> ds = ItemSelector(key='a')
-    >> data['a'] == ds.transform(data)
-
-    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
-    list of dicts).  If your data is structured this way, consider a
-    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.
-
-    Parameters
-    ----------
-    key : hashable, required
-        The key corresponding to the desired value in a mappable.
-    """
-    def __init__(self, key):
-        self.key = key
-
-    def fit(self, x, y=None):
-        return self
-
-    def transform(self, data_dict):
-        return data_dict[self.key]
-
-
 class TextStats(BaseEstimator, TransformerMixin):
     """Extract features from each document for DictVectorizer"""
 
@@ -104,21 +65,22 @@ def fit(self, x, y=None):
         return self
 
     def transform(self, posts):
-        features = np.recarray(shape=(len(posts),),
-                               dtype=[('subject', object), ('body', object)])
+        # construct object dtype array with two columns
+        # first column = 'subject' and second column = 'body'
+        features = np.empty(shape=(len(posts), 2), dtype=object)
         for i, text in enumerate(posts):
             headers, _, bod = text.partition('\n\n')
             bod = strip_newsgroup_footer(bod)
             bod = strip_newsgroup_quoting(bod)
-            features['body'][i] = bod
+            features[i, 1] = bod
 
             prefix = 'Subject:'
             sub = ''
             for line in headers.split('\n'):
                 if line.startswith(prefix):
                     sub = line[len(prefix):]
                     break
-            features['subject'][i] = sub
+            features[i, 0] = sub
 
         return features
 
@@ -128,37 +90,30 @@ def transform(self, posts):
     ('subjectbody', SubjectBodyExtractor()),
 
     # Use FeatureUnion to combine the features from subject and body
-    ('union', FeatureUnion(
-        transformer_list=[
+    ('union', ColumnTransformer(
+        [
+            # Pulling features from the post's subject line (first column)
+            ('subject', TfidfVectorizer(min_df=50), 0),
 
-            # Pipeline for pulling features from the post's subject line
-            ('subject', Pipeline([
-                ('selector', ItemSelector(key='subject')),
-                ('tfidf', TfidfVectorizer(min_df=50)),
-            ])),
-
-            # Pipeline for standard bag-of-words model for body
+            # Pipeline for standard bag-of-words model for body (second column)
             ('body_bow', Pipeline([
-                ('selector', ItemSelector(key='body')),
                 ('tfidf', TfidfVectorizer()),
                 ('best', TruncatedSVD(n_components=50)),
-            ])),
+            ]), 1),
 
             # Pipeline for pulling ad hoc features from post's body
             ('body_stats', Pipeline([
-                ('selector', ItemSelector(key='body')),
                 ('stats', TextStats()),  # returns a list of dicts
                 ('vect', DictVectorizer()),  # list of dicts -> feature matrix
-            ])),
-
+            ]), 1),
         ],
 
         # weight components in FeatureUnion
         transformer_weights={
             'subject': 0.8,
             'body_bow': 0.5,
             'body_stats': 1.0,
-        },
+        }
     )),
 
     # Use a SVC classifier on the combined features

diff --git a/sklearn/__init__.py b/sklearn/__init__.py
@@ -135,15 +135,16 @@ def config_context(**new_config):
     __check_build  # avoid flakes unused variable error
 
     __all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition',
-               'cross_validation', 'datasets', 'decomposition', 'dummy',
-               'ensemble', 'exceptions', 'externals', 'feature_extraction',
+               'cross_validation', 'datasets', 'decomposition',
+               'discriminant_analysis', 'dummy', 'ensemble', 'exceptions',
+               'experimental', 'externals', 'feature_extraction',
                'feature_selection', 'gaussian_process', 'grid_search',
                'isotonic', 'kernel_approximation', 'kernel_ridge',
                'learning_curve', 'linear_model', 'manifold', 'metrics',
                'mixture', 'model_selection', 'multiclass', 'multioutput',
                'naive_bayes', 'neighbors', 'neural_network', 'pipeline',
                'preprocessing', 'random_projection', 'semi_supervised',
-               'svm', 'tree', 'discriminant_analysis',
+               'svm', 'tree',
                # Non-modules:
                'clone']
 

diff --git a/sklearn/experimental/__init__.py b/sklearn/experimental/__init__.py
@@ -0,0 +1,9 @@
+"""
+The :mod:`sklearn.experimental` module hosts experimental functionality for
+which the API is not yet guaranteed to be stable.
+"""
+
+from ._column_transformer import ColumnTransformer, make_column_transformer
+
+
+__all__ = ['ColumnTransformer', 'make_column_transformer']