DOC Update plot_column_transformer to notebook style (#17028)

scikit-learn · May 19, 2020 · 6088842 · 6088842
1 parent c19aa54
commit 6088842
Showing 1 changed file with 100 additions and 71 deletions.
diff --git a/examples/compose/plot_column_transformer.py b/examples/compose/plot_column_transformer.py
@@ -3,24 +3,18 @@
 Column Transformer with Heterogeneous Data Sources
 ==================================================
 
-Datasets can often contain components of that require different feature
-extraction and processing pipelines.  This scenario might occur when:
+Datasets can often contain components that require different feature
+extraction and processing pipelines. This scenario might occur when:
 
-1. Your dataset consists of heterogeneous data types (e.g. raster images and
-   text captions)
-2. Your dataset is stored in a Pandas DataFrame and different columns
+1. your dataset consists of heterogeneous data types (e.g. raster images and
+   text captions),
+2. your dataset is stored in a :class:`pandas.DataFrame` and different columns
    require different processing pipelines.
 
 This example demonstrates how to use
-:class:`sklearn.compose.ColumnTransformer` on a dataset containing
-different types of features.  We use the 20-newsgroups dataset and compute
-standard bag-of-words features for the subject line and body in separate
-pipelines as well as ad hoc features on the body. We combine them (with
-weights) using a ColumnTransformer and finally train a classifier on the
-combined set of features.
-
-The choice of features is not particularly helpful, but serves to illustrate
-the technique.
+:class:`~sklearn.compose.ColumnTransformer` on a dataset containing
+different types of features. The choice of features is not particularly
+helpful, but serves to illustrate the technique.
 """
 
 # Author: Matt Terry <matt.terry@gmail.com>
@@ -29,7 +23,7 @@
 
 import numpy as np
 
-from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.preprocessing import FunctionTransformer
 from sklearn.datasets import fetch_20newsgroups
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction import DictVectorizer
@@ -39,95 +33,130 @@
 from sklearn.compose import ColumnTransformer
 from sklearn.svm import LinearSVC
 
+##############################################################################
+# 20 newsgroups dataset
+# ---------------------
+#
+# We will use the :ref:`20 newsgroups dataset <20newsgroups_dataset>`, which
+# comprises posts from newsgroups on 20 topics. This dataset is split
+# into train and test subsets based on messages posted before and after
+# a specific date. We will only use posts from 2 categories to speed up running
+# time.
 
-class TextStats(TransformerMixin, BaseEstimator):
-    """Extract features from each document for DictVectorizer"""
+categories = ['sci.med', 'sci.space']
+X_train, y_train = fetch_20newsgroups(random_state=1,
+                                      subset='train',
+                                      categories=categories,
+                                      remove=('footers', 'quotes'),
+                                      return_X_y=True)
+X_test, y_test = fetch_20newsgroups(random_state=1,
+                                    subset='test',
+                                    categories=categories,
+                                    remove=('footers', 'quotes'),
+                                    return_X_y=True)
 
-    def fit(self, x, y=None):
-        return self
+##############################################################################
+# Each sample comprises meta information about the post, such as the subject,
+# and the body of the news post.
 
-    def transform(self, posts):
-        return [{'length': len(text),
-                 'num_sentences': text.count('.')}
-                for text in posts]
+print(X_train[0])
 
+##############################################################################
+# Creating transformers
+# ---------------------
+#
+# First, we would like a transformer that extracts the subject and
+# body of each post. Since this is a stateless transformation (does not
+# require state information from training data), we can define a function that
+# performs the data transformation then use
+# :class:`~sklearn.preprocessing.FunctionTransformer` to create a scikit-learn
+# transformer.
 
-class SubjectBodyExtractor(TransformerMixin, BaseEstimator):
-    """Extract the subject & body from a usenet post in a single pass.
 
-    Takes a sequence of strings and produces a dict of sequences.  Keys are
-    `subject` and `body`.
-    """
-    def fit(self, x, y=None):
-        return self
+def subject_body_extractor(posts):
+    # construct object dtype array with two columns
+    # first column = 'subject' and second column = 'body'
+    features = np.empty(shape=(len(posts), 2), dtype=object)
+    for i, text in enumerate(posts):
+        # temporary variable `_` stores '\n\n'
+        headers, _, body = text.partition('\n\n')
+        # store body text in second column
+        features[i, 1] = body
 
-    def transform(self, posts):
-        # construct object dtype array with two columns
-        # first column = 'subject' and second column = 'body'
-        features = np.empty(shape=(len(posts), 2), dtype=object)
-        for i, text in enumerate(posts):
-            headers, _, bod = text.partition('\n\n')
-            features[i, 1] = bod
+        prefix = 'Subject:'
+        sub = ''
+        # save text after 'Subject:' in first column
+        for line in headers.split('\n'):
+            if line.startswith(prefix):
+                sub = line[len(prefix):]
+                break
+        features[i, 0] = sub
 
-            prefix = 'Subject:'
-            sub = ''
-            for line in headers.split('\n'):
-                if line.startswith(prefix):
-                    sub = line[len(prefix):]
-                    break
-            features[i, 0] = sub
+    return features
 
-        return features
 
+subject_body_transformer = FunctionTransformer(subject_body_extractor)
 
-pipeline = Pipeline([
-    # Extract the subject & body
-    ('subjectbody', SubjectBodyExtractor()),
+##############################################################################
+# We will also create a transformer that extracts the
+# length of the text and the number of sentences.
+
+
+def text_stats(posts):
+    return [{'length': len(text),
+             'num_sentences': text.count('.')}
+            for text in posts]
+
+
+text_stats_transformer = FunctionTransformer(text_stats)
 
-    # Use ColumnTransformer to combine the features from subject and body
+##############################################################################
+# Classification pipeline
+# -----------------------
+#
+# The pipeline below extracts the subject and body from each post using
+# ``subject_body_transformer``, producing a (n_samples, 2) array. This array
+# is then used to compute standard bag-of-words features for the subject and
+# body as well as text length and number of sentences on the body, using
+# ``ColumnTransformer``. We combine them, with weights, then train a
+# classifier on the combined set of features.
+
+pipeline = Pipeline([
+    # Extract subject & body
+    ('subjectbody', subject_body_transformer),
+    # Use ColumnTransformer to combine the subject and body features
     ('union', ColumnTransformer(
         [
-            # Pulling features from the post's subject line (first column)
+            # bag-of-words for subject (col 0)
             ('subject', TfidfVectorizer(min_df=50), 0),
-
-            # Pipeline for standard bag-of-words model for body (second column)
+            # bag-of-words with decomposition for body (col 1)
             ('body_bow', Pipeline([
                 ('tfidf', TfidfVectorizer()),
                 ('best', TruncatedSVD(n_components=50)),
             ]), 1),
-
-            # Pipeline for pulling ad hoc features from post's body
+            # Pipeline for pulling text stats from post's body
             ('body_stats', Pipeline([
-                ('stats', TextStats()),  # returns a list of dicts
+                ('stats', text_stats_transformer),  # returns a list of dicts
                 ('vect', DictVectorizer()),  # list of dicts -> feature matrix
             ]), 1),
         ],
-
-        # weight components in ColumnTransformer
+        # weight the above ColumnTransformer features
         transformer_weights={
             'subject': 0.8,
             'body_bow': 0.5,
             'body_stats': 1.0,
         }
     )),
-
     # Use a SVC classifier on the combined features
     ('svc', LinearSVC(dual=False)),
 ], verbose=True)
 
-# limit the list of categories to make running this example faster.
-categories = ['alt.atheism', 'talk.religion.misc']
-X_train, y_train = fetch_20newsgroups(random_state=1,
-                                      subset='train',
-                                      categories=categories,
-                                      remove=('footers', 'quotes'),
-                                      return_X_y=True)
-X_test, y_test = fetch_20newsgroups(random_state=1,
-                                    subset='test',
-                                    categories=categories,
-                                    remove=('footers', 'quotes'),
-                                    return_X_y=True)
+##############################################################################
+# Finally, we fit our pipeline on the training data and use it to predict
+# topics for ``X_test``. Performance metrics of our pipeline are then printed.
 
 pipeline.fit(X_train, y_train)
 y_pred = pipeline.predict(X_test)
-print(classification_report(y_test, y_pred))
+print('Classification report:\n\n{}'.format(
+    classification_report(y_test, y_pred))
+)