Skip to content

Commit

Permalink
DOC Update plot_column_transformer to notebook style (#17028)
Browse files Browse the repository at this point in the history
  • Loading branch information
lucyleeow authored and adrinjalali committed May 19, 2020
1 parent c19aa54 commit 6088842
Showing 1 changed file with 100 additions and 71 deletions.
171 changes: 100 additions & 71 deletions examples/compose/plot_column_transformer.py
Expand Up @@ -3,24 +3,18 @@
Column Transformer with Heterogeneous Data Sources
==================================================
Datasets can often contain components of that require different feature
extraction and processing pipelines. This scenario might occur when:
Datasets can often contain components that require different feature
extraction and processing pipelines. This scenario might occur when:
1. Your dataset consists of heterogeneous data types (e.g. raster images and
text captions)
2. Your dataset is stored in a Pandas DataFrame and different columns
1. your dataset consists of heterogeneous data types (e.g. raster images and
text captions),
2. your dataset is stored in a :class:`pandas.DataFrame` and different columns
require different processing pipelines.
This example demonstrates how to use
:class:`sklearn.compose.ColumnTransformer` on a dataset containing
different types of features. We use the 20-newsgroups dataset and compute
standard bag-of-words features for the subject line and body in separate
pipelines as well as ad hoc features on the body. We combine them (with
weights) using a ColumnTransformer and finally train a classifier on the
combined set of features.
The choice of features is not particularly helpful, but serves to illustrate
the technique.
:class:`~sklearn.compose.ColumnTransformer` on a dataset containing
different types of features. The choice of features is not particularly
helpful, but serves to illustrate the technique.
"""

# Author: Matt Terry <matt.terry@gmail.com>
Expand All @@ -29,7 +23,7 @@

import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
Expand All @@ -39,95 +33,130 @@
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC

##############################################################################
# 20 newsgroups dataset
# ---------------------
#
# We will use the :ref:`20 newsgroups dataset <20newsgroups_dataset>`, which
# comprises posts from newsgroups on 20 topics. This dataset is split
# into train and test subsets based on messages posted before and after
# a specific date. We will only use posts from 2 categories to speed up running
# time.

class TextStats(TransformerMixin, BaseEstimator):
"""Extract features from each document for DictVectorizer"""
categories = ['sci.med', 'sci.space']
X_train, y_train = fetch_20newsgroups(random_state=1,
subset='train',
categories=categories,
remove=('footers', 'quotes'),
return_X_y=True)
X_test, y_test = fetch_20newsgroups(random_state=1,
subset='test',
categories=categories,
remove=('footers', 'quotes'),
return_X_y=True)

def fit(self, x, y=None):
return self
##############################################################################
# Each sample comprises meta information about that post, such as the subject,
# and the body of the news post.

def transform(self, posts):
return [{'length': len(text),
'num_sentences': text.count('.')}
for text in posts]
print(X_train[0])

##############################################################################
# Creating transformers
# ---------------------
#
# First, we would like a transformer that extracts the subject and
# body of each post. Since this is a stateless transformation (does not
# require state information from training data), we can define a function that
# performs the data transformation then use
# :class:`~sklearn.preprocessing.FunctionTransformer` to create a scikit-learn
# transformer.

class SubjectBodyExtractor(TransformerMixin, BaseEstimator):
"""Extract the subject & body from a usenet post in a single pass.

Takes a sequence of strings and produces a dict of sequences. Keys are
`subject` and `body`.
"""
def fit(self, x, y=None):
return self
def subject_body_extractor(posts):
# construct object dtype array with two columns
# first column = 'subject' and second column = 'body'
features = np.empty(shape=(len(posts), 2), dtype=object)
for i, text in enumerate(posts):
# temporary variable `_` stores '\n\n'
headers, _, body = text.partition('\n\n')
# store body text in second column
features[i, 1] = body

def transform(self, posts):
# construct object dtype array with two columns
# first column = 'subject' and second column = 'body'
features = np.empty(shape=(len(posts), 2), dtype=object)
for i, text in enumerate(posts):
headers, _, bod = text.partition('\n\n')
features[i, 1] = bod
prefix = 'Subject:'
sub = ''
# save text after 'Subject:' in first column
for line in headers.split('\n'):
if line.startswith(prefix):
sub = line[len(prefix):]
break
features[i, 0] = sub

prefix = 'Subject:'
sub = ''
for line in headers.split('\n'):
if line.startswith(prefix):
sub = line[len(prefix):]
break
features[i, 0] = sub
return features

return features

subject_body_transformer = FunctionTransformer(subject_body_extractor)

pipeline = Pipeline([
# Extract the subject & body
('subjectbody', SubjectBodyExtractor()),
##############################################################################
# We will also create a transformer that extracts the
# length of the text and the number of sentences.


def text_stats(posts):
return [{'length': len(text),
'num_sentences': text.count('.')}
for text in posts]


text_stats_transformer = FunctionTransformer(text_stats)

# Use ColumnTransformer to combine the features from subject and body
##############################################################################
# Classification pipeline
# -----------------------
#
# The pipeline below extracts the subject and body from each post using
# ``subject_body_transformer``, producing a (n_samples, 2) array. This array is
# then used to compute standard bag-of-words features for the subject and body
# as well as text length and number of sentences on the body, using
# ``ColumnTransformer``. We combine them, with weights, then train a
# classifier on the combined set of features.

pipeline = Pipeline([
# Extract subject & body
('subjectbody', subject_body_transformer),
# Use ColumnTransformer to combine the subject and body features
('union', ColumnTransformer(
[
# Pulling features from the post's subject line (first column)
# bag-of-words for subject (col 0)
('subject', TfidfVectorizer(min_df=50), 0),

# Pipeline for standard bag-of-words model for body (second column)
# bag-of-words with decomposition for body (col 1)
('body_bow', Pipeline([
('tfidf', TfidfVectorizer()),
('best', TruncatedSVD(n_components=50)),
]), 1),

# Pipeline for pulling ad hoc features from post's body
# Pipeline for pulling text stats from post's body
('body_stats', Pipeline([
('stats', TextStats()), # returns a list of dicts
('stats', text_stats_transformer), # returns a list of dicts
('vect', DictVectorizer()), # list of dicts -> feature matrix
]), 1),
],

# weight components in ColumnTransformer
# weight above ColumnTransformer features
transformer_weights={
'subject': 0.8,
'body_bow': 0.5,
'body_stats': 1.0,
}
)),

# Use a SVC classifier on the combined features
('svc', LinearSVC(dual=False)),
], verbose=True)

# limit the list of categories to make running this example faster.
categories = ['alt.atheism', 'talk.religion.misc']
X_train, y_train = fetch_20newsgroups(random_state=1,
subset='train',
categories=categories,
remove=('footers', 'quotes'),
return_X_y=True)
X_test, y_test = fetch_20newsgroups(random_state=1,
subset='test',
categories=categories,
remove=('footers', 'quotes'),
return_X_y=True)
##############################################################################
# Finally, we fit our pipeline on the training data and use it to predict
# topics for ``X_test``. Performance metrics of our pipeline are then printed.

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print('Classification report:\n\n{}'.format(
classification_report(y_test, y_pred))
)

0 comments on commit 6088842

Please sign in to comment.