Skip to content

Commit

Permalink
Merge pull request #73 from paulgb/jph00-dfout
Browse files Browse the repository at this point in the history
Add a df_out option to return a dataframe
  • Loading branch information
dukebody committed Jan 17, 2017
2 parents 10a43e4 + 4a0f16c commit 4a63261
Show file tree
Hide file tree
Showing 3 changed files with 148 additions and 6 deletions.
35 changes: 33 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,29 @@ Now that the transformation is trained, we confirm that it works on new data::
>>> np.round(mapper.transform(sample), 2)
array([[ 1. , 0. , 0. , 1.04]])


Outputting a dataframe
**********************

By default the output of the dataframe mapper is a numpy array, because most sklearn estimators expect a numpy array as input. If, however, we want the output of the mapper to be a dataframe, we can request it with the ``df_out`` parameter when creating the mapper::

>>> mapper_df = DataFrameMapper([
... ('pet', sklearn.preprocessing.LabelBinarizer()),
... (['children'], sklearn.preprocessing.StandardScaler())
... ], df_out=True)
>>> np.round(mapper_df.fit_transform(data.copy()), 2)
pet_cat pet_dog pet_fish children
0 1.0 0.0 0.0 0.21
1 0.0 1.0 0.0 1.88
2 0.0 1.0 0.0 -0.63
3 0.0 0.0 1.0 -0.63
4 1.0 0.0 0.0 -1.46
5 0.0 1.0 0.0 -0.63
6 1.0 0.0 0.0 1.04
7 0.0 0.0 1.0 0.21

Note that ``df_out=True`` does not work together with the ``default=True`` or ``sparse=True`` arguments to the mapper.

Transform Multiple Columns
**************************

Expand Down Expand Up @@ -229,6 +252,13 @@ Sklearn-pandas' ``cross_val_score`` function provides exactly the same interface
Changelog
---------

Development
***********

* Make the mapper return dataframes when ``df_out=True`` (#70).
* Update imports to avoid deprecation warnings in sklearn 0.18 (#68).


1.2.0 (2016-10-02)
******************

Expand Down Expand Up @@ -272,9 +302,10 @@ The code for ``DataFrameMapper`` is based on code originally written by `Ben Ham

Other contributors:

* Paul Butler
* Cal Paterson
* Israel Saeta Pérez
* Zac Stewart
* Jeremy Howard
* Olivier Grisel
* Paul Butler
* Vitaley Zaretskey
* Zac Stewart
36 changes: 33 additions & 3 deletions sklearn_pandas/dataframe_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class DataFrameMapper(BaseEstimator, TransformerMixin):
sklearn transformation.
"""

def __init__(self, features, default=False, sparse=False):
def __init__(self, features, default=False, sparse=False, df_out=False):
"""
Params:
Expand All @@ -50,13 +50,23 @@ def __init__(self, features, default=False, sparse=False):
sparse will return sparse matrix if set True and any of the
extracted features is sparse. Defaults to False.
df_out return a pandas data frame, with each column named using
the pandas column that created it (if there's only one
input and output) or the input columns joined with '_'
if there are multiple inputs. If there are multiple outputs,
the name is suffixed with the transformer's class labels
(when it has more than two ``classes_``) or otherwise with
'_0', '_1' etc. NB: does not work if *default* or *sparse*
are true
"""
if isinstance(features, list):
features = [(columns, _build_transformer(transformers))
for (columns, transformers) in features]
self.features = features
self.default = _build_transformer(default)
self.sparse = sparse
self.df_out = df_out
if (df_out and (sparse or default)):
raise ValueError("Can not use df_out with sparse or default")

@property
def _selected_columns(self):
Expand Down Expand Up @@ -94,6 +104,7 @@ def __setstate__(self, state):
# compatibility shim for pickles created before ``default`` init
# argument existed
self.default = state.get('default', False)
self.df_out = state.get('df_out', False)

def _get_col_subset(self, X, cols):
"""
Expand Down Expand Up @@ -145,13 +156,26 @@ def fit(self, X, y=None):
self._get_col_subset(X, self._unselected_columns(X)), y)
return self


def get_names(self, c, t, x):
    """
    Build the output column names for one feature definition.

    c   the input column name, or a list of input column names
    t   the transformer applied to those columns (may be None)
    x   the transformed output; only its ``shape`` is inspected

    Returns a list of names. A list of input columns is joined with
    '_'. If the transformer exposes more than two ``classes_``, one
    name per class label is returned; otherwise, if the output has
    more than one column, the names are suffixed with the column
    position starting at 0. A single output keeps the input name
    unchanged.
    """
    if isinstance(c, list):
        # column labels are not guaranteed to be strings
        c = '_'.join(str(col) for col in c)
    if hasattr(t, 'classes_') and len(t.classes_) > 2:
        # class labels may be ints (or other non-str values), so they
        # must be stringified before concatenation
        return [str(c) + '_' + str(o) for o in t.classes_]
    if len(x.shape) > 1 and x.shape[1] > 1:
        return [str(c) + '_' + str(o) for o in range(x.shape[1])]
    return [c]


def transform(self, X):
"""
Transform the given data. Assumes that fit has already been called.
X the data to transform
"""
extracted = []
index = []
for columns, transformers in self.features:
# columns could be a string or list of
# strings; we don't care because pandas
Expand All @@ -160,10 +184,13 @@ def transform(self, X):
if transformers is not None:
Xt = transformers.transform(Xt)
extracted.append(_handle_feature(Xt))
if self.df_out:
index = index + self.get_names(columns, transformers, Xt)

# handle features not explicitly selected
if self.default is not False:
Xt = self._get_col_subset(X, self._unselected_columns(X))
unsel_cols = self._unselected_columns(X)
Xt = self._get_col_subset(X, unsel_cols)
if self.default is not None:
Xt = self.default.transform(Xt)
extracted.append(_handle_feature(Xt))
Expand All @@ -185,4 +212,7 @@ def transform(self, X):
else:
stacked = np.hstack(extracted)

return stacked
if not self.df_out:
return stacked

return pd.DataFrame(stacked, columns=index)
83 changes: 82 additions & 1 deletion tests/test_dataframe_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
from sklearn.preprocessing import (
Imputer, StandardScaler, OneHotEncoder, LabelBinarizer)
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.base import BaseEstimator, TransformerMixin
import sklearn.decomposition
import numpy as np
from numpy.testing import assert_array_equal
import pickle
Expand Down Expand Up @@ -77,6 +79,85 @@ def complex_dataframe():
'feat2': [1, 2, 3, 2, 3, 4]})


def test_simple_df(simple_dataframe):
    """
    A single pass-through column mapped with df_out=True comes back
    as a DataFrame holding as many rows as the input column.
    """
    passthrough = DataFrameMapper([('a', None)], df_out=True)
    result = passthrough.fit_transform(simple_dataframe)
    assert type(result) == pd.DataFrame
    assert len(result["a"]) == len(simple_dataframe["a"])


def test_complex_df(complex_dataframe):
    """
    Every pass-through column of the complex frame is present in the
    output dataframe and keeps its length.
    """
    features = [
        (name, None) for name in ('target', 'feat1', 'feat2')
    ]
    mapper = DataFrameMapper(features, df_out=True)
    result = mapper.fit_transform(complex_dataframe)
    assert len(result) == len(complex_dataframe)
    for column in complex_dataframe.columns:
        assert len(result[column]) == len(complex_dataframe[column])


def test_binarizer_df():
    """
    With three classes, LabelBinarizer output columns are named after
    the class labels.
    """
    frame = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'c', 'a']})
    mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True)
    names = mapper.fit_transform(frame).columns
    assert len(names) == 3
    expected_names = ['target_a', 'target_b', 'target_c']
    for position, expected in enumerate(expected_names):
        assert names[position] == expected


def test_binarizer2_df():
    """
    With only two classes, LabelBinarizer yields a single output column
    that keeps the plain input column name.
    """
    frame = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'a']})
    mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True)
    names = mapper.fit_transform(frame).columns
    assert len(names) == 1
    assert names[0] == 'target'


def test_onehot_df():
    """
    One-hot encoded output columns are suffixed with their position.
    """
    frame = pd.DataFrame({'target': [0, 0, 1, 1, 2, 3, 0]})
    encoder_feature = (['target'], OneHotEncoder())
    mapper = DataFrameMapper([encoder_feature], df_out=True)
    names = mapper.fit_transform(frame).columns
    assert len(names) == 4
    assert names[0] == 'target_0'
    assert names[3] == 'target_3'


def test_pca(complex_dataframe):
    """
    Multiple inputs and multiple outputs (PCA): names are the joined
    input columns suffixed with the output position.
    """
    pca_feature = (['feat1', 'feat2'], sklearn.decomposition.PCA(2))
    mapper = DataFrameMapper([pca_feature], df_out=True)
    names = mapper.fit_transform(complex_dataframe).columns
    assert len(names) == 2
    assert names[0] == 'feat1_feat2_0'
    assert names[1] == 'feat1_feat2_1'


def test_nonexistent_columns_explicit_fail(simple_dataframe):
"""
If a nonexistent column is selected, KeyError is raised.
Expand Down

0 comments on commit 4a63261

Please sign in to comment.