Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a df_out option to return a dataframe #73

Merged
merged 4 commits into from
Jan 17, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
35 changes: 33 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,29 @@ Now that the transformation is trained, we confirm that it works on new data::
>>> np.round(mapper.transform(sample), 2)
array([[ 1. , 0. , 0. , 1.04]])


Outputting a dataframe
**********************

By default the output of the dataframe mapper is a numpy array, because most sklearn estimators expect a numpy array as input. If, however, we want the mapper to output a dataframe, we can request it by passing ``df_out=True`` when creating the mapper::

>>> mapper_df = DataFrameMapper([
... ('pet', sklearn.preprocessing.LabelBinarizer()),
... (['children'], sklearn.preprocessing.StandardScaler())
... ], df_out=True)
>>> np.round(mapper_df.fit_transform(data.copy()), 2)
pet_cat pet_dog pet_fish children
0 1.0 0.0 0.0 0.21
1 0.0 1.0 0.0 1.88
2 0.0 1.0 0.0 -0.63
3 0.0 0.0 1.0 -0.63
4 1.0 0.0 0.0 -1.46
5 0.0 1.0 0.0 -0.63
6 1.0 0.0 0.0 1.04
7 0.0 0.0 1.0 0.21

Note that ``df_out=True`` does not work together with ``sparse=True`` or with a ``default`` transformer; the mapper raises a ``ValueError`` when they are combined.

Transform Multiple Columns
**************************

Expand Down Expand Up @@ -229,6 +252,13 @@ Sklearn-pandas' ``cross_val_score`` function provides exactly the same interface
Changelog
---------

Development
***********

* Make the mapper return dataframes when ``df_out=True`` (#70).
* Update imports to avoid deprecation warnings in sklearn 0.18 (#68).


1.2.0 (2016-10-02)
******************

Expand Down Expand Up @@ -272,9 +302,10 @@ The code for ``DataFrameMapper`` is based on code originally written by `Ben Ham

Other contributors:

* Paul Butler
* Cal Paterson
* Israel Saeta Pérez
* Zac Stewart
* Jeremy Howard
* Olivier Grisel
* Paul Butler
* Vitaley Zaretskey
* Zac Stewart
36 changes: 33 additions & 3 deletions sklearn_pandas/dataframe_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class DataFrameMapper(BaseEstimator, TransformerMixin):
sklearn transformation.
"""

def __init__(self, features, default=False, sparse=False):
def __init__(self, features, default=False, sparse=False, df_out=False):
"""
Params:

Expand All @@ -50,13 +50,23 @@ def __init__(self, features, default=False, sparse=False):

sparse will return sparse matrix if set True and any of the
extracted features is sparse. Defaults to False.

df_out return a pandas data frame, with each column named using
the pandas column that created it (if there's only one
input and output) or the input columns joined with '_'
if there are multiple inputs, and the name concatenated with
'_0', '_1' etc. if there are multiple outputs (or with the
transformer's class labels when it exposes more than two
``classes_``). NB: does not work if *default* or *sparse*
are true
"""
if isinstance(features, list):
features = [(columns, _build_transformer(transformers))
for (columns, transformers) in features]
self.features = features
self.default = _build_transformer(default)
self.sparse = sparse
self.df_out = df_out
if (df_out and (sparse or default)):
raise ValueError("Can not use df_out with sparse or default")

@property
def _selected_columns(self):
Expand Down Expand Up @@ -94,6 +104,7 @@ def __setstate__(self, state):
# compatibility shim for pickles created before ``default`` init
# argument existed
self.default = state.get('default', False)
self.df_out = state.get('df_out', False)

def _get_col_subset(self, X, cols):
"""
Expand Down Expand Up @@ -145,13 +156,26 @@ def fit(self, X, y=None):
self._get_col_subset(X, self._unselected_columns(X)), y)
return self


def get_names(self, c, t, x):
    """
    Build the output column names for one transformed feature.

    c       the input column name, or a list of input column names
    t       the transformer applied to the column(s); may be None
    x       the transformed data for the column(s); only its ``shape``
            is inspected

    Returns a list with one name per output column:

    * a single output column keeps the (joined) input name;
    * several output columns produced by a transformer exposing more
      than two ``classes_``, one column per class, are named
      ``<name>_<class>``;
    * any other multi-column output is named ``<name>_0``,
      ``<name>_1``, ...
    """
    if isinstance(c, list):
        # Column labels are not necessarily strings (e.g. integer labels),
        # so stringify each one before joining.
        c = '_'.join(str(name) for name in c)

    n_out = x.shape[1] if len(x.shape) > 1 else 1
    if n_out <= 1:
        # A lone output column keeps the input name untouched. This also
        # covers transformers that expose ``classes_`` but emit a single
        # column (e.g. a two-class binarizer), where per-class names would
        # not match the data.
        return [c]

    classes = getattr(t, 'classes_', None)
    if classes is not None and len(classes) > 2 and len(classes) == n_out:
        # Only trust class labels when there is exactly one column per
        # class; otherwise the names would not line up with the output.
        suffixes = classes
    else:
        suffixes = range(n_out)

    # ``format`` stringifies both parts, so non-string class labels or
    # column names do not raise the TypeError that ``+`` would.
    return ['{}_{}'.format(c, suffix) for suffix in suffixes]


def transform(self, X):
"""
Transform the given data. Assumes that fit has already been called.

X the data to transform
"""
extracted = []
index = []
for columns, transformers in self.features:
# columns could be a string or list of
# strings; we don't care because pandas
Expand All @@ -160,10 +184,13 @@ def transform(self, X):
if transformers is not None:
Xt = transformers.transform(Xt)
extracted.append(_handle_feature(Xt))
if self.df_out:
index = index + self.get_names(columns, transformers, Xt)

# handle features not explicitly selected
if self.default is not False:
Xt = self._get_col_subset(X, self._unselected_columns(X))
unsel_cols = self._unselected_columns(X)
Xt = self._get_col_subset(X, unsel_cols)
if self.default is not None:
Xt = self.default.transform(Xt)
extracted.append(_handle_feature(Xt))
Expand All @@ -185,4 +212,7 @@ def transform(self, X):
else:
stacked = np.hstack(extracted)

return stacked
if not self.df_out:
return stacked

return pd.DataFrame(stacked, columns=index)
83 changes: 82 additions & 1 deletion tests/test_dataframe_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
from sklearn.preprocessing import (
Imputer, StandardScaler, OneHotEncoder, LabelBinarizer)
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.base import BaseEstimator, TransformerMixin
import sklearn.decomposition
import numpy as np
from numpy.testing import assert_array_equal
import pickle
Expand Down Expand Up @@ -77,6 +79,85 @@ def complex_dataframe():
'feat2': [1, 2, 3, 2, 3, 4]})


def test_simple_df(simple_dataframe):
    """
    With ``df_out=True`` a single pass-through column comes back as a
    DataFrame with the same number of rows as the input column.
    """
    result = DataFrameMapper([('a', None)], df_out=True).fit_transform(
        simple_dataframe)
    assert type(result) is pd.DataFrame
    assert len(result["a"]) == len(simple_dataframe["a"])


def test_complex_df(complex_dataframe):
    """
    Passing several columns through untouched with ``df_out=True`` keeps
    the row count of the frame and of every individual column.
    """
    passthrough = [(name, None) for name in ('target', 'feat1', 'feat2')]
    result = DataFrameMapper(passthrough, df_out=True).fit_transform(
        complex_dataframe)
    assert len(result) == len(complex_dataframe)
    for name in complex_dataframe.columns:
        assert len(result[name]) == len(complex_dataframe[name])


def test_binarizer_df():
    """
    A LabelBinarizer over three classes yields one output column per
    class, each named ``<column>_<class>``.
    """
    frame = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'c', 'a']})
    binarizing_mapper = DataFrameMapper(
        [('target', LabelBinarizer())], df_out=True)
    names = binarizing_mapper.fit_transform(frame).columns
    assert len(names) == 3
    expected_names = ['target_a', 'target_b', 'target_c']
    for position, expected in enumerate(expected_names):
        assert names[position] == expected


def test_binarizer2_df():
    """
    A LabelBinarizer over only two classes emits a single column, which
    keeps the plain input column name.
    """
    frame = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'a']})
    binarizing_mapper = DataFrameMapper(
        [('target', LabelBinarizer())], df_out=True)
    names = binarizing_mapper.fit_transform(frame).columns
    assert len(names) == 1
    assert names[0] == 'target'


def test_onehot_df():
    """
    One-hot encoding four integer levels gives four output columns; the
    first and last are checked for their ``<column>_<index>`` names.
    """
    frame = pd.DataFrame({'target': [0, 0, 1, 1, 2, 3, 0]})
    encoding_mapper = DataFrameMapper(
        [(['target'], OneHotEncoder())], df_out=True)
    names = encoding_mapper.fit_transform(frame).columns
    assert len(names) == 4
    for position, expected in ((0, 'target_0'), (3, 'target_3')):
        assert names[position] == expected


def test_pca(complex_dataframe):
    """
    A two-component PCA over two input columns produces two output
    columns named after the joined inputs plus a zero-based index.
    """
    pca_feature = (['feat1', 'feat2'], sklearn.decomposition.PCA(2))
    result = DataFrameMapper([pca_feature], df_out=True).fit_transform(
        complex_dataframe)
    names = result.columns
    assert len(names) == 2
    for position, expected in enumerate(['feat1_feat2_0', 'feat1_feat2_1']):
        assert names[position] == expected


def test_nonexistent_columns_explicit_fail(simple_dataframe):
"""
If a nonexistent column is selected, KeyError is raised.
Expand Down