# DataFrameMapper
- https://github.com/scikit-learn-contrib/sklearn-pandas

In [1]:
from sklearn_pandas import DataFrameMapper

In [6]:
# Import the package itself: `from sklearn_pandas import DataFrameMapper` only
# binds `DataFrameMapper`, so the name `sklearn_pandas` would otherwise be
# undefined when the notebook is run top to bottom on a fresh kernel.
import sklearn_pandas

sklearn_pandas.__version__

'1.8.0'

In [2]:
import pandas as pd
import numpy as np
import sklearn.preprocessing, sklearn.decomposition, \
     sklearn.linear_model, sklearn.pipeline, sklearn.metrics, \
     sklearn.compose
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Small example frame used throughout the notebook, written row by row:
# each tuple is (pet, children, salary).
data = pd.DataFrame(
    [
        ('cat', 4.0, 90.0),
        ('dog', 6.0, 24.0),
        ('dog', 3.0, 44.0),
        ('fish', 3.0, 27.0),
        ('cat', 2.0, 32.0),
        ('dog', 3.0, 59.0),
        ('cat', 5.0, 36.0),
        ('fish', 4.0, 27.0),
    ],
    columns=['pet', 'children', 'salary'],
)
data

Unnamed: 0,pet,children,salary
0,cat,4.0,90.0
1,dog,6.0,24.0
2,dog,3.0,44.0
3,fish,3.0,27.0
4,cat,2.0,32.0
5,dog,3.0,59.0
6,cat,5.0,36.0
7,fish,4.0,27.0


## Transformation Mapping

### Map the Columns to Transformations
- The mapper takes `a list of tuples`. Each tuple has two or three elements:
1. column name(s): The first element is `a column name from the pandas DataFrame`, `a list containing one or multiple columns`, or a callable column selector such as [make_column_selector](https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_selector.html).
2. transformer(s): The second element is `an object that performs the transformation` to be applied to that column.
3. attributes: The third one is optional and is `a dictionary containing the transformation options`, if applicable.

In [9]:
# Each feature is a (column selector, transformer) pair; the output columns
# follow the order of this list.
mapper = DataFrameMapper(
    features=[
        # 'pet' is selected with a plain string.
        ('pet', sklearn.preprocessing.LabelBinarizer()),
        # 'children' is selected with a one-element list.
        (['children'], sklearn.preprocessing.StandardScaler()),
    ]
)

In [11]:
# The two selector styles used in the mapper above mirror pandas'
# `__getitem__` indexing: a bare column name yields a 1-dimensional object,
# while a one-element list yields a 2-dimensional one with a single column
# (i.e. a column vector).
print(data['children'].shape, data[['children']].shape, sep='\n')

(8,)
(8, 1)


In [12]:
# Show the input frame again for reference before it is passed to the mapper.
data

Unnamed: 0,pet,children,salary
0,cat,4.0,90.0
1,dog,6.0,24.0
2,dog,3.0,44.0
3,fish,3.0,27.0
4,cat,2.0,32.0
5,dog,3.0,59.0
6,cat,5.0,36.0
7,fish,4.0,27.0


In [13]:
# Fit the mapper on a copy of `data` and display the transformed array.
mapper.fit_transform(data.copy())

array([[ 1.        ,  0.        ,  0.        ,  0.20851441],
       [ 0.        ,  1.        ,  0.        ,  1.87662973],
       [ 0.        ,  1.        ,  0.        , -0.62554324],
       [ 0.        ,  0.        ,  1.        , -0.62554324],
       [ 1.        ,  0.        ,  0.        , -1.4596009 ],
       [ 0.        ,  1.        ,  0.        , -0.62554324],
       [ 1.        ,  0.        ,  0.        ,  1.04257207],
       [ 0.        ,  0.        ,  1.        ,  0.20851441]])

In [14]:
# Same fit-and-transform on a copy of `data`, rounded to two decimals so the
# array is easier to read.
mapper.fit_transform(data.copy()).round(decimals=2)

array([[ 1.  ,  0.  ,  0.  ,  0.21],
       [ 0.  ,  1.  ,  0.  ,  1.88],
       [ 0.  ,  1.  ,  0.  , -0.63],
       [ 0.  ,  0.  ,  1.  , -0.63],
       [ 1.  ,  0.  ,  0.  , -1.46],
       [ 0.  ,  1.  ,  0.  , -0.63],
       [ 1.  ,  0.  ,  0.  ,  1.04],
       [ 0.  ,  0.  ,  1.  ,  0.21]])

- the first three columns are the output of the LabelBinarizer (corresponding to `cat`, `dog`, and `fish` respectively)
- the fourth column is `the standardized value` for the number of `children`. 
- In general, the columns are ordered according to the order given when the DataFrameMapper is constructed.

In [15]:
# Now that the transformation is trained, we confirm that it works on new data
sample = pd.DataFrame({'pet': ['cat'], 'children': [5.]})
sample

Unnamed: 0,pet,children
0,cat,5.0


In [16]:
# Apply the already-fitted mapper to the new record (transform only, no fit)
# and round to two decimals for display.
mapper.transform(sample).round(decimals=2)

array([[1.  , 0.  , 0.  , 1.04]])

In [17]:
# Same record as `sample`, plus a `parent` column for which the mapper
# defines no transformation.
sample2 = pd.DataFrame([{'pet': 'cat', 'children': 5.0, 'parent': 100}])
sample2

Unnamed: 0,pet,children,parent
0,cat,5.0,100


In [18]:
# The extra `parent` column is not part of the mapping; the result matches
# the one obtained for `sample`.
mapper.transform(sample2).round(decimals=2)

array([[1.  , 0.  , 0.  , 1.04]])

### Output feature names
- In certain cases, such as when studying `the feature importances` of a model, we want to be able to associate **the original features with the ones generated by the dataframe mapper**. We can do so by inspecting the automatically generated `transformed_names_` attribute of the mapper after transformation.

In [19]:
# Names of the output columns, in the same order as the transformed array.
mapper.transformed_names_

['pet_cat', 'pet_dog', 'pet_fish', 'children']

### Custom column names for transformed features
- We can provide a custom name for the transformed features, to be used instead of the automatically generated one, by specifying it as the third argument of the feature definition

In [10]:
# The optional third element of a feature carries its options; `alias`
# replaces the automatically generated output name.
mapper_alias = DataFrameMapper(
    features=[
        (
            ['children'],
            sklearn.preprocessing.StandardScaler(),
            {'alias': 'children_scaled'},
        ),
    ]
)

In [11]:
# `transformed_names_` is inspected after a transformation, so transform once
# and discard the array. The result is deliberately not bound to `_`: in
# IPython that name holds the last cell output, and assigning to it shadows
# the automatic value. A non-final expression is not displayed anyway.
mapper_alias.fit_transform(data.copy())
mapper_alias.transformed_names_

['children_scaled']

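- Alternatively, you can also specify a `prefix` and/or `suffix` to add to the column name. (NOTE: the output recorded below shows both names unchanged, so these options appear to have had no effect in the environment that produced it — presumably the installed sklearn-pandas 1.8.0 predates them. Re-run with a release that supports `prefix`/`suffix` to confirm.)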

In [12]:
# Two features built from the same column: the first asks for a prefix on the
# output name, the second for a suffix.
mapper_alias = DataFrameMapper(
    features=[
        (
            ['children'],
            sklearn.preprocessing.StandardScaler(),
            {'prefix': 'standard_scaled_'},
        ),
        (
            ['children'],
            sklearn.preprocessing.StandardScaler(),
            {'suffix': '_raw'},
        ),
    ]
)

In [13]:
# `transformed_names_` is inspected after a transformation, so transform once
# and discard the array. The result is deliberately not bound to `_`: in
# IPython that name holds the last cell output, and assigning to it shadows
# the automatic value. A non-final expression is not displayed anyway.
mapper_alias.fit_transform(data.copy())
mapper_alias.transformed_names_

['children', 'children']