# DataFrameMapper
- https://github.com/scikit-learn-contrib/sklearn-pandas

In [3]:
import sklearn_pandas
sklearn_pandas.__version__

'1.8.0'

In [4]:
from sklearn_pandas import DataFrameMapper

In [5]:
import pandas as pd
import numpy as np
import sklearn.preprocessing, sklearn.decomposition, \
     sklearn.linear_model, sklearn.pipeline, sklearn.metrics, \
     sklearn.compose
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# Toy dataset reused throughout the notebook: one categorical column ('pet')
# and two float columns ('children', 'salary'), eight rows.
data = pd.DataFrame(
    {
        'pet': ['cat', 'dog', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
        'children': [4.0, 6.0, 3.0, 3.0, 2.0, 3.0, 5.0, 4.0],
        'salary': [90.0, 24.0, 44.0, 27.0, 32.0, 59.0, 36.0, 27.0],
    }
)
data

Unnamed: 0,pet,children,salary
0,cat,4.0,90.0
1,dog,6.0,24.0
2,dog,3.0,44.0
3,fish,3.0,27.0
4,cat,2.0,32.0
5,dog,3.0,59.0
6,cat,5.0,36.0
7,fish,4.0,27.0


## Transformation Mapping

### Map the Columns to Transformations
- The mapper takes `a list of tuples`. Each tuple has three elements:
1. column name(s): The first element is `a column name from the pandas DataFrame`, or `a list containing one or multiple columns` or an instance of a callable function such as [make_column_selector](https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_selector.html).
2. transformer(s): The second element is `an object which will perform the transformation` which will be applied to that column.
3. attributes: The third one is optional and is `a dictionary containing the transformation options`, if applicable.

In [5]:
# Two feature definitions; their outputs appear in this order in the result:
#   * 'pet'        -> selected by a plain string name, binarized
#   * ['children'] -> selected by a one-element list, standardized
mapper = DataFrameMapper([
     ('pet', sklearn.preprocessing.LabelBinarizer()),
     (['children'], sklearn.preprocessing.StandardScaler())
 ])

In [6]:
# A string selector ('children') hands the transformer a one-dimensional array,
# whereas a list selector (['children']) hands it a two-dimensional array with
# a single column, i.e. a column vector.
# This mirrors pandas' own DataFrame.__getitem__ indexing, as the two shapes
# printed below show.
print(data['children'].shape)
print(data[['children']].shape)

(8,)
(8, 1)


In [7]:
data

Unnamed: 0,pet,children,salary
0,cat,4.0,90.0
1,dog,6.0,24.0
2,dog,3.0,44.0
3,fish,3.0,27.0
4,cat,2.0,32.0
5,dog,3.0,59.0
6,cat,5.0,36.0
7,fish,4.0,27.0


In [8]:
mapper.fit_transform(data.copy())

array([[ 1.        ,  0.        ,  0.        ,  0.20851441],
       [ 0.        ,  1.        ,  0.        ,  1.87662973],
       [ 0.        ,  1.        ,  0.        , -0.62554324],
       [ 0.        ,  0.        ,  1.        , -0.62554324],
       [ 1.        ,  0.        ,  0.        , -1.4596009 ],
       [ 0.        ,  1.        ,  0.        , -0.62554324],
       [ 1.        ,  0.        ,  0.        ,  1.04257207],
       [ 0.        ,  0.        ,  1.        ,  0.20851441]])

In [9]:
np.round(mapper.fit_transform(data.copy()), 2)

array([[ 1.  ,  0.  ,  0.  ,  0.21],
       [ 0.  ,  1.  ,  0.  ,  1.88],
       [ 0.  ,  1.  ,  0.  , -0.63],
       [ 0.  ,  0.  ,  1.  , -0.63],
       [ 1.  ,  0.  ,  0.  , -1.46],
       [ 0.  ,  1.  ,  0.  , -0.63],
       [ 1.  ,  0.  ,  0.  ,  1.04],
       [ 0.  ,  0.  ,  1.  ,  0.21]])

- the first three columns are the output of the LabelBinarizer (corresponding to `cat`, `dog`, and `fish` respectively)
- the fourth column is `the standardized value` for the number of `children`. 
- In general, the columns are ordered according to the order given when the DataFrameMapper is constructed.

In [10]:
# The mapper has been fitted above, so a fresh single-row frame is enough to
# confirm that `transform` works on new data.
sample = pd.DataFrame(
    {'pet': ['cat'], 'children': [5.0]}
)
sample

Unnamed: 0,pet,children
0,cat,5.0


In [11]:
np.round(mapper.transform(sample), 2)

array([[1.  , 0.  , 0.  , 1.04]])

In [12]:
# Same row as before plus an extra 'parent' column that is not part of the
# mapper's feature definitions.
sample2 = pd.DataFrame(
    {
        'pet': ['cat'],
        'children': [5.0],
        'parent': [100],
    }
)
sample2

Unnamed: 0,pet,children,parent
0,cat,5.0,100


In [13]:
np.round(mapper.transform(sample2), 2)

array([[1.  , 0.  , 0.  , 1.04]])

### Output features names
- In certain cases, such as when studying the `feature importances` of a model, we want to be able to associate **the original features with the ones generated by the dataframe mapper**. We can do so by inspecting the automatically generated `transformed_names_` attribute of the mapper after transformation.

In [14]:
mapper.transformed_names_

['pet_cat', 'pet_dog', 'pet_fish', 'children']

### Custom column names for transformed features
- We can provide a custom name for the transformed features, to be used instead of the automatically generated one, by specifying it as the third argument of the feature definition

In [15]:
# The optional third tuple element is an options dict; 'alias' supplies the
# name reported in `transformed_names_` instead of the auto-generated one.
mapper_alias = DataFrameMapper([
     (['children'], sklearn.preprocessing.StandardScaler(),
      {'alias': 'children_scaled'})
        ])

In [16]:
_ = mapper_alias.fit_transform(data.copy())
mapper_alias.transformed_names_

['children_scaled']

- Alternatively, you can also specify prefix and/or suffix to add to the column name

In [17]:
# The same column is registered twice: once with a 'prefix' option and once
# with a 'suffix' option.
# NOTE(review): the recorded `transformed_names_` for this mapper is
# ['children', 'children'], i.e. neither option took effect in the run shown
# (sklearn-pandas 1.8.0 is reported at the top of the notebook) — presumably a
# newer release is required; TODO confirm.
mapper_alias = DataFrameMapper([
     (['children'], sklearn.preprocessing.StandardScaler(), {'prefix': 'standard_scaled_'}),
     (['children'], sklearn.preprocessing.StandardScaler(), {'suffix': '_raw'}) ])

In [18]:
_ = mapper_alias.fit_transform(data.copy())
mapper_alias.transformed_names_

['children', 'children']

### Dynamic Columns
- In some situations the columns are not known beforehand and we would like to `dynamically select them during the fit operation`.
- As shown below, in such situations you can provide either a custom callable or use [make_column_selector](https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_selector.html).

In [11]:
class GetColumnsStartingWith:
    """Callable column selector: picks the columns whose name starts with a prefix.

    An instance is meant to be called with a DataFrame and returns a list of
    column labels, which is the form a dynamic column selector takes in a
    ``DataFrameMapper`` feature definition.
    """

    def __init__(self, start_str):
        # Prefix a column label must start with in order to be selected.
        self.pattern = start_str

    def __call__(self, X: pd.DataFrame = None):
        """Return the labels of ``X.columns`` that start with the stored prefix.

        The original column order is preserved. Labels that are not strings
        (e.g. integer column labels) are skipped rather than raising
        ``AttributeError`` on ``.startswith``.

        ``X`` keeps its ``None`` default for signature compatibility only; a
        frame must be supplied, otherwise ``X.columns`` raises ``AttributeError``.
        """
        return [
            label
            for label in X.columns
            if isinstance(label, str) and label.startswith(self.pattern)
        ]

In [12]:
# Four float columns with identical values; only the column names matter for
# the dynamic-selection examples.
df = pd.DataFrame({
    column_name: [1.0, 2.0, 3.0]
    for column_name in (
        'sepal length (cm)',
        'sepal width (cm)',
        'petal length (cm)',
        'petal width (cm)',
    )
})

In [13]:
# Two dynamically selected features:
#   1. every column matched by make_column_selector(dtype_include=float),
#      standardized and named with the alias 'x';
#   2. every column returned by GetColumnsStartingWith('petal'), kept
#      untransformed (transformer is None) under the alias 'petal'.
# df_out=True makes `transform` return a DataFrame; default=False drops the
# columns that are not selected.
t = DataFrameMapper([
    (
        sklearn.compose.make_column_selector(dtype_include=float),
        sklearn.preprocessing.StandardScaler(),
        {'alias': 'x'}
    ),
    (
        GetColumnsStartingWith('petal'), # column name(s)
        None, # transformer(s)
        {'alias': 'petal'} # attributes
    )], df_out=True, default=False)

In [14]:
t.fit(df).transform(df).shape

(3, 6)

In [15]:
t.transformed_names_

['x_0', 'x_1', 'x_2', 'x_3', 'petal_0', 'petal_1']

In [16]:
t

In [17]:
t.fit(df).transform(df)

Unnamed: 0,x_0,x_1,x_2,x_3,petal_0,petal_1
0,-1.224745,-1.224745,-1.224745,-1.224745,1.0,1.0
1,0.0,0.0,0.0,0.0,2.0,2.0
2,1.224745,1.224745,1.224745,1.224745,3.0,3.0


In [18]:
# Variant with only the 'petal*' selector and default=None.
# The recorded output contains petal_0/petal_1 followed by all four original
# columns passed through unchanged.
t1 = DataFrameMapper([
    (
        GetColumnsStartingWith('petal'), # column name(s)
        None, # transformer(s)
        {'alias': 'petal'} # attributes
    )], df_out=True, default=None)

In [19]:
t1.fit(df).transform(df)

Unnamed: 0,petal_0,petal_1,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,1.0,1.0,1.0,1.0,1.0,1.0
1,2.0,2.0,2.0,2.0,2.0,2.0
2,3.0,3.0,3.0,3.0,3.0,3.0


In [24]:
t1.transformed_names_

['petal_0',
 'petal_1',
 'sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [20]:
# Variant with only the 'petal*' selector and default=False.
# The recorded DataFrame output contains only petal_0 and petal_1.
t2 = DataFrameMapper([
    (
        GetColumnsStartingWith('petal'), # column name(s)
        None, # transformer(s)
        {'alias': 'petal'} # attributes
    )], df_out=True, default=False)

In [21]:
t2.fit(df).transform(df)

Unnamed: 0,petal_0,petal_1
0,1.0,1.0
1,2.0,2.0
2,3.0,3.0


In [25]:
t2.transformed_names_

['petal_0', 'petal_1']

In [22]:
# Variant with only the 'petal*' selector, default=False and df_out=False.
# The recorded output is a plain numpy array with the two selected columns.
t3 = DataFrameMapper([
    (
        GetColumnsStartingWith('petal'), # column name(s)
        None, # transformer(s)
        {'alias': 'petal'} # attributes
    )], df_out=False, default=False)

In [23]:
t3.fit(df).transform(df)

array([[1., 1.],
       [2., 2.],
       [3., 3.]])

In [26]:
t3.transformed_names_

['petal_0', 'petal_1']

### Passing Series/DataFrames to the transformers
- To handle custom cases, we can pass a DataFrame/Series to the transformers by initializing the DataFrameMapper with `input_df=True`.

In [1]:
from sklearn.base import TransformerMixin

class DateEncoder(TransformerMixin):
    """Stateless transformer that expands datetime values into year, month and day columns."""

    def fit(self, X, y=None):
        """Nothing is learned; return the encoder itself."""
        return self

    def transform(self, X):
        """Return the year, month and day of ``X`` concatenated column-wise.

        ``X`` must expose the pandas ``.dt`` accessor.
        """
        accessor = X.dt
        parts = [accessor.year, accessor.month, accessor.day]
        return pd.concat(parts, axis=1)

In [6]:
# Four consecutive days spanning a month boundary.
dates_df = pd.DataFrame(
    {'dates': pd.date_range(start='2015-10-30', end='2015-11-02')}
)
dates_df

Unnamed: 0,dates
0,2015-10-30
1,2015-10-31
2,2015-11-01
3,2015-11-02


In [9]:
# input_df=True is set on the whole mapper so that DateEncoder.transform
# receives a pandas object and can use its `.dt` accessor.
mapper_dates = DataFrameMapper([
    (
        'dates', DateEncoder()
    )], input_df=True)

In [10]:
mapper_dates.fit_transform(dates_df)

array([[2015,   10,   30],
       [2015,   10,   31],
       [2015,   11,    1],
       [2015,   11,    2]], dtype=int64)

- We can also specify this option per group of columns instead of for the whole mapper:

In [11]:
# Same encoder, but `input_df` is enabled only for this feature through its
# options dict instead of for the whole mapper.
mapper_dates = DataFrameMapper([
    ('dates', DateEncoder(), {'input_df': True})])

In [12]:
mapper_dates.fit_transform(dates_df)

array([[2015,   10,   30],
       [2015,   10,   31],
       [2015,   11,    1],
       [2015,   11,    2]], dtype=int64)

### Dropping columns explicitly
- Sometimes it is required to drop a specific column or a list of columns. For this purpose, the `drop_cols` argument of DataFrameMapper can be used. Its default value is `None`:

In [14]:
data

Unnamed: 0,pet,children,salary
0,cat,4.0,90.0
1,dog,6.0,24.0
2,dog,3.0,44.0
3,fish,3.0,27.0
4,cat,2.0,32.0
5,dog,3.0,59.0
6,cat,5.0,36.0
7,fish,4.0,27.0


In [15]:
# NOTE(review): the recorded run of this cell fails with
# "TypeError: __init__() got an unexpected keyword argument 'drop_cols'", so the
# sklearn-pandas used for that run (1.8.0 is reported at the top of the
# notebook) does not accept `drop_cols` — presumably a newer release is
# needed; TODO confirm.
mapper_df = DataFrameMapper([
     ('pet', sklearn.preprocessing.LabelBinarizer()),
     (['children'], sklearn.preprocessing.StandardScaler())
     ], drop_cols=['salary'])

TypeError: __init__() got an unexpected keyword argument 'drop_cols'

### Transform Multiple Columns
- Transformations may require multiple input columns. In these cases, the column names can be specified in a list:

In [21]:
data

Unnamed: 0,pet,children,salary
0,cat,4.0,90.0
1,dog,6.0,24.0
2,dog,3.0,44.0
3,fish,3.0,27.0
4,cat,2.0,32.0
5,dog,3.0,59.0
6,cat,5.0,36.0
7,fish,4.0,27.0


In [16]:
# Both columns are handed together, as one list selector, to a single PCA(1)
# transformer; the recorded output has one column.
mapper2 = DataFrameMapper([
     (['children', 'salary'], sklearn.decomposition.PCA(1))
 ])

In [17]:
np.round(mapper2.fit_transform(data.copy()), 1)

array([[ 47.6],
       [-18.4],
       [  1.6],
       [-15.4],
       [-10.4],
       [ 16.6],
       [ -6.4],
       [-15.4]])

### Multiple transformers for the same column
- Multiple transformers can be applied to the same column by specifying them in a list:

In [25]:
# The second tuple element may be a list of transformers; here the 'age' column
# goes through SimpleImputer and StandardScaler, listed in that order.
from sklearn.impute import SimpleImputer
mapper3 = DataFrameMapper([
     (['age'], [SimpleImputer(), sklearn.preprocessing.StandardScaler()])])

In [26]:
# Single 'age' column with one missing value in the middle.
data_3 = pd.DataFrame(
    {'age': [1.0, np.nan, 3.0]}
)
data_3

Unnamed: 0,age
0,1.0
1,
2,3.0


In [27]:
mapper3.fit_transform(data_3)

array([[-1.22474487],
       [ 0.        ],
       [ 1.22474487]])

### Columns that don't need any transformation
- Only columns that are listed in the DataFrameMapper are kept. To keep a column without applying any transformation to it, use `None` as the transformer:

In [30]:
# `None` as the transformer keeps 'children' as-is; df_out=True makes the
# mapper return a DataFrame.
# NOTE(review): this rebinds `mapper3`, which an earlier cell used for the
# 'age' imputation mapper.
mapper3 = DataFrameMapper([
     ('pet', sklearn.preprocessing.LabelBinarizer()),
     ('children', None)
 ], df_out=True)

In [31]:
np.round(mapper3.fit_transform(data.copy()))

Unnamed: 0,pet_cat,pet_dog,pet_fish,children
0,1,0,0,4.0
1,0,1,0,6.0
2,0,1,0,3.0
3,0,0,1,3.0
4,1,0,0,2.0
5,0,1,0,3.0
6,1,0,0,5.0
7,0,0,1,4.0


### Applying a default transformer
- A default transformer can be applied to the columns that are not explicitly selected by passing it as the `default` argument to the mapper.
- Using `default=False` (the default) drops unselected columns. Using `default=None` passes the unselected columns through unchanged.

In [37]:
# Columns of the input that are not named here ('salary' in `data`) are handled
# by the `default` transformer, a StandardScaler; in the recorded output it is
# the last column.
mapper4 = DataFrameMapper([
     ('pet', sklearn.preprocessing.LabelBinarizer()),
     ('children', None)
 ], default=sklearn.preprocessing.StandardScaler())

In [38]:
data

Unnamed: 0,pet,children,salary
0,cat,4.0,90.0
1,dog,6.0,24.0
2,dog,3.0,44.0
3,fish,3.0,27.0
4,cat,2.0,32.0
5,dog,3.0,59.0
6,cat,5.0,36.0
7,fish,4.0,27.0


In [39]:
np.round(mapper4.fit_transform(data.copy()), 1)

array([[ 1. ,  0. ,  0. ,  4. ,  2.3],
       [ 0. ,  1. ,  0. ,  6. , -0.9],
       [ 0. ,  1. ,  0. ,  3. ,  0.1],
       [ 0. ,  0. ,  1. ,  3. , -0.7],
       [ 1. ,  0. ,  0. ,  2. , -0.5],
       [ 0. ,  1. ,  0. ,  3. ,  0.8],
       [ 1. ,  0. ,  0. ,  5. , -0.3],
       [ 0. ,  0. ,  1. ,  4. , -0.7]])

In [40]:
# NOTE(review): combining df_out=True with a default transformer fails in the
# recorded run with "ValueError: Can not use df_out with sparse or default".
# The cell is kept to show that limitation of the sklearn-pandas used for that run.
mapper4 = DataFrameMapper([
     ('pet', sklearn.preprocessing.LabelBinarizer()),
     ('children', None)
 ], df_out=True, default=sklearn.preprocessing.StandardScaler())

ValueError: Can not use df_out with sparse or default

### Same transformer for multiple columns
- The package provides the `gen_features` function, which accepts a list of columns and a feature transformer class (or a list of classes) and generates a feature definition accepted by DataFrameMapper.
- For example, consider a dataset with three categorical columns, 'col1', 'col2', and 'col3'. To label-encode each of them, one could pass the column names and the `LabelEncoder` transformer class to the generator, and then use the returned definition as the features argument for DataFrameMapper:

In [41]:
from sklearn_pandas import gen_features

# Generate one feature definition per listed column from the transformer
# class(es) in `classes`; the recorded result is a list of
# ('colN', [LabelEncoder()]) tuples.
feature_def = gen_features(
    columns=['col1', 'col2', 'col3'],
    classes=[sklearn.preprocessing.LabelEncoder]
)

In [42]:
feature_def

[('col1', [LabelEncoder()]),
 ('col2', [LabelEncoder()]),
 ('col3', [LabelEncoder()])]

In [44]:
mapper5 = DataFrameMapper(feature_def)
mapper5

In [45]:
# Three small categorical columns (strings and booleans) to label-encode.
data5 = pd.DataFrame(
    {
        'col1': ['yes', 'no', 'yes'],
        'col2': [True, False, False],
        'col3': ['one', 'two', 'three'],
    }
)

In [46]:
mapper5.fit_transform(data5)

array([[1, 1, 0],
       [0, 0, 2],
       [1, 0, 1]], dtype=int64)

- You can also specify a global prefix or suffix for the generated transformed column names using the `prefix` and `suffix` parameters:

In [47]:
# NOTE(review): the recorded run of this cell fails with
# "TypeError: gen_features() got an unexpected keyword argument 'prefix'", so the
# sklearn-pandas used for that run does not accept `prefix` here — presumably a
# newer release is needed; TODO confirm.
feature_def = gen_features(
     columns=['col1', 'col2', 'col3'],
     classes=[sklearn.preprocessing.LabelEncoder],
     prefix="lblencoder_"
 )

TypeError: gen_features() got an unexpected keyword argument 'prefix'

In [None]:
# Disabled continuation of the prefix example: it needs a `feature_def` built
# with `prefix=`, and the recorded run of that gen_features call raises TypeError.
# mapper5 = DataFrameMapper(feature_def)
# data5 = pd.DataFrame({
#      'col1': ['yes', 'no', 'yes'],
#      'col2': [True, False, False],
#      'col3': ['one', 'two', 'three']
#  })
# _ = mapper5.fit_transform(data5)
# mapper5.transformed_names_
# Presumably the intended output once `prefix` is accepted:
# ['lblencoder_col1', 'lblencoder_col2', 'lblencoder_col3']

### Feature selection and other supervised transformations
- `DataFrameMapper` supports transformers that require both `X` and `y` arguments. An example of this is feature selection. Treating the 'pet' column as the target, we will select the column that best predicts it.

In [48]:
from sklearn.feature_selection import SelectKBest, chi2

# Supervised feature selection over two columns: SelectKBest(chi2, k=1) is given
# both 'children' and 'salary'. The target is supplied later as the second
# argument of `fit_transform`.
mapper_fs = DataFrameMapper([(['children','salary'], SelectKBest(chi2, k=1))])
mapper_fs

In [49]:
mapper_fs.fit_transform(data[['children','salary']], data['pet'])

array([[90.],
       [24.],
       [44.],
       [27.],
       [32.],
       [59.],
       [36.],
       [27.]])