In [1]:
import warnings
warnings.simplefilter("ignore", FutureWarning)

# scikit-learn
## Example: Pandas DataFrame output for sklearn transformer

### Outline
- Example 1a: iris dataset (`StandardScaler` transformation)
- Example 1b: iris dataset (`PolynomialFeatures` transformation)
- Example 2: titanic dataset (with a Pipeline)

### Prepared by
- Andreas Mueller
- Reshama Shaikh

### Date
November 2022

# Example 1a: iris dataset (`StandardScaler` transformation)

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load iris with the features returned as a pandas DataFrame.
X, y = load_iris(return_X_y=True, as_frame=True)

# Stratified train/test split; the fixed seed keeps the example reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)

# Show the training features (note the named columns).
X_train

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
60,5.0,2.0,3.5,1.0
1,4.9,3.0,1.4,0.2
8,4.4,2.9,1.4,0.2
93,5.0,2.3,3.3,1.0
106,4.9,2.5,4.5,1.7
...,...,...,...,...
66,5.6,3.0,4.5,1.5
29,4.7,3.2,1.6,0.2
130,7.4,2.8,6.1,1.9
141,6.9,3.1,5.1,2.3


In [3]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Create the scaler and fit it on the training data.
# (`fit` returns the estimator itself, so construction and fitting can be chained.)
scaler = StandardScaler().fit(X_train)

# Apply the fitted scaler to the test data.
X_test_scaled = scaler.transform(X_test)

# Let's look at the output.
# Input data:  X_train and X_test are pandas DataFrames
# Output data: X_test_scaled is a NumPy array (the column names are gone)

X_test_scaled

array([[-0.89426443,  0.7983005 , -1.27141116, -1.32760471],
       [-1.24446588, -0.08694362, -1.32740725, -1.45907396],
       [-0.66079679,  1.46223359, -1.27141116, -1.32760471],
       [-0.89426443,  0.57698947, -1.15941899, -0.93319694],
       [-0.42732916, -1.4148098 , -0.03949724, -0.27585067],
       [-0.19386152, -0.52956568,  0.40847146,  0.1185571 ],
       [-0.19386152, -0.52956568,  0.18448711,  0.1185571 ],
       [-1.12773206,  0.13436741, -1.27141116, -1.45907396],
       [ 0.15633993,  0.7983005 ,  0.40847146,  0.51296486],
       [ 1.55714575, -0.08694362,  1.1364206 ,  0.51296486],
       [ 0.50654139,  0.57698947,  1.24841277,  1.69618815],
       [-0.31059534, -0.52956568,  0.63245581,  1.03884188],
       [-0.0771277 , -0.75087671,  0.18448711, -0.27585067],
       [ 0.50654139, -0.30825465,  1.02442842,  0.77590337],
       [-0.42732916, -1.19349877,  0.12849102,  0.1185571 ],
       [-0.89426443,  1.46223359, -1.27141116, -1.0646662 ],
       [-1.47793352,  0.

# Example 1b: iris dataset (`PolynomialFeatures` transformation)

In [4]:
# We transform the training data (X_train) to polynomial features.
# Output: with the default configuration the result is a plain NumPy array,
# so nothing tells us which input columns produced each new feature.
PolynomialFeatures().fit_transform(X_train)

array([[ 1.  ,  5.  ,  2.  , ..., 12.25,  3.5 ,  1.  ],
       [ 1.  ,  4.9 ,  3.  , ...,  1.96,  0.28,  0.04],
       [ 1.  ,  4.4 ,  2.9 , ...,  1.96,  0.28,  0.04],
       ...,
       [ 1.  ,  7.4 ,  2.8 , ..., 37.21, 11.59,  3.61],
       [ 1.  ,  6.9 ,  3.1 , ..., 26.01, 11.73,  5.29],
       [ 1.  ,  6.4 ,  2.7 , ..., 28.09, 10.07,  3.61]])

### Use `transform_output`

In [5]:
from sklearn import set_config
# Switch the global scikit-learn configuration so that transformers
# return pandas DataFrames instead of NumPy arrays.
set_config(transform_output="pandas")

In [6]:
# Output: same call as before, but the result is now a pandas DataFrame that
# keeps the row index of X_train and has the generated feature names as columns.
PolynomialFeatures().fit_transform(X_train)

Unnamed: 0,1,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),sepal length (cm)^2,sepal length (cm) sepal width (cm),sepal length (cm) petal length (cm),sepal length (cm) petal width (cm),sepal width (cm)^2,sepal width (cm) petal length (cm),sepal width (cm) petal width (cm),petal length (cm)^2,petal length (cm) petal width (cm),petal width (cm)^2
60,1.0,5.0,2.0,3.5,1.0,25.00,10.00,17.50,5.00,4.00,7.00,2.00,12.25,3.50,1.00
1,1.0,4.9,3.0,1.4,0.2,24.01,14.70,6.86,0.98,9.00,4.20,0.60,1.96,0.28,0.04
8,1.0,4.4,2.9,1.4,0.2,19.36,12.76,6.16,0.88,8.41,4.06,0.58,1.96,0.28,0.04
93,1.0,5.0,2.3,3.3,1.0,25.00,11.50,16.50,5.00,5.29,7.59,2.30,10.89,3.30,1.00
106,1.0,4.9,2.5,4.5,1.7,24.01,12.25,22.05,8.33,6.25,11.25,4.25,20.25,7.65,2.89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,1.0,5.6,3.0,4.5,1.5,31.36,16.80,25.20,8.40,9.00,13.50,4.50,20.25,6.75,2.25
29,1.0,4.7,3.2,1.6,0.2,22.09,15.04,7.52,0.94,10.24,5.12,0.64,2.56,0.32,0.04
130,1.0,7.4,2.8,6.1,1.9,54.76,20.72,45.14,14.06,7.84,17.08,5.32,37.21,11.59,3.61
141,1.0,6.9,3.1,5.1,2.3,47.61,21.39,35.19,15.87,9.61,15.81,7.13,26.01,11.73,5.29


---

# Example 2: titanic dataset (with a Pipeline)

In [7]:
# Let's go back to the default settings (transformers return NumPy arrays again)
set_config(transform_output="default")

In [8]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# Fetch version 1 of the "titanic" dataset from OpenML, with the features as a pandas DataFrame.
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# NOTE(review): unlike the iris split earlier in this notebook, no `random_state` is
# passed here, so the split -- and every output derived from it -- changes from run
# to run. Consider seeding it once the downstream cells are confirmed to work with that seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [9]:
# Raw training features: a mix of numeric and text columns, several with missing values
X_train

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
868,3.0,"Holm, Mr. John Fredrik Alexander",male,43.00,0.0,0.0,C 7075,6.4500,,S,,,
974,3.0,"Lobb, Mr. William Arthur",male,30.00,1.0,0.0,A/5. 3336,16.1000,,S,,,
699,3.0,"Cacic, Mr. Luka",male,38.00,0.0,0.0,315089,8.6625,,S,,,Croatia
1044,3.0,"Murphy, Miss. Nora",female,,0.0,0.0,36568,15.5000,,Q,16,,
545,2.0,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30.00,3.0,0.0,31027,21.0000,,S,,,"Elizabeth, NJ"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,3.0,"Peacock, Master. Alfred Edward",male,0.75,1.0,1.0,SOTON/O.Q. 3101315,13.7750,,S,,,
1123,3.0,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0.0,2.0,2668,22.3583,,C,D,,
112,1.0,"Fortune, Miss. Ethel Flora",female,28.00,3.0,2.0,19950,263.0000,C23 C25 C27,S,10,,"Winnipeg, MB"
1303,3.0,"Yousseff, Mr. Gerious",male,,0.0,0.0,2627,14.4583,,C,,,


In [10]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

# Preprocessing (NOTE: we are using a subset of the features, not all the columns):
#   - continuous variables ("age", "fare"): first impute missing data
#     (check the `SimpleImputer` documentation for the default imputation method),
#     then standardize with `StandardScaler`
#   - categorical variables ("embarked", "sex", "pclass"): `OneHotEncoder`
#
# `sparse_output=False` asks the encoder for a dense array. It replaces the `sparse`
# keyword, which is deprecated since scikit-learn 1.2 and removed in 1.4. Dense output
# is also what the pandas output mode used later in this notebook requires.
# `verbose_feature_names_out=False` keeps the output feature names free of the
# "<transformer name>__" prefix.

ct = make_column_transformer(
    (make_pipeline(SimpleImputer(), StandardScaler()), ["age", "fare"]),
    (OneHotEncoder(sparse_output=False), ["embarked", "sex", "pclass"]),
    verbose_feature_names_out=False,
)

# Note: click on pipeline elements to see more details
clf = make_pipeline(ct, LogisticRegression())
clf

In [11]:
# Fit the whole pipeline on the training split, then score it on the held-out test split.
# (`fit` returns the fitted pipeline, so the two calls can be chained.)
clf.fit(X_train, y_train).score(X_test, y_test)

0.7865853658536586

In [12]:
# Let's remove the last step in the pipeline (which is LogisticRegression()) & transform the X_test data.
# With the default configuration the result is a NumPy array, so the column names are lost.

clf[:-1].transform(X_test)

array([[-0.44376706, -0.50468405,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.44376706,  0.05957877,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.47033007, -0.34728169,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.39415531,  0.45300503,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.01328151, -0.14910643,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.82464087, -0.39564025,  0.        , ...,  0.        ,
         1.        ,  0.        ]])

In [13]:
import pandas as pd

In [14]:
# untransformed data (has all the original columns)
X_test.head(3)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
720,3.0,"Colbert, Mr. Patrick",male,24.0,0.0,0.0,371109,7.25,,Q,,,"Co Limerick, Ireland Sherbrooke, PQ"
494,2.0,"Mallet, Mrs. Albert (Antoinette Magnin)",female,24.0,1.0,1.0,S.C./PARIS 2079,37.0042,,C,10,,"Paris / Montreal, PQ"
968,3.0,"Lindell, Mr. Edvard Bengtsson",male,36.0,1.0,0.0,349910,15.55,,S,A,,


In [15]:
# Raw counts per port of embarkation in the test set; dropna=False lists missing values too.
X_test["embarked"].value_counts(dropna=False)

S      222
C       68
Q       37
NaN      1
Name: embarked, dtype: int64

In [16]:
# Raw counts per sex in the test set; dropna=False would list missing values too.
X_test["sex"].value_counts(dropna=False)

male      211
female    117
Name: sex, dtype: int64

In [17]:
# Raw counts per passenger class in the test set; dropna=False would list missing values too.
X_test["pclass"].value_counts(dropna=False)

3.0    184
1.0     80
2.0     64
Name: pclass, dtype: int64

In [18]:
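# Notes on the transformed data shown in the next cell
# a) The first two columns are numeric and come from our transformer: they should be "age" and "fare"
# b) The remaining 9 columns are the one-hot encoded categories:
#    embarked (4 possible values, including NaN); sex (2 possible values here); pclass (3 possible values)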

In [19]:
# transformed data, wrapped in a DataFrame by hand:
# the columns are only labelled by position and the original row index of X_test is lost
pd.DataFrame(clf[:-1].transform(X_test))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.443767,-0.504684,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,-0.443767,0.059579,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.470330,-0.347282,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,2.146175,2.268110,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,1.536777,1.376085,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
323,-1.662563,0.054758,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
324,2.450874,0.874957,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
325,0.394155,0.453005,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
326,0.013282,-0.149106,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0


In [20]:
# Switch transformer output to pandas DataFrames again
set_config(transform_output="pandas")

In [21]:
# Refit the pipeline with pandas output enabled.
clf.fit(X_train, y_train)

# Run every step except the final LogisticRegression on the test data.
X_test_transformed = clf[:-1].transform(X_test)

# The result is a DataFrame that keeps the row index of X_test and has named feature columns.
X_test_transformed

Unnamed: 0,age,fare,embarked_C,embarked_Q,embarked_S,embarked_nan,sex_female,sex_male,pclass_1.0,pclass_2.0,pclass_3.0
720,-0.443767,-0.504684,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
494,-0.443767,0.059579,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
968,0.470330,-0.347282,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
139,2.146175,2.268110,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
96,1.536777,1.376085,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
385,-1.662563,0.054758,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
284,2.450874,0.874957,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
257,0.394155,0.453005,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
194,0.013282,-0.149106,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0


In [22]:
import numpy as np
import sklearn
# Library versions used for this run, one per line: pandas, numpy, scikit-learn.
print(pd.__version__, np.__version__, sklearn.__version__, sep="\n")

1.3.3
1.21.2
1.2.dev0


---

# Resources

## Documentation
- Documentation of `set_output` API:  https://scikit-learn.org/dev/auto_examples/miscellaneous/plot_set_output.html#sphx-glr-auto-examples-miscellaneous-plot-set-output-py

## Try out the dev version
- scikit-learn 1.2.dev0
- Installing the nightly build:
http://scikit-learn.org/stable/developers/advanced_installation.html

## Report any bugs or issues
#### We'd love to hear both about whether this helps your use cases and any bugs you find!
#### We are also specifically looking for feedback from library authors: what has your experience been, and does this change introduce any unexpected hassles?
Post any issues or bugs here:  https://github.com/scikit-learn/scikit-learn/issues

# FAQs

### Q1: Why did this update take so long?
> There was no established dataframe library when scikit-learn was initially released - and it was released in the context of scientific computing, in which dataframes make less sense. Adding dataframe support later was quite tricky because of the interactions between numpy and pandas, and the lack of annotated sparse formats.

### Q2: Will pandas output be supported also in other estimators?
> This is still a work in progress! Let us know what your use cases are! What would you like to see?

### Q3: Does this mean we could chain Column Transformers while referencing column names?
> Yes

### Q4: When will version 1.2 be released?
> End of 2022!