# In this notebook, we explore the ColumnTransformer on Titanic dataset. The key objective is to show you how to access feature names after being encoded in the pipeline using `OneHotEncoder` function

In [18]:
from sklearn.datasets import fetch_openml

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [19]:
#import titanic dataset
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

In [20]:
X.head(5)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


# Identify numerical vs categorical columns

In [21]:
num_features=['age','fare']
cat_features=['pclass','pclass','embarked']

#Now creating pipeline for each type of features

In [44]:
#our num_transformer first imputes missing values and then scale them
num_transformer = Pipeline(steps=[
                    ('imputer',SimpleImputer(strategy='median')),
                    ('scaler',StandardScaler())
])


#Our cat_transformer will do onehotencoding
cat_transformer = Pipeline(steps=[
                                  ('imputer', SimpleImputer(strategy='most_frequent')),
                                  ('OneHotEncoder',OneHotEncoder(handle_unknown='ignore'))
                                  ])


#Define columnTransformer object


In [38]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)])

#Ready to train a model - lets use our friendly Logit

In [43]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

NameError: ignored

# Lets see feature names

In [25]:
ohe_feature_names = clf['preprocessor'].transformers_[1][1]['OneHotEncoder'].get_feature_names(cat_features)
ohe_feature_names

array(['pclass_1.0', 'pclass_2.0', 'pclass_3.0', 'pclass_1.0',
       'pclass_2.0', 'pclass_3.0', 'embarked_C', 'embarked_Q',
       'embarked_S'], dtype=object)

In [26]:
# Your numerical features did not literally changed, but in case you want to get them from the Pipeline
num_feature_names = clf['preprocessor'].transformers_[0][2]
num_feature_names

['age', 'fare']

In [27]:
clf['classifier'].coef_

array([[-0.48835412,  0.23843633,  0.39919666,  0.04182808, -0.4411823 ,
         0.39919666,  0.04182808, -0.4411823 ,  0.17328065,  0.18031678,
        -0.35375498]])

# Note
The order of the columns in the transformed feature matrix follows the order of how the columns are specified in the transformers list. Columns of the original feature matrix that are not specified are dropped from the resulting transformed feature matrix, unless specified in the passthrough keyword. Those columns specified with passthrough are added at the right to the output of the transformers.

Ref: [ColumnTransformer Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html)

In [28]:
clf['classifier']

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)