# Column Transformer:
 
  Applies transformers to columns of an array or pandas DataFrame. This estimator allows different columns or column subsets of the input to be transformed separately and the features generated by each transformer will be concatenated to form a single feature space.
  
   **For example:**
  
![](https://imgur.com/FJ9nlnQ.png)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print one path per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Preparation

In [None]:
import pandas as pd
# Read the labelled and unlabelled home-loan CSVs in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/home-loan/train.csv',
                     '/kaggle/input/home-loan/test.csv')
)

# Quick sanity check: (rows, columns) of each frame and the dtype of every
# training column.
print(train.shape, test.shape)
print(train.dtypes)

In [None]:
# Drop Loan_ID from both frames (presumably a row identifier with no
# predictive value -- confirm against the dataset description).
train = train.drop(columns=['Loan_ID'])
test = test.drop(columns=['Loan_ID'])

# Most frequent value of each column, learned from the training data only.
# The same values are used to fill both frames so that the test rows are
# preprocessed exactly like the rows the model is fitted on; previously the
# test frame was filled with its own modes, which can differ from the
# training ones.
fill_values = train.apply(lambda col: col.value_counts().index[0])

# fillna with a Series fills column-by-column by label; labels that do not
# exist in the target frame (e.g. the label column in `test`) are ignored.
train = train.fillna(fill_values)
test = test.fillna(fill_values)

In [None]:
from sklearn.model_selection import train_test_split

# `y` holds the name of the label column; `X` holds the names of all other
# columns, which are used as predictors.
y = 'Loan_Status'
feature_set = train.drop(columns=[y])
X = feature_set.columns

# Hold out part of the labelled data for evaluation (default split size,
# fixed seed for a repeatable partition).
X_train, X_test, y_train, y_test = train_test_split(train[X], train[y], random_state=0)

# Column Transformer

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer, OneHotEncoder

# Positional index of each column to one-hot encode, mapped to the explicit
# category order the encoder should use for that column. Dict insertion order
# keeps indices and category lists aligned.
categorical_levels = {
    0: ['Male', 'Female'],
    1: ['Yes', 'No'],
    2: ['0', '1', '2', '3+'],
    3: ['Graduate', 'Not Graduate'],
    4: ['No', 'Yes'],
    10: ['Semiurban', 'Urban', 'Rural'],
}

# Positional indices of the remaining columns, which are L1-normalised
# row-wise by Normalizer.
numeric_columns = [5, 6, 7, 8, 9]

one_hot = OneHotEncoder(categories=list(categorical_levels.values()))
row_l1_norm = Normalizer(norm='l1')

# The outputs of both transformers are concatenated into one feature matrix.
colT = ColumnTransformer([
    ("dummy_col", one_hot, list(categorical_levels)),
    ("norm", row_l1_norm, numeric_columns),
])

In [None]:
# Fit the column transformer on the training features and replace X_train
# with the transformed feature matrix.
# NOTE: X_train is overwritten in place, so re-running this cell without
# re-running the split feeds already-transformed data back into colT.
X_train = colT.fit_transform(X_train)
# Bare expression: as the last line of a notebook cell this displays the
# transformed matrix.
X_train

In [None]:
# Apply the already-fitted transformer to the held-out features (transform
# only, no refit) and replace X_test with the result.
X_test = colT.transform(X_test)

# Training the Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Seed the forest so the fitted model and the report below are reproducible,
# in line with the random_state=0 used for the train/validation split.
random_forest = RandomForestClassifier(random_state=0)
random_forest.fit(X_train, y_train)

# Predict on the held-out rows that were transformed with the fitted colT.
y_pred = random_forest.predict(X_test)

# Let classification_report name the rows after the actual class labels.
# The report lists classes in sorted label order and applies `target_names`
# positionally, so a hand-written list such as ['Y', 'N'] is attached to the
# sorted labels ('N', 'Y') and swaps the names of the two rows.
print(classification_report(y_test, y_pred))

# Predicting new data

In [None]:
# Take the first 15 rows of the unlabelled test frame and run them through
# the fitted column transformer.
test_samp = colT.transform(test.head(15))

# Bare expression: as the last line of a notebook cell this displays the
# predicted labels for those rows.
random_forest.predict(test_samp)

***Pros:***

* Get rid of handling details between two stages.
* Code is easy to maintain.
* Use different feature transformers without separating your code into several parts and composing them.

***Cons:***

* Cannot apply different transformations to different features.
* Cannot directly pass a pandas DataFrame and use dict-like access to the data in your pipeline.
* Need to take care of many details of the numpy / scipy interface.


# References:

1. https://medium.com/vickdata/easier-machine-learning-with-the-new-column-transformer-from-scikit-learn-c2268ea9564c
 
2. https://towardsdatascience.com/columntransformer-in-scikit-for-labelencoding-and-onehotencoding-in-machine-learning-c6255952731b

