# ColumnTransformer
  -  class in the scikit-learn Python machine learning library
      -  allows you to selectively apply data preparation transforms. 
         -  it allows you to apply a specific transform or sequence of transforms to just the numerical columns
            - and a separate sequence of transforms to just the categorical columns.
         -  Each transformer is a three-element tuple (Name, Object, Columns)   
            - Example
            - transformer = ColumnTransformer(transformers=[('cat', OneHotEncoder(), [0, 1])]) 
            - Name: a label that identifies the transformer, for example:
              - cat: Category
              - num: Numerical
            - Object: the transform to apply
              - OneHotEncoder()
              - MinMaxScaler()
            - Columns: the column indices to apply it to.
              - [0, 1] 

      - https://machinelearningmastery.com/columntransformer-for-numerical-and-categorical-data/

In [1]:





# example of using the ColumnTransformer for the Abalone dataset
from numpy import mean
from numpy import std
from numpy import absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

In [3]:


# Load the Abalone CSV from GitHub; header=None makes pandas label the
# columns with integers 0..8 instead of reading names from the first row.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/abalone.csv'
dataframe = read_csv(url, header=None)
# The last column is the regression target; every other column is an input.
last_ix = dataframe.shape[1] - 1
y = dataframe[last_ix]
X = dataframe.drop(last_ix, axis=1)
print(X.shape, y.shape)

(4177, 8) (4177,)


In [9]:
dataframe.head() # preview the first rows; columns carry integer labels because the CSV was read with header=None

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [4]:
# Partition the feature columns by dtype so each group can get its own transform.
# On this dataset column 0 is the only categorical one and columns 1-7 are numeric.
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns

In [5]:
numerical_ix  # notebook display: labels of the numeric feature columns

Int64Index([1, 2, 3, 4, 5, 6, 7], dtype='int64')

In [11]:
categorical_ix  # notebook display: labels of the categorical feature columns

Int64Index([0], dtype='int64')

In [7]:
# One (name, transformer, columns) tuple per column group:
# one-hot encode the categorical columns, min-max scale the numeric ones.
t = [
    ('cat', OneHotEncoder(), categorical_ix),
    ('num', MinMaxScaler(), numerical_ix),
]
col_transform = ColumnTransformer(transformers=t)

In [8]:
# Regressor under evaluation: support vector regression with an RBF kernel.
model = SVR(kernel='rbf', gamma='scale', C=100)

# Chain the column preparation and the model into a single estimator so that
# cross-validation evaluates them together.
pipeline = Pipeline(steps=[('prep', col_transform), ('m', model)])

# 10-fold cross-validation, shuffled with a fixed seed for repeatable splits.
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# Score every fold with negated mean absolute error, using all available cores.
scores = cross_val_score(
    pipeline,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)

# The scorer reports negated MAE, so take absolute values to get plain MAE.
scores = absolute(scores)

# Report the mean and standard deviation of MAE across the folds.
print('MAE: %.3f (%.3f)' % (mean(scores), std(scores)))

MAE: 1.465 (0.047)
