# Column transformer

It is a class provided by the scikit-learn library in Python.

It is used for applying different preprocessing steps to different columns or subsets of columns in a dataset.

It allows you to define a pipeline of transformers that are applied to specific columns of your data. 

Useful when dealing with datasets that have heterogeneous data types or require different preprocessing steps for different features.

In [86]:
#normal way to do simple imputer, ordinal encoding, one hot encoding in one data frame and merge it. This process is long.

import numpy as np
import pandas as pd

def read_csv(file_name):
    """Load a CSV file into a pandas DataFrame.

    Args:
        file_name: Path (or any source accepted by ``pandas.read_csv``) of the
            CSV file to load.

    Returns:
        The parsed contents as a ``pandas.DataFrame``.
    """
    return pd.read_csv(file_name)

def split_data(data):
    """Split ``data`` into train/test features and the ``has_covid`` target.

    Args:
        data: DataFrame containing the feature columns plus a ``has_covid``
            target column.

    Returns:
        A 4-tuple ``(X_train, X_test, Y_train, Y_test)`` as produced by
        ``sklearn.model_selection.train_test_split`` with ``random_state=2``
        (the split size is left at the scikit-learn default).

    Raises:
        KeyError: If ``data`` has no ``has_covid`` column.
    """
    from sklearn.model_selection import train_test_split

    # Both features and target come from the ``data`` argument, so the rows
    # stay aligned and the function does not depend on any notebook global.
    features = data.drop(columns=['has_covid'])
    target = data['has_covid']

    X_train, X_test, Y_train, Y_test = train_test_split(features,
                                                        target,
                                                        random_state=2)
    return X_train, X_test, Y_train, Y_test


# adding simple imputer to 1 column fever
def add_simple_imputer(X_train,X_test):
    """Fill missing ``fever`` values with a ``SimpleImputer``.

    The imputer is created with its default settings and fitted on the
    training split only; the same fitted imputer is then applied to both
    splits so no information from the test split is used for fitting.

    Args:
        X_train: Training features; must contain a ``fever`` column.
        X_test: Test features; must contain a ``fever`` column.

    Returns:
        ``(X_train_fever, X_test_fever)``: the imputed ``fever`` column of
        each split, as returned by ``SimpleImputer.transform``.
    """
    from sklearn.impute import SimpleImputer

    fever_only = ['fever']
    imputer = SimpleImputer()
    imputer.fit(X_train[fever_only])

    imputed_train = imputer.transform(X_train[fever_only])
    imputed_test = imputer.transform(X_test[fever_only])
    return imputed_train, imputed_test
        
# Ordinal encoding on cough column
def ordinal_encoder(X_train,X_test):
    """Ordinal-encode the ``cough`` column with the order ``Mild`` < ``Strong``.

    The encoder is fitted on the training split and then applied to both
    splits.

    Args:
        X_train: Training features; must contain a ``cough`` column.
        X_test: Test features; must contain a ``cough`` column.

    Returns:
        ``(X_train_cough, X_test_cough)``: the encoded ``cough`` column of
        each split, as returned by ``OrdinalEncoder.transform``.
    """
    from sklearn.preprocessing import OrdinalEncoder

    # Explicit category order: position in this list becomes the code.
    severity_levels = [['Mild', 'Strong']]
    encoder = OrdinalEncoder(categories=severity_levels)
    encoder.fit(X_train[['cough']])

    encoded_train = encoder.transform(X_train[['cough']])
    encoded_test = encoder.transform(X_test[['cough']])
    return encoded_train, encoded_test
       
# One Hot Encoding on columns gender,city
def one_hot_encoders(X_train,X_test):
    """One-hot encode the ``gender`` and ``city`` columns.

    The encoder drops the first category of each feature (``drop='first'``)
    and produces dense output. It is fitted on the training split and then
    applied to both splits.

    Args:
        X_train: Training features; must contain ``gender`` and ``city``.
        X_test: Test features; must contain ``gender`` and ``city``.

    Returns:
        ``(X_train_gender_city, X_test_gender_city)``: the dense one-hot
        encoded columns of each split.
    """
    from sklearn.preprocessing import OneHotEncoder

    # scikit-learn renamed ``sparse`` to ``sparse_output`` (deprecated in 1.2,
    # removed in 1.4). Prefer the new keyword and fall back to the old one so
    # the function works on both older and current releases.
    try:
        ohe = OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(drop='first', sparse=False)

    nominal_columns = ['gender', 'city']
    ohe.fit(X_train[nominal_columns])
    X_train_gender_city = ohe.transform(X_train[nominal_columns])
    X_test_gender_city = ohe.transform(X_test[nominal_columns])
    return X_train_gender_city, X_test_gender_city

# extract age column
def extract_age(X_train,X_test):
    """Return the columns that need no encoding as NumPy arrays.

    The ``gender``, ``fever``, ``cough`` and ``city`` columns are dropped from
    each split and whatever remains is returned via ``.values``. With the
    ``covid_toy`` feature layout used in this file, the remaining column is
    ``age``.

    Args:
        X_train: Training features containing the four dropped columns.
        X_test: Test features containing the four dropped columns.

    Returns:
        ``(X_train_age, X_test_age)``: 2-D arrays of the remaining column(s).
    """
    handled_elsewhere = ['gender', 'fever', 'cough', 'city']
    remaining_train, remaining_test = (
        split.drop(columns=handled_elsewhere).values
        for split in (X_train, X_test)
    )
    return remaining_train, remaining_test

#concatenate all the columns
def concatenate_all_column(X_train_age,X_train_fever,X_train_gender_city,X_train_cough,
                           X_test_age,X_test_fever,X_test_gender_city,X_test_cough):
    """Join the separately transformed column groups into one matrix per split.

    For each split the pieces are concatenated column-wise (``axis=1``) in the
    order: age, fever, gender/city, cough.

    Args:
        X_train_age, X_train_fever, X_train_gender_city, X_train_cough:
            2-D arrays for the training split, all with the same row count.
        X_test_age, X_test_fever, X_test_gender_city, X_test_cough:
            2-D arrays for the test split, all with the same row count.

    Returns:
        ``(X_train_transformed, X_test_transformed)``: the combined arrays.
    """
    train_pieces = (X_train_age, X_train_fever, X_train_gender_city, X_train_cough)
    test_pieces = (X_test_age, X_test_fever, X_test_gender_city, X_test_cough)

    merged_train = np.concatenate(train_pieces, axis=1)
    merged_test = np.concatenate(test_pieces, axis=1)
    return merged_train, merged_test


#direct way to transform data by Column Transformer
def column_Transformer(X_train,X_test):
    """Apply all per-column preprocessing in one ``ColumnTransformer``.

    The transformer combines three steps:

    * ``t1``: ``SimpleImputer()`` on ``fever``
    * ``t2``: ``OrdinalEncoder`` on ``cough`` with the order ``Mild`` < ``Strong``
    * ``t3``: dense ``OneHotEncoder(drop='first')`` on ``gender`` and ``city``

    Columns not named in any step are passed through unchanged
    (``remainder='passthrough'``). The transformer is fitted on the training
    split only and then applied to both splits.

    Args:
        X_train: Training features containing ``fever``, ``cough``,
            ``gender`` and ``city``.
        X_test: Test features with the same columns.

    Returns:
        ``(new_X_train, new_X_test)``: the transformed splits.
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OrdinalEncoder
    from sklearn.preprocessing import OneHotEncoder

    # scikit-learn renamed ``sparse`` to ``sparse_output`` (deprecated in 1.2,
    # removed in 1.4). Prefer the new keyword and fall back to the old one so
    # the function works on both older and current releases.
    try:
        one_hot = OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        one_hot = OneHotEncoder(drop='first', sparse=False)

    trans = ColumnTransformer(
        transformers=[
            ('t1', SimpleImputer(), ['fever']),
            ('t2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
            ('t3', one_hot, ['gender', 'city']),
        ],
        remainder='passthrough',
    )

    # Fit on the training split only, then reuse the fitted transformer.
    trans.fit(X_train)
    new_X_train = trans.transform(X_train)
    new_X_test = trans.transform(X_test)
    return new_X_train, new_X_test


# Driver: load the toy dataset, split it, then preprocess it twice -- once
# step by step and once with a single ColumnTransformer.
# 'covid_toy.csv' is a relative path, so it is resolved against the current
# working directory.
data = read_csv('covid_toy.csv')
print("Shape of actual Data : ", data.shape)
print("Columns of actual Data : ", data.columns)

# Separate the features from the 'has_covid' target and create the train/test split.
X_train,X_test,Y_train,Y_test = split_data(data)

# Manual approach: run the simple imputer, ordinal encoding and one-hot encoding
# on their columns separately, then merge the pieces into one array per split.
# This process is long.
X_train_fever, X_test_fever = add_simple_imputer(X_train,X_test)
X_train_cough, X_test_cough = ordinal_encoder(X_train,X_test)
X_train_gender_city, X_test_gender_city = one_hot_encoders(X_train,X_test)
X_train_age, X_test_age = extract_age(X_train,X_test)
X_train_transformed, X_test_transformed = concatenate_all_column(X_train_age,X_train_fever,X_train_gender_city,X_train_cough,
                                                                X_test_age,X_test_fever,X_test_gender_city,X_test_cough)
print("Long way Data transform successufully")


# Direct approach: the same preprocessing done in one call via ColumnTransformer.
new_X_train, new_X_test = column_Transformer(X_train,X_test)
print("Direct way Data transform successufully")

Shape of actual Data :  (100, 6)
Columns of actual Data :  Index(['age', 'gender', 'fever', 'cough', 'city', 'has_covid'], dtype='object')
Long way Data transform successufully
Direct way Data transform successufully
