# Custom Transformer, Feature Union & Pipeline using Scikit Learn

Following illustrate the use of custom transformer, feature union and pipeline use mainly in Scikit-learn.
This provide a fast pipeline for basic data preparation with machine learning.

Example will make use of iris data set and script to perform simple transformation and the classification using linear classifier.
This first part will use pipeline from the Scikit learn.
2nd part will include pipeline with Scikit-learn
To demonstrate the feature union, a custom transformer is done on the columns by simply mulitplying by a scalar on all the columns (dependent columns). Note this does not improve the accuracy due to dependency on the prior columns.

Resources:
http://michelleful.github.io/code-blog//2015/06/20/pipelines/
https://www.kaggle.com/metadist/work-like-a-pro-with-pipelines-and-feature-unions/notebook


In [35]:
# info on the feature union http://michelleful.github.io/code-blog//2015/06/20/pipelines/
# 



import os, sys, datetime
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn import linear_model 
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, FeatureUnion




In [33]:
# custom transformer
class randomtransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass #pass in column name to extract
                        # may pass in vars to use or don need
        
    def helper_func(self):
        """ Use if needed"""

    def fit(self, X, y = None):
        return self
    
    def transform(self, df):
        for col_name in df.columns:
            df[col_name+ "1"] = df[col_name]*2
        return df

        
# Loading data
data = load_iris()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


df = pd.DataFrame(data = X_train, columns=data.feature_names)
print(df.head())
print()


#doing min max scaling
scalar = MinMaxScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.fit_transform(X_test)

#view in pandas
X_train_df = pd.DataFrame(data = X_train, columns=data.feature_names)
X_test_df = pd.DataFrame(data = X_test, columns=data.feature_names)
print(X_train_df.head())
print

# do a random operation with additional columns to test out pipeline by doing a scaling by 2
# replace this by a custom transformer

# for col_name in X_train_df.columns:
#     X_train_df[col_name+ "1"] = X_train_df[col_name]*2
#     X_test_df[col_name+ "1"] = X_test_df[col_name]*2

# Use custom transformer to do the transform
custom_tform =  randomtransformer()
X_train_df = custom_tform.fit_transform(X_train_df)
X_test_df = custom_tform.fit_transform(X_test_df)
    
print(X_train_df.head())
print()



# doing classifier
logreg = linear_model.LogisticRegression(C= 1e5)
logreg.fit(X_train_df, y_train)


#get accuracy
print(accuracy_score(y_train, logreg.predict(X_train_df)))
print(accuracy_score(y_test, logreg.predict(X_test_df)))





   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.7               2.9                4.2               1.3
1                7.6               3.0                6.6               2.1
2                5.6               3.0                4.5               1.5
3                5.1               3.5                1.4               0.2
4                7.7               2.8                6.7               2.0

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0           0.411765          0.409091           0.553571          0.500000
1           0.970588          0.454545           0.982143          0.833333
2           0.382353          0.454545           0.607143          0.583333
3           0.235294          0.681818           0.053571          0.041667
4           1.000000          0.363636           1.000000          0.791667
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0       

In [34]:
# trying out same script with pipeline

# Loading data
data = load_iris()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X_train_df = pd.DataFrame(data=X_train, columns=data.feature_names)
X_test_df = pd.DataFrame(data=X_test, columns=data.feature_names)

# try out pipeline with out feature union
pipeline = ([
            ("min_max_scale", MinMaxScaler()),
            ("clf", linear_model.LogisticRegression(C= 1e5)),            
            ] )

pipe = Pipeline(pipeline)

pipe.fit(X_train_df, y_train)


#get accuracy
print(accuracy_score(y_train, pipe.predict(X_train_df)))
print(accuracy_score(y_test, pipe.predict(X_test_df)))


0.97
0.98


In [45]:
#not doing this way


# trying out same script with pipeline + feature union

# custom transformer
class randomtransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass #pass in column name to extract
                        # may pass in vars to use or don need
        
    def helper_func(self):
        """ Use if needed"""

    def fit(self, X, y = None):
        return self
    
    def transform(self, df):
        for col_name in df.columns:
            df[col_name+ "1"] = df[col_name]*2
        return df

# Loading data
data = load_iris()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X_train_df = pd.DataFrame(data=X_train, columns=data.feature_names)
X_test_df = pd.DataFrame(data=X_test, columns=data.feature_names)

# try out pipeline with out feature union
# pipeline can also take another pipeline
pipeline = ([
             ("feats", FeatureUnion([
                    ("min_max_scale", MinMaxScaler()), 
                    ("random_estimator", randomtransformer()),                            
             ])),
    
            ("clf", linear_model.LogisticRegression(C= 1e5)),            
            ] )

pipe = Pipeline(pipeline)

pipe.fit(X_train_df, y_train)
#During fitting, each of these is fit to the data independently. For transforming data, the transformers are applied in parallel, and the sample vectors they output are concatenated end-to-end into larger vectors.

print(X_train_df.shape)


print(X_train_df.head())
print(X_test_df.head())


pipe.predict(X_train_df)

#get accuracy
#print(accuracy_score(y_train, pipe.predict(X_train_df)))
#print(accuracy_score(y_test, pipe.predict(X_test_df)))#cannot as the X is not being modified???

(100, 8)
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.7               2.9                4.2               1.3   
1                7.6               3.0                6.6               2.1   
2                5.6               3.0                4.5               1.5   
3                5.1               3.5                1.4               0.2   
4                7.7               2.8                6.7               2.0   

   sepal length (cm)1  sepal width (cm)1  petal length (cm)1  \
0                11.4                5.8                 8.4   
1                15.2                6.0                13.2   
2                11.2                6.0                 9.0   
3                10.2                7.0                 2.8   
4                15.4                5.6                13.4   

   petal width (cm)1  
0                2.6  
1                4.2  
2                3.0  
3                0.4  
4               

ValueError: operands could not be broadcast together with shapes (100,8) (4,) (100,8) 

In [55]:
# get intermediate output from pip
#https://stackoverflow.com/questions/48743032/get-intermediate-data-state-in-scikit-learn-pipeline



class Debug(BaseEstimator, TransformerMixin):

    def transform(self, X):
        print(X.shape)
        # what other output you want
        return X

    def fit(self, X, y=None, **fit_params):
        return self

# custom transformer
class randomtransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass #pass in column name to extract
                        # may pass in vars to use or don need
        
    def helper_func(self):
        """ Use if needed"""

    def fit(self, X, y = None):
        return self
    
    def transform(self, df):
        return df.applymap(lambda x: x*2)

# Loading data
data = load_iris()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X_train_df = pd.DataFrame(data=X_train, columns=data.feature_names)
X_test_df = pd.DataFrame(data=X_test, columns=data.feature_names)

# try out pipeline with out feature union
# pipeline can also take another pipeline
pipeline = ([
    
            ("debug0", Debug()),
             ("feats", FeatureUnion([
                    ("min_max_scale", MinMaxScaler()), 
                    ("random_estimator", randomtransformer()),                            
             ])),
            ("debug", Debug()),
            ("clf", linear_model.LogisticRegression(C= 1e5)),            
            ] )

pipe = Pipeline(pipeline)

pipe.fit(X_train_df, y_train)
#During fitting, each of these is fit to the data independently. For transforming data, the transformers are applied in parallel, and the sample vectors they output are concatenated end-to-end into larger vectors.

#get accuracy
print(accuracy_score(y_train, pipe.predict(X_train_df)))
print(accuracy_score(y_test, pipe.predict(X_test_df)))#cannot as the X is not being modified???

(100, 4)
(100, 8)
(100, 4)
(100, 8)
0.98
(50, 4)
(50, 8)
0.98


In [52]:
X_train_df.shape



(100, 4)