# Selecting Best Model When Preprocessing.

Include preprocessing steps, together with their own parameters, in the model selection search.

# Feature Union

`FeatureUnion` allows us to combine multiple preprocessing actions into a single step that can be placed in a pipeline.

In [34]:
# Load libraries

import numpy as np
from sklearn import datasets
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [35]:
# Seed NumPy's global random number generator so repeated runs start from the same state

np.random.seed(seed=0)

In [36]:
# Fetch the iris dataset bundled with scikit-learn

iris = datasets.load_iris()

# Split the loaded bunch into the feature matrix and the target vector

features, target = iris.data, iris.target

In [37]:
# Build a preprocessing step that runs StandardScaler and PCA on the same input
# and combines their outputs through a FeatureUnion

preprocess = FeatureUnion(
    transformer_list=[
        ("std", StandardScaler()),
        ("pca", PCA()),
    ]
)

In [38]:
# Chain the combined preprocessing step with a logistic regression classifier.
# The step names ("preprocess", "classifier") are the prefixes used in the search-space keys.
pipe = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("classifier", LogisticRegression(max_iter=1000, solver="liblinear")),
    ]
)

Some preprocessing methods have their own parameters, which often have to be supplied by the user.

Scikit-learn makes this easy by letting us include candidate values for a preprocessing component in the search space. They are treated like any other hyperparameter to be searched over.

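`preprocess__pca__n_components: [1, 2, 3]` searches over 1, 2, and 3 principal components to discover which is the best option for PCA. The key is built from the pipeline step name (`preprocess`), the transformer name inside the FeatureUnion (`pca`), and the parameter name (`n_components`), joined by double underscores.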
    




In [39]:
# Candidate values to search over: the number of PCA components inside the
# preprocessing step, plus the penalty type and C of the logistic regression

search_space = [
    dict(
        preprocess__pca__n_components=[1, 2, 3],
        classifier__penalty=["l1", "l2"],
        classifier__C=np.logspace(start=0, stop=4, num=10),
    )
]

# Display the search space

search_space

[{'preprocess__pca__n_components': [1, 2, 3],
  'classifier__penalty': ['l1', 'l2'],
  'classifier__C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
         5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
         3.59381366e+03, 1.00000000e+04])}]

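Grid search uses cross-validation to determine which model has the highest performance. However, we can't preprocess the whole dataset first and then run the grid search, because the preprocessing would then be fitted on data from the held-out folds as well. That's why the preprocessing (the FeatureUnion) is placed inside the pipeline that is searched over: it is fitted only on the training portion of each fold.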

In [40]:
# Set up a 5-fold cross-validated grid search over the pipeline and the search space

gridsearch = GridSearchCV(
    estimator=pipe,
    param_grid=search_space,
    cv=5,
    verbose=0,
)
gridsearch

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocess',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('std',
                                                                        StandardScaler(copy=True,
                                                                                       with_mean=True,
                                                                                       with_std=True)),
                                                                       ('pca',
                                                                        PCA(copy=True,
                                                                            iterated_power='auto',
                                                                            n_components=None,
                                                       

In [41]:
# Run the grid search on the iris features and target.
# fit() returns the search object itself, so best_model refers to the fitted gridsearch.

best_model = gridsearch.fit(X=features, y=target)

After the search is completed, we can use `best_estimator_` to view the best model's learning algorithm and hyperparameters.

In [42]:
# Show the number of PCA components selected by the search.
# Note: despite the "Best Model" label, the printed value is only the
# preprocess__pca__n_components hyperparameter of the best estimator.

print(
    "Best Model: ",
    best_model.best_estimator_.get_params()["preprocess__pca__n_components"],
)


Best Model:  1
