In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

  return f(*args, **kwds)


In [2]:
iris_df = load_iris()

In [4]:
type(iris_df)

sklearn.utils.Bunch

In [5]:
# split the data in train and test 
X_train, X_test, y_train, y_test = train_test_split(iris_df.data,
                                                   iris_df.target, test_size = 0.3, random_state=0)

In [6]:
## Pipelines creation
# 1. Data Preprocessing by using standard Scaler
# 2. Reduce Dimension using PCA
# 3. Apply Classifier

In [11]:
pipeline_lr = Pipeline([('scalar1', StandardScaler()),
                        ('pca1', PCA(n_components=2)),
                        ('lr_classifier', LogisticRegression(random_state=0))])


                         

In [8]:
pipeline_dt=Pipeline([('scalar2', StandardScaler()),
                     ('pca2', PCA(n_components=2)),
                     ('dt_classifier', DecisionTreeClassifier())])


In [9]:
pipeline_ramdonforest=Pipeline([('scalar3', StandardScaler()),
                               ('pca3', PCA(n_components=2)),
                               ('rf_classifier', RandomForestClassifier())])

In [12]:
## LEts make the list of pipelines
pipelines = [pipeline_lr, pipeline_dt, pipeline_ramdonforest]

In [10]:
best_accuracy = 0.0
best_classifier= 0
best_pipeline =""

In [21]:
# Dictionary of Pipelines and classifier types for ease of reference
pipe_dict= {0 : 'Logistic Regression',
             1 : 'Decision Tree',
             2 : 'RandomForest'}

# fit the pipelines.
for pipe in pipelines:
    pipe.fit(X_train, y_train)




In [22]:
for i, model, in enumerate(pipelines):
    print("{} Test Accuracy : {}".format(pipe_dict[i], model.score(X_test, y_test)))

Logistic Regression Test Accuracy : 0.8666666666666667
Decision Tree Test Accuracy : 0.9111111111111111
RandomForest Test Accuracy : 0.9111111111111111


In [23]:
for i, model in enumerate(pipelines):
    if model.score(X_test, y_test) > best_accuracy:
        best_accuracy = model.score(X_test, y_test)
        best_pipeline = model
        best_classifier = i

print('Classifier with best accuracy : {}'.format(pipe_dict[i]))
print('best Score is : {}'.format(best_accuracy))
    

Classifier with best accuracy : RandomForest
best Score is : 0.9111111111111111


## Pipelines Perform Hyperparameter Tuning Using Grid SearchCV

In [26]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [30]:
# Create a pipeline
pipe = Pipeline([("classifier", RandomForestClassifier())])

#Create dictionary with candidate learning algorithms and their hyperparameter

grid_param = [
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2','l1'],
                 "classifier__C": np.logspace(0, 4, 10)
                 },
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2'],
                 "classifier__C": np.logspace(0, 4, 10),
                 "classifier__solver":['newton-cg','saga','sag','liblinear'] ##This solvers don't allow L1 penalty
                 },
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [10, 100, 1000],
                 "classifier__max_depth":[5,8,15,25,30,None],
                 "classifier__min_samples_leaf":[1,2,5,10,15,100],
                 "classifier__max_leaf_nodes": [2, 5,10]}]

In [31]:
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0, n_jobs= -1)
best_model = gridsearch.fit(X_train, y_train)



In [32]:
print(best_model.best_estimator_)
print("The mean accuracy of the model is : ", best_model.score(X_test, y_test))

Pipeline(memory=None,
     steps=[('classifier', LogisticRegression(C=59.94842503189409, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False))])
The mean accuracy of the model is :  0.9555555555555556


## Make Pipelines in SKLearn

In [37]:
from sklearn.pipeline import make_pipeline

In [38]:
# Create a pipeline
pipe = make_pipeline((RandomForestClassifier()))

#Create dictionary with candidate learning algorithm
#and their hyperparameters

grid_param = [ 
    {"randomforestclassifier" : [RandomForestClassifier()],
     "randomforestclassifier__n_estimators" :[10, 100, 1000],
     "randomforestclassifier__max_depth": [5, 8, 15, 25, 30, None],
     "randomforestclassifier__max_leaf_nodes" : [2, 5, 10]}]

In [40]:
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0, n_jobs=-1)
best_model = gridsearch.fit(X_train, y_train)

In [41]:
best_model.score(X_test, y_test)

0.9777777777777777