## In this practice notebook, we will learn how to implement scikit-learn pipelines

   
   * Pipelines are very handy when it comes to deploying models to production

## Import the required libraries

In [None]:
!python -m pip install pip --upgrade --user -q
!python -m pip install numpy pandas seaborn matplotlib scipy statsmodels sklearn scikit-image --user -q

In [None]:
import IPython

# Restart the kernel so the packages installed above are picked up on import.
# The `True` argument requests a restart rather than a plain shutdown.
# Note: this interrupts "Run All"; continue from the next cell after the restart.
kernel_app = IPython.Application.instance()
kernel_app.kernel.do_shutdown(True)

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# load_iris() returns a dict-like Bunch (not a DataFrame, despite the name);
# the cells below read its feature matrix via `.data` and its labels via `.target`.
iris_df = load_iris()

In [None]:
# Dimensions of the feature matrix: (number of samples, number of features).
iris_df["data"].shape

In [None]:
# Hold out 30% of the samples for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    iris_df.data,
    iris_df.target,
    test_size=0.3,
    random_state=0,
)


## Build a pipeline

In [None]:
# Three chained steps: standardise features -> keep 2 principal components
# -> logistic regression. Each step is a (name, estimator) pair.
pipeline_lr = Pipeline(
    steps=[
        ("scalar1", StandardScaler()),
        ("pca1", PCA(n_components=2)),
        ("lr_classifier", LogisticRegression(random_state=0)),
    ]
)

In [None]:
# Fit every step of the pipeline on the training split.
# Pipeline.fit returns the pipeline itself, so `model` is the same fitted
# object as `pipeline_lr`.
pipeline_lr.fit(X_train, y_train)
model = pipeline_lr

In [None]:
# Accuracy of the fitted pipeline on the held-out test split.
model.score(X_test, y_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Same preprocessing as the other pipelines (scaling + 2-component PCA),
# with a k-nearest-neighbours classifier using its default settings.
pipeline_knn = Pipeline(
    steps=[
        ("scalar4", StandardScaler()),
        ("pca4", PCA(n_components=2)),
        ("knn_classifier", KNeighborsClassifier()),
    ]
)

In [None]:
from sklearn import svm

# Scaling + 2-component PCA feeding a support vector classifier.
pipe_svm = Pipeline(
    steps=[
        ("scl", StandardScaler()),
        ("pca", PCA(n_components=2)),
        ("clf", svm.SVC(random_state=42)),
    ]
)

In [None]:
# Scaling + 2-component PCA feeding a decision tree with default settings.
pipeline_dt = Pipeline(
    steps=[
        ("scalar2", StandardScaler()),
        ("pca2", PCA(n_components=2)),
        ("dt_classifier", DecisionTreeClassifier()),
    ]
)

In [None]:
# Pipelines to compare. Their order must line up with the labels below,
# because names are looked up by list position.
pipelines = [pipeline_lr, pipeline_dt, pipe_svm, pipeline_knn]

# Position in `pipelines` -> human-readable name used when reporting results.
pipe_dict = dict(enumerate(['Logistic Regression', 'Decision Tree', 'SVM', 'KNN']))

# Train each pipeline on the same training split.
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [None]:
# Print the held-out accuracy of every fitted pipeline, labelled via pipe_dict.
for i, model in enumerate(pipelines):
    accuracy = model.score(X_test, y_test)
    print("{} Test Accuracy: {}".format(pipe_dict[i], accuracy))

## Implement Grid Search with Pipeline

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Single-step pipeline. The estimator given here is only a placeholder:
# every grid below replaces the "classifier" step with its own candidate.
pipe = Pipeline([("classifier", RandomForestClassifier())])

# Candidate learning algorithms and their hyperparameters.
# Keys use the "<step name>__<parameter>" convention to reach into the pipeline.
#
# LogisticRegression solver/penalty compatibility:
#   * 'liblinear' and 'saga' support both the L1 and the L2 penalty;
#   * 'lbfgs' (the default), 'newton-cg' and 'sag' support the L2 penalty only.
# Pairing 'l1' with the default solver makes every such fit fail, so the
# solver is always spelled out next to the penalties it can handle.
# NOTE(review): recent scikit-learn releases may warn about or reject
# 'liblinear' on multiclass targets such as iris - confirm against the
# installed version.
grid_param = [{"classifier": [LogisticRegression()],
               "classifier__penalty": ['l1', 'l2'],
               "classifier__C": np.logspace(0, 4, 10),
               "classifier__solver": ['liblinear', 'saga']  # solvers that accept L1 as well as L2
              },
              {"classifier": [LogisticRegression()],
               "classifier__penalty": ['l2'],
               "classifier__C": np.logspace(0, 4, 10),
               "classifier__solver": ['lbfgs', 'newton-cg', 'sag']  # these solvers only allow the L2 penalty
              },
              # 3 * 6 * 6 * 3 = 324 forest configurations, each cross-validated 5 times.
              # The forest is seeded so repeated runs select the same configuration.
              {"classifier": [RandomForestClassifier(random_state=0)],
               "classifier__n_estimators": [10, 100, 1000],
               "classifier__max_depth": [5, 8, 15, 25, 30, None],
               "classifier__min_samples_leaf": [1, 2, 5, 10, 15, 100],
               "classifier__max_leaf_nodes": [2, 5, 10]}]

# Create a grid search over the pipeline (5-fold CV, all available cores) ...
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0, n_jobs=-1)
# ... then fit it; the best configuration is refit on the full training split.
best_model = gridsearch.fit(X_train, y_train)

In [None]:
# Show the winning pipeline (estimator plus the hyperparameters that were selected).
print(best_model.best_estimator_)

# Score the refit best estimator on the held-out test split.
test_accuracy = best_model.score(X_test, y_test)
print("\n\n The mean accuracy of the model is:", test_accuracy)

In [None]:
from sklearn.pipeline import make_pipeline

# make_pipeline names each step after the lowercased class name of its
# estimator, so the forest is addressed as "randomforestclassifier" below.
pipe = make_pipeline(RandomForestClassifier())

# Hyperparameter grid for the random forest step,
# using the "<step name>__<parameter>" key convention.
grid_param = [
    {
        "randomforestclassifier": [RandomForestClassifier()],
        "randomforestclassifier__n_estimators": [10, 100, 1000],
        "randomforestclassifier__max_depth": [5, 8, 15, 25, 30, None],
        "randomforestclassifier__min_samples_leaf": [1, 2, 5, 10, 15, 100],
        "randomforestclassifier__max_leaf_nodes": [2, 5, 10],
    }
]

# Build the 5-fold grid search over the pipeline (all cores), then fit it.
gridsearch = GridSearchCV(
    estimator=pipe,
    param_grid=grid_param,
    cv=5,
    verbose=0,
    n_jobs=-1,
)
best_model = gridsearch.fit(X_train, y_train)

# Accuracy of the refit best estimator on the held-out test split.
best_model.score(X_test, y_test)