# Managing ML Workflow with the help of Pipelines

In [7]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")
import pickle   #### Store Models 


In [2]:
#iris.target

In [15]:
# Load the iris dataset and hold out 20% of the samples as a test set.
# random_state is pinned so the split is the same on every run.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)
print(X_train.shape, X_test.shape, sep="\n")



(120, 4)
(30, 4)


In [16]:
# Shapes of the label vectors for the train and test splits.
print(y_train.shape, y_test.shape, sep="\n")

(120,)
(30,)


In [17]:
# Preview only the first five samples; displaying the full 150x4 array floods the output.
iris.data[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [18]:
# Standalone PCA demo: learn 3 principal components from the training
# features only, then project the held-out test features onto them.
# (PCA is already imported in the notebook's top import cell.)
pca = PCA(n_components=3)
pca.fit(X_train)

x_pca = pca.transform(X_test)



In [20]:
# Peek at the first five test samples after projection onto the PCA components.
x_pca[:5]

array([[ 0.96873884, -0.16191895, -0.23967777],
       [-2.15827783,  0.87651712, -0.12969891],
       [ 3.84073027,  0.29102786, -0.51472332],
       [ 0.85922818, -0.14689823,  0.02494165],
       [ 1.37724773,  0.26723591, -0.52818204]])

In [21]:
# Fraction of the total variance attributed to each of the 3 fitted components.
pca.explained_variance_ratio_  

array([0.91959926, 0.05714377, 0.01838378])

In [22]:
# Component loadings: one row per principal component, one column per input feature (3 x 4 here).
pca.components_

array([[ 0.35922175, -0.08867716,  0.85800365,  0.35626652],
       [ 0.66072216,  0.72708635, -0.16688652, -0.08331002],
       [-0.5781132 ,  0.60569412,  0.08006791,  0.54084091]])

## Pipeline


<img src="https://khashtamov.com/uploads/data-pipeline-schema.jpg" alt="Pipeline">

Image source: https://khashtamov.com/uploads/data-pipeline-schema.jpg

In [23]:

# Construct a pipeline that chains three stages into one estimator:
#   StandardScaler --> PCA (2 components) --> LogisticRegression classifier
pipe_lr = Pipeline(
    steps=[
        ("scl", StandardScaler()),
        ("pca", PCA(n_components=2)),
        ("clf", LogisticRegression(random_state=42)),
    ]
)

# Display the assembled pipeline.
pipe_lr

Pipeline(steps=[('scl', StandardScaler()), ('pca', PCA(n_components=2)),
                ('clf', LogisticRegression(random_state=42))])

In [25]:
### Fit the pipeline object and evaluate its accuracy

# Train the whole pipeline on the training split.
pipe_lr.fit(X_train, y_train)

# Compare the held-out labels against the pipeline's predictions.
lr_test_predictions = pipe_lr.predict(X_test)
print("Accuracy of LR:", accuracy_score(y_test, lr_test_predictions))
      
      

Accuracy of LR: 0.9


#### In-Class Assignment

In [33]:
# Preprocessing-only pipeline: standardise the features, then keep 2 principal components.
pipe_transform = Pipeline(
    steps=[
        ("scl", StandardScaler()),
        ("pca", PCA(n_components=2)),
    ]
)

# Model-only pipeline wrapping a single SVC stage.
pipe_model = Pipeline(steps=[("clf", SVC(random_state=42))])

In [35]:
# Plain Python list holding the preprocessing pipeline and a freshly built
# SVC-only pipeline (a new object, not `pipe_model`).
# NOTE(review): a list does not chain the two stages into one estimator, and
# no later cell shown here uses this variable - confirm whether it is still needed.
pipeline_model_building = [pipe_transform,Pipeline([("clf",SVC(random_state=42))])]

In [26]:
# Class assignment
## Build Pipeline objects for the following chains:
## 1. StandardScaler --> PCA --> SVC
## 2. StandardScaler --> PCA --> DecisionTreeClassifier


def _scaled_pca_pipeline(classifier):
    """Return a Pipeline: StandardScaler -> PCA(2 components) -> ``classifier``.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn classifier used as the final ``"clf"`` step.

    Returns
    -------
    Pipeline
        Unfitted pipeline with steps named ``"scl"``, ``"pca"`` and ``"clf"``.
    """
    return Pipeline([
        ("scl", StandardScaler()),
        ("pca", PCA(n_components=2)),
        ("clf", classifier),
    ])


# Both pipelines share the same preprocessing and differ only in the classifier.
pipe_svm = _scaled_pca_pipeline(SVC(random_state=42))
pipe_dt = _scaled_pca_pipeline(DecisionTreeClassifier(random_state=42))



In [29]:
# Display the configured steps of the SVC pipeline.
pipe_svm

Pipeline(steps=[('scl', StandardScaler()), ('pca', PCA(n_components=2)),
                ('clf', SVC(random_state=42))])

#### Iterate through pipelines

In [36]:
# Collect the three pipelines so they can be processed in a single loop.
pipelines = [pipe_lr, pipe_svm, pipe_dt]

# Human-readable classifier names, keyed by each pipeline's position in `pipelines`.
pipe_dict = dict(enumerate(['Logistic Regression', 'Support Vector Machine', 'Decision Tree']))

# Train every pipeline on the same training split.
for pipe in pipelines:
    pipe.fit(X_train, y_train)


In [38]:
# Show each pipeline next to its position in the `pipelines` list.
for idx, val in enumerate(pipelines):
    print(f"{idx} {val}")

0 Pipeline(steps=[('scl', StandardScaler()), ('pca', PCA(n_components=2)),
                ('clf', LogisticRegression(random_state=42))])
1 Pipeline(steps=[('scl', StandardScaler()), ('pca', PCA(n_components=2)),
                ('clf', SVC(random_state=42))])
2 Pipeline(steps=[('scl', StandardScaler()), ('pca', PCA(n_components=2)),
                ('clf', DecisionTreeClassifier(random_state=42))])


In [40]:

# Compare test accuracy: predict with each pipeline, then score the
# predictions against the true labels with accuracy_score.
for idx, val in enumerate(pipelines):
    test_accuracy = accuracy_score(y_test, val.predict(X_test))
    print('%s pipeline test accuracy: %.3f' % (pipe_dict[idx], test_accuracy))

Logistic Regression pipeline test accuracy: 0.900
Support Vector Machine pipeline test accuracy: 0.900
Decision Tree pipeline test accuracy: 0.867


In [42]:
# Compare test accuracy via each pipeline's own score(X, y) method
# (an alternative to calling accuracy_score on explicit predictions).
for idx, val in enumerate(pipelines):
    test_accuracy = val.score(X_test, y_test)
    print('%s pipeline test accuracy: %.3f' % (pipe_dict[idx], test_accuracy))

Logistic Regression pipeline test accuracy: 0.900
Support Vector Machine pipeline test accuracy: 0.900
Decision Tree pipeline test accuracy: 0.867


In [41]:
### Test-set score (accuracy) for the decision-tree pipeline alone


pipe_dt.score(X_test, y_test)

0.8666666666666667

# Saving Models

In [43]:
## Create the model and save it for future use.
# (pickle is already imported in the notebook's top import cell.)
# `fit` returns the fitted pipeline itself, so `model` is the trained pipe_lr.
model = pipe_lr.fit(X_train, y_train)

### Save the model using pickle.
# The `with` block guarantees the file is flushed and closed once the dump
# finishes, even if pickling raises.
with open("model.pkl", "wb") as model_file:   # "wb" = write binary
    pickle.dump(model, model_file)

In [44]:
## Load the saved model and predict on new data.

# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising;
# only load pickle files you created yourself or otherwise trust.
# The `with` block closes the file handle as soon as loading finishes.
with open("model.pkl", "rb") as model_file:   # "rb" = read binary
    model_loaded = pickle.load(model_file)

print("Prediction on X_test: ", model_loaded.predict(X_test))

Prediction on X_test:  [1 0 2 1 2 0 1 2 2 1 2 0 0 0 0 1 2 1 1 2 0 1 0 2 2 2 2 2 0 0]
