# ML Pipeline (Putting it all together)
----
- Sequentially apply a list of transforms( remove NaNs,Convert into standard format,imputer..etc) and a final estimator(ML algorithm)
- Purpose of the pipeline is to **assemble** several steps in one order



## Step 1: Import necessary modules

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [3]:
Pipeline?

## Step 2: Import Data

In [4]:
df = pd.read_csv("house-votes-84.csv")
df.head()

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [3]:
df=df.replace('y', 1)
df.head()

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,n,1,n,1,1,1,n,n,n,1,?,1,1,1,n,1
1,republican,n,1,n,1,1,1,n,n,n,n,n,1,1,1,n,?
2,democrat,?,1,1,?,1,1,n,n,n,n,1,n,1,1,n,n
3,democrat,n,1,1,n,?,1,n,n,n,n,1,n,1,n,n,1
4,democrat,1,1,1,n,1,1,n,n,n,n,1,?,1,1,1,1


In [0]:
df=df.replace('n', 0)
df.head()

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,0,1,0,1,1,1,0,0,0,1,?,1,1,1,0,1
1,republican,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0,?
2,democrat,?,1,1,?,1,1,0,0,0,0,1,0,1,1,0,0
3,democrat,0,1,1,0,?,1,0,0,0,0,1,0,1,0,0,1
4,democrat,1,1,1,0,1,1,0,0,0,0,1,?,1,1,1,1


In [0]:
df[df == '?'] = np.nan

In [0]:
df.head()

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,0.0,1,0,1.0,1.0,1,0,0,0,1,,1.0,1,1,0,1.0
1,republican,0.0,1,0,1.0,1.0,1,0,0,0,0,0.0,1.0,1,1,0,
2,democrat,,1,1,,1.0,1,0,0,0,0,1.0,0.0,1,1,0,0.0
3,democrat,0.0,1,1,0.0,,1,0,0,0,0,1.0,0.0,1,0,0,1.0
4,democrat,1.0,1,1,0.0,1.0,1,0,0,0,0,1.0,,1,1,1,1.0


In [0]:
y = df['Class Name'].values
type(y)

numpy.ndarray

In [0]:
X = (df[df.columns[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]]].values)
type(X)

numpy.ndarray

In [0]:
X.shape

(435, 16)

In [0]:
X[:5]

array([[0, 1, 0, 1, 1, 1, 0, 0, 0, 1, nan, 1, 1, 1, 0, 1],
       [0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, nan],
       [nan, 1, 1, nan, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 1, 1, 0, nan, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1],
       [1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, nan, 1, 1, 1, 1]], dtype=object)

## Step 3: Divide into train and test

In [0]:
# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

## Step 4: Create Pipeline and fit the model

In [2]:
Imputer?

In [0]:
# Setup the pipeline steps: steps
steps = [('imputation', Imputer(missing_values='NaN', strategy='most_frequent', axis=0)),
        ('SVM', SVC())]

In [0]:
# Create the pipeline: pipeline
pipeline = Pipeline(steps)

In [0]:
# Fit the pipeline to the train set
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('imputation', Imputer(axis=0, copy=True, missing_values='NaN', strategy='most_frequent',
    verbose=0)), ('SVM', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

## Step5: Predict Pipeline

In [0]:
# Predict the labels of the test set
y_pred = pipeline.predict(X_test)

In [0]:
y_pred[:5]

array(['democrat', 'democrat', 'republican', 'republican', 'republican'],
      dtype=object)

## Step 6: Observe Metrics

In [0]:
accuracy_score(y_test,y_pred)

0.9694656488549618

In [0]:
confusion_matrix(y_test,y_pred)

array([[82,  3],
       [ 1, 45]], dtype=int64)

In [0]:
# Compute metrics
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

   democrat       0.99      0.96      0.98        85
 republican       0.94      0.98      0.96        46

avg / total       0.97      0.97      0.97       131



https://towardsdatascience.com/a-simple-example-of-pipeline-in-machine-learning-with-scikit-learn-e726ffbb6976