# Automate Machine Learning Workflows with Pipelines in Python and scikit-learn

Author: Jason Brownlee

Article from [machinelearningmastery](https://machinelearningmastery.com/automate-machine-learning-workflows-pipelines-python-scikit-learn/).

> Note: This notebook is my study of the article mentioned above. The code may differ from the original where I changed it while working through the examples.

# Libraries

In [12]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

# Pipeline 1: Data Preparation and Modeling

## Create a pipeline that standardizes the data then creates a model

In [2]:
# Column labels are supplied explicitly via `names` when reading the CSV.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
dataframe = read_csv(url, names=names)

# Split the raw values: the first eight columns are the inputs,
# the ninth column ('class') is the target.
array = dataframe.values
X, Y = array[:, :8], array[:, 8]

## Create pipeline

In [4]:
# Named (label, estimator) steps, applied in order:
# standardize the inputs, then fit the LDA classifier.
estimators = [
    ('standardize', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis()),
]
model = Pipeline(steps=estimators)
model

## Evaluate pipeline

In [10]:
# Fixed seed so the shuffled 10-fold split is the same on every run.
seed = 7
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# Score the whole pipeline on each fold; print the mean, then show per-fold scores.
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)
print(results.mean())
results

0.7669685577580315


array([0.80519481, 0.75324675, 0.71428571, 0.79220779, 0.79220779,
       0.77922078, 0.66233766, 0.80519481, 0.82894737, 0.73684211])

# Pipeline 2: Feature Extraction and Modeling

## Create a pipeline that extracts features from the data then creates a model

In [13]:
# Read the CSV again for this section; column labels come from `names`.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
dataframe = read_csv(url, names=names)

# Quick look at the first rows before splitting.
print(dataframe.head())

# First eight columns are the inputs, the ninth column ('class') is the target.
array = dataframe.values
X, Y = array[:, :8], array[:, 8]

   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
1     1    85    66    29     0  26.6  0.351   31      0
2     8   183    64     0     0  23.3  0.672   32      1
3     1    89    66    23    94  28.1  0.167   21      0
4     0   137    40    35   168  43.1  2.288   33      1


## Create feature union

In [16]:
# Two feature extractors combined by the union:
# three PCA components plus the six columns chosen by SelectKBest.
features = [
    ('pca', PCA(n_components=3)),
    ('select_best', SelectKBest(k=6)),
]
feature_union = FeatureUnion(transformer_list=features)
feature_union

## Create pipeline

In [20]:
# Named steps, applied in order: the combined feature extractors,
# then a logistic regression classifier.
estimators = [
    ('feature_union', feature_union),
    ('logistic', LogisticRegression(max_iter=10000)),
]
model = Pipeline(steps=estimators)
model

## Evaluate pipeline

In [21]:
# Fixed seed so the shuffled 10-fold split is the same on every run.
seed = 7
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# Score the whole pipeline on each fold; print the mean, then show per-fold scores.
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)
print(results.mean())
results

0.7721633629528366


array([0.83116883, 0.74025974, 0.74025974, 0.80519481, 0.79220779,
       0.77922078, 0.66233766, 0.80519481, 0.82894737, 0.73684211])