# Sklearn data pipelines
---

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris dataset; the feature matrix and class labels are read
# from its `.data` and `.target` attributes below.
iris = load_iris()

# Hold out 20% of the samples as a test set. The fixed seed makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

### Creation of Pipeline
#### Inside our pipeline
1. Data preprocessing using StandardScaler
2. Dimensionality reduction using PCA
3. Classification

In [2]:
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
# 1st classification algorithm (Decision Tree)
# Steps run in order: standardise the features, project them onto the first
# two principal components, then classify with a decision tree.
pipeline_dt = Pipeline(
    [
        ("scalar2", StandardScaler()),
        ("pca2", PCA(n_components=2)),
        # The tree builder permutes features at every split, so equally good
        # splits are broken randomly; seed it (same value as the train/test
        # split) so the fitted tree and its score are reproducible.
        ("dt_classifier", DecisionTreeClassifier(random_state=42)),
    ]
)

In [4]:
# 2nd classification algorithm (KNN)
# Same preprocessing as the decision-tree pipeline (scaling, then a
# 2-component PCA), followed by a k-nearest-neighbours classifier with its
# default settings.
pipeline_knn = Pipeline(
    steps=[
        ("scalar1", StandardScaler()),
        ("pca1", PCA(n_components=2)),
        ("knn_classifier", KNeighborsClassifier()),
    ],
)

In [5]:
# Collect the pipelines in a list so they can be fitted and scored in a loop.
pipelines = [
    pipeline_dt,
    pipeline_knn,
]

# Human-readable label for each pipeline, keyed by its position in
# `pipelines` (0 -> decision tree, 1 -> KNN).
pipe_dict = dict(enumerate(["Decision Tree", "KNN"]))

In [6]:
# Fit every pipeline on the training split only; the test split stays unseen
# until scoring.
for pipe in pipelines:
    pipe.fit(X=X_train, y=y_train)

In [7]:
# Report each fitted pipeline's score on the held-out test split, labelled
# with the name stored under the same index in `pipe_dict`.
for i, model in enumerate(pipelines):
    label = pipe_dict[i]
    accuracy = model.score(X_test, y_test)
    print(label, "Test Accuracy: ", accuracy)

Decision Tree Test Accuracy:  0.9
KNN Test Accuracy:  0.9333333333333333
