# Pipeline

## Libraries

In [374]:
#general use and reading in the data
import os
import numpy as np
import pandas as pd

#train test split
from sklearn.model_selection import train_test_split

#pipeline
from sklearn.pipeline import Pipeline

#transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

#GridSearchCV
from sklearn.model_selection import GridSearchCV

#models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

## 1. Reading in the data

In [375]:
# Build the CSV location from the directory the kernel is running in and load
# it. Note that `data` holds the file *path*; the loaded table lives in `df`.
pwd = os.getcwd()
data = os.path.join(pwd, "data.csv")
df = pd.read_csv(filepath_or_buffer=data)

## 2. Features and Target Selection

In [376]:
# Both selections index with a *list* of column names, so both results are
# DataFrames (the target is a single-column frame rather than a Series).
feature_columns = ["Pclass", "Sex", "Fare"]
target_columns = ["Survived"]

features = df[feature_columns]
target = df[target_columns]

## 3. Splitting Data

In [377]:
# Stratify on the target so the train and test partitions keep the same class
# proportions; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    stratify=target,
    random_state=0,
)

## 4. Preprocessing and Pipeline

- Pipeline: during the call to *Pipeline.fit*, the pipeline calls *fit* and then *transform* on each step in turn, with the input given by the output of the *transform* method of the previous step.
- For the last step in the pipeline, only *fit* is called.

https://towardsdatascience.com/pipeline-columntransformer-and-featureunion-explained-f5491f815f

https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [378]:
#preprocessing
#step 1: define categorical and numerical columns
categorical = ["Pclass", "Sex"]
numerical = ["Fare"]

#step 2: create pipeline for grouped columns
#The OneHotEncoder keyword that requests a dense array was renamed from
#`sparse` to `sparse_output` in scikit-learn 1.2, and the old name was removed
#in 1.4. Use whichever name the installed version accepts so this cell runs on
#both old and current releases instead of raising a TypeError.
_dense_kwarg = (
    "sparse_output" if "sparse_output" in OneHotEncoder().get_params() else "sparse"
)

#categorical pipeline: one-hot encode the categorical columns into a dense
#array; handle_unknown="ignore" keeps transform from failing on categories
#that were not present when the encoder was fitted
cat_pipe = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", **{_dense_kwarg: False}))
])

#numerical pipeline: standardise the numerical columns
num_pipe = Pipeline([
    ("scaler", StandardScaler())
])

#columntransformer, binding grouped columns together: each sub-pipeline is
#applied only to its own list of columns
preprocessor = ColumnTransformer(
    transformers=[
        ("cat_pipe", cat_pipe, categorical),
        ("num_pipe", num_pipe, numerical)
    ]
)

#pipeline: preprocessing followed by the estimator. The step names
#("preprocessor", "classifier") are the prefixes used to address these steps
#in the grid-search parameter grid. SVC() is only the default classifier; the
#grid search swaps in other estimators for the "classifier" step.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", SVC())
    ]
)

## 4.1 Accessing steps in pipeline

In [389]:
# `steps` is the list of (name, estimator) tuples the pipeline was built from.
pipe.steps

[('preprocessor',
  ColumnTransformer(transformers=[('cat_pipe',
                                   Pipeline(steps=[('encoder',
                                                    OneHotEncoder(handle_unknown='ignore',
                                                                  sparse=False))]),
                                   ['Pclass', 'Sex']),
                                  ('num_pipe',
                                   Pipeline(steps=[('scaler', StandardScaler())]),
                                   ['Fare'])])),
 ('classifier', SVC())]

In [392]:
# Look the step up by name; this returns the same object as the positional
# form pipe.steps[0][1] but does not depend on the order of the steps.
pipe.named_steps["preprocessor"]

ColumnTransformer(transformers=[('cat_pipe',
                                 Pipeline(steps=[('encoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 ['Pclass', 'Sex']),
                                ('num_pipe',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['Fare'])])

In [379]:
# pipe.fit(X_train, np.ravel(y_train))
# pipe.score(X_test, y_test)

## 5. Grid Search CV
1. parameters
2. GridSearchCV
3. .fit, .best_params_, .best_score_, .score(X_test, y_test)

In [380]:
# Candidate values shared between the grids below.
c_values = [0.001, 0.01, 0.1, 1, 10, 100]
gamma_values = [0.001, 0.01, 0.1, 1, 10, 100]
neighbor_counts = list(range(1, 16))  # 1 through 15 inclusive

# One sub-grid per model. Putting an estimator under the "classifier" key makes
# the search replace that pipeline step; "classifier__<param>" keys then tune
# the hyper-parameters of whichever estimator is plugged in.

#svc
svc_grid = {
    "classifier": [SVC()],
    "preprocessor": [preprocessor],
    "classifier__C": c_values,
    "classifier__gamma": gamma_values,
}

#knn
knn_grid = {
    "classifier": [KNeighborsClassifier()],
    "preprocessor": [preprocessor],
    "classifier__n_neighbors": neighbor_counts,
}

#logreg
logreg_grid = {
    "classifier": [LogisticRegression()],
    "preprocessor": [preprocessor],
    "classifier__C": c_values,
}

param_grid = [svc_grid, knn_grid, logreg_grid]

In [381]:
# 5-fold cross-validated search over every candidate described by param_grid.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

# y_train is a single-column DataFrame; flatten it to a 1-D array before fitting.
grid.fit(X_train, y_train.values.ravel())

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat_pipe',
                                                                         Pipeline(steps=[('encoder',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         ['Pclass',
                                                                          'Sex']),
                                                                        ('num_pipe',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                         

In [398]:
# Summarise the search: score on the held-out test set, the best mean
# cross-validation score, and the winning configuration.
# The test-set score now uses "{:.5f}" (five decimals) to match the
# cross-validation line; the previous "{:5f}" set a minimum *width* of 5 and
# fell back to the default six decimals.
print(
    "Test-set score: {:.5f}\n".format(grid.score(X_test, y_test))
)
print(
    "Best cross-validation score: {:.5f}".format(grid.best_score_)
)
print(
    "Best params:\n{}\n".format(grid.best_params_)
)
print(
    "Best best_estimator_: {}".format(grid.best_estimator_)
)

Test-set score: 0.780269

Best cross-validaiton score: 0.81430
Best params:
{'classifier': SVC(C=100, gamma=0.1), 'classifier__C': 100, 'classifier__gamma': 0.1, 'preprocessor': ColumnTransformer(transformers=[('cat_pipe',
                                 Pipeline(steps=[('encoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 ['Pclass', 'Sex']),
                                ('num_pipe',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['Fare'])])}

Best best_estimator_: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat_pipe',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                  