# Pipeline

https://towardsdatascience.com/pipeline-columntransformer-and-featureunion-explained-f5491f815f

https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

## Libraries

In [233]:
#general use and reading in the data
import os
import numpy as np
import pandas as pd

#spliting data
from sklearn.model_selection import train_test_split

#pipeline
from sklearn.pipeline import Pipeline

#transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

#Feature Union
from sklearn.pipeline import FeatureUnion


#preprocessing

#models
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

#GridSearchCV
from sklearn.model_selection import GridSearchCV


## 1. Reading in the data
## 2. Features and target selection

In [234]:
pwd = os.getcwd()
data = os.path.join(pwd, "data.csv")
df = pd.read_csv(data)


In [235]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## 3. X, y selection and Spliting Data

In [236]:
features = df[["Pclass", "Sex", "Fare"]]
target = df[["Survived"]]

X_train, X_test, y_train, y_test = train_test_split(features,target, stratify=target)

## 4. Preprocessing and Pipeline

- Pipeline: during the call to *Pipline.fit*, the pipeline calls *fit* and then *transform* on each step in turn, with the input given by the output of the *transform* method of the previous step.
- for the last step in the pipeline. just *fit* is called

In [237]:
categorical = ["Pclass", "Sex"]
numerical = ["Fare"]

#categorical pipeline
cat_pipe = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse=False))
])

#numerical pipeline
num_pipe = Pipeline([
    ("scaler", StandardScaler())
])

#ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("cat_pipe", cat_pipe, categorical),
        ("num_pipe", num_pipe, numerical)
        ]
)

#pipline preprocessor + model
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("svm", SVC())
        #("dummy", DummyClassifier())
        #("logreg", LogisticRegression())
        #("knn", KNeighborsClassifier())
    ]
)

In [238]:
# pipe.fit(X_train, np.ravel(y_train))
# pipe.score(X_test, y_test)

In [239]:
param_grid = {
    "svm__C":[0.001, 0.01, 0.1, 1, 10, 100],
    "svm__gamma":[0.001, 0.01, 0.1, 1, 10, 100]
}

grid_svm = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid_svm.fit(X_train, np.ravel(y_train))

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat_pipe',
                                                                         Pipeline(steps=[('encoder',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         ['Pclass',
                                                                          'Sex']),
                                                                        ('num_pipe',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                         

In [240]:
print(
    "\nTest set score: {:.2f}\n".format(grid_svm.score(X_test, y_test))
)
print(
    "Best cross-validation score: {:.2f}".format(grid_svm.best_score_)
)
print(
    "Best parameters: {}".format(grid_svm.best_params_)
)
print(
    "Best best_estimator_: {}".format(grid_svm.best_estimator_.named_steps["svm"])
)


Test set score: 0.84

Best cross-validation score: 0.80
Best parameters: {'svm__C': 100, 'svm__gamma': 0.1}
Best best_estimator_: SVC(C=100, gamma=0.1)


In [241]:
pipe.steps[0][1]

ColumnTransformer(transformers=[('cat_pipe',
                                 Pipeline(steps=[('encoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 ['Pclass', 'Sex']),
                                ('num_pipe',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['Fare'])])

In [242]:
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

In [243]:
pipe2 = Pipeline([
    ("preprocessing", StandardScaler()),
    ("classifier", RandomForestClassifier())
])

In [244]:
param_grid2 = [
    {"classifier":[SVC()],"preprocessing":[StandardScaler(),None], "classifier__gamma":[0.001, 0.01, 0.1, 1, 10, 100],"classifier__C":[0.001, 0.01, 0.1, 1, 10, 100]},
    {"classifier":[RandomForestClassifier(n_estimators=100)],"preprocessing":[None],"classifier__max_features":[1,2,3]}
]

In [245]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(cancer.data, cancer.target, random_state=0)

In [246]:
grid2 =GridSearchCV(pipe2, param_grid=param_grid2, cv=5)
grid2.fit(X_train2, y_train2)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessing', StandardScaler()),
                                       ('classifier',
                                        RandomForestClassifier())]),
             param_grid=[{'classifier': [SVC(C=10, gamma=0.01)],
                          'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
                          'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
                          'preprocessing': [StandardScaler(), None]},
                         {'classifier': [RandomForestClassifier()],
                          'classifier__max_features': [1, 2, 3],
                          'preprocessing': [None]}])

In [247]:
print(
    "Best params:\n{}\n".format(grid2.best_params_)
)
print(
    "Best cross-validaiton score: {:.2f}".format(grid2.best_score_)
)
print(
    "Test-set score: {:2f}".format(grid.score(X_test, y_test))
)

Best params:
{'classifier': SVC(C=10, gamma=0.01), 'classifier__C': 10, 'classifier__gamma': 0.01, 'preprocessing': StandardScaler()}

Best cross-validaiton score: 0.99
Test-set score: 0.838565
