# 1. INTRODUÇÃO

<b>Datasets necessários:</b>
- Credit Data

<b>O que é um Pipeline?<br></b>
Como diz a própria documentação do Scikit-Learn, o pipeline é uma sequência de transformações seguidas de um estimador final. O objetivo é reunir diversas etapas para serem validadas em conjunto ao definir parâmetros diferentes.

Dentre as principais vantagens do pipeline, estão:

- Dar maior legibilidade ao código, facilitando a sua leitura;
- Forçar a execução das transformações na ordem correta;
- Tornar o script mais reproduzível.

Sendo assim, em vez de executar todas as transformações de forma desorganizada ao longo do código, com as funções de pipeline você terá tudo organizado em um pequeno trecho do seu programa.

<b>Estrutura do Pipeline:</b>
<img src = 'pipelines.png'  width="500" height="600">

<b>Sintaxes possíveis:</b>

1 - PIPELINE
```python
pipe = Pipeline([('transformacao_1', transformacao_1()),
                 ('transformacao_2', transformacao_2()),
                 ...
                 ('transformacao_n', transformacao_n()),
                 ('modelo', modelo())])

pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)
```

2 - MAKE_PIPELINE
```python
pipe2 = make_pipeline(transformacao_1(), transformacao_2(),
 ... , transformacao_n(),  modelo())
pipe2.fit(X_train, y_train)
pipe2.score(X_test, y_test)
```

3 - PIPELINE + COLUMN_TRANSFORMER
3.1
```python
t = [('num', SimpleImputer(strategy="median"), colunas_numericas),
('cat', SimpleImputer(strategy="most_frequent"), colunas_categoricas)]
transformer = ColumnTransformer(transformers=t)
```

3.2
```python
numeric_transformer = Pipeline(
steps=[("median_imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
steps=[('freq_imputer', SimpleImputer(strategy='most_frequent'))]
)

# aplica transformador numerico nas colunas 0 e 1 e categorico nas 2 e 3
t = [('num', numeric_transformer, [0, 1]), ('cat', categorical_transformer, [2, 3])]
transformer = ColumnTransformer(transformers=t)
```

# 2. PIPELINE

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the German credit dataset from a CSV file in the working directory.
german = pd.read_csv("german_credit_data.csv")

In [None]:
# Preview the first rows of the raw data.
german.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [None]:
import numpy as np

# Binarise the target in place: rows labelled 'bad' become 1, every other value becomes 0.
german['Risk'] = np.where(german['Risk'].eq('bad'), 1, 0)
german.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,0
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,1
2,2,49,male,1,own,little,,2096,12,education,0
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,0
4,4,53,male,2,free,little,little,4870,24,car,1


In [None]:
# Keep only the three numeric predictors plus the target; .copy() makes the
# result an independent DataFrame.
german = german.loc[:, ['Age', 'Credit amount', 'Duration', 'Risk']].copy()

In [None]:
# Preview the reduced frame.
german.head()

Unnamed: 0,Age,Credit amount,Duration,Risk
0,67,1169,6,0
1,22,5951,48,1
2,49,2096,12,0
3,45,7882,42,0
4,53,4870,24,1


In [None]:
# Predictor columns (X) and the target column (y).
features = german[['Age', 'Credit amount', 'Duration']]
labels = german['Risk']

In [None]:
# Hold out 30% of the rows for testing; random_state=0 fixes the shuffle so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=0
)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier

# Without a pipeline each step would be fitted and applied by hand, e.g.:
#   scaler = StandardScaler()
#   scaler.fit_transform(X_train)

# Chain scaling, median imputation and a decision tree into a single estimator,
# so one fit/score call runs every step in the declared order.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("imputer", SimpleImputer(strategy="median")),
        ("clf", DecisionTreeClassifier()),
    ]
)

pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.61

In [None]:
# Predict the class label for every row of the test set.
pipe.predict(X_test)

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0])

# 3. MAKE_PIPELINE

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# make_pipeline builds a Pipeline and names each step automatically, so only
# the estimator instances are passed, in execution order.
make_pipe = make_pipeline(
    MinMaxScaler(),
    SimpleImputer(strategy="mean"),
    LogisticRegression(),
)

make_pipe.fit(X_train, y_train)
make_pipe.score(X_test, y_test)

0.7166666666666667

# 4. COLUMN TRANSFORMER

In [None]:
# Reload the raw CSV so that columns dropped earlier (e.g. 'Purpose') are available again.
german = pd.read_csv("german_credit_data.csv")

In [None]:
# Encode the target as 1 for 'bad' and 0 for every other value.
german['Risk'] = np.where(german['Risk'].eq('bad'), 1, 0)

# This time also keep the 'Purpose' column alongside the numeric predictors.
german = german.loc[:, ['Age', 'Credit amount', 'Duration', 'Purpose', 'Risk']].copy()

german.head()

Unnamed: 0,Age,Credit amount,Duration,Purpose,Risk
0,67,1169,6,radio/TV,0
1,22,5951,48,radio/TV,1
2,49,2096,12,education,0
3,45,7882,42,furniture/equipment,0
4,53,4870,24,car,1


In [None]:
# Separate the target from the predictors, then hold out 30% of the rows for
# testing with a fixed seed so the split is repeatable.
labels = german['Risk']
features = german[['Age', 'Credit amount', 'Duration', 'Purpose']]

X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Column groups routed to different transformers: the continuous numeric
# columns and the string-valued categorical columns.
numericas_continuas = ["Age", "Credit amount", "Duration"]
string_categoricas = ["Purpose"]

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Route each column group to its own transformer:
#   - continuous numeric columns are standardised;
#   - string categorical columns are one-hot encoded.
# handle_unknown="ignore" makes the encoder emit all zeros for a category that
# was absent at fit time instead of raising, so a rare 'Purpose' value that
# lands only in the test split cannot break score()/predict().
t = [('num_continuas', StandardScaler(), numericas_continuas),
     ('str_categoricas', OneHotEncoder(handle_unknown="ignore"), string_categoricas)]

# Columns not listed in `t` are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(transformers=t)

In [None]:
# Full model: column-wise preprocessing followed by a decision tree classifier.
pipe_transformer = Pipeline(steps=[('preprocessor', preprocessor), ('clf', DecisionTreeClassifier())])

In [None]:
# Fit the preprocessing steps and the classifier on the training split.
pipe_transformer.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out test split.
pipe_transformer.score(X_test, y_test)

0.6266666666666667

# 5. LOOP PARA TESTAR VÁRIOS MODELOS

In [None]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [None]:
# Candidate classifiers to benchmark; each keeps its library defaults except
# where arguments are given explicitly.
classifiers_list = [
    KNeighborsClassifier(n_neighbors=3),
    SVC(kernel="rbf", C=0.025, probability=True),
    SVC(),
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    XGBClassifier(),
]


In [None]:
# Benchmark every candidate: wrap it in a pipeline behind the shared
# preprocessor, fit on the training split and print its test-set score.
for classifier in classifiers_list:
    pipe = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", classifier),
        ]
    ).fit(X_train, y_train)
    print(classifier)
    test_score = pipe.score(X_test, y_test)
    print("model score: %.3f" % test_score)

KNeighborsClassifier(n_neighbors=3)
model score: 0.690
SVC(C=0.025, probability=True)
model score: 0.713
SVC()
model score: 0.720
LogisticRegression()
model score: 0.697
DecisionTreeClassifier()
model score: 0.613
RandomForestClassifier()
model score: 0.697
AdaBoostClassifier()
model score: 0.720
GradientBoostingClassifier()
model score: 0.703
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constra