In [None]:
import warnings
# Silence every warning category for the rest of the session so that cell
# outputs stay compact.
# NOTE(review): this blanket filter also hides library warnings raised later
# in the notebook; consider narrowing it to specific categories.
warnings.simplefilter('ignore')

# Scikit-learn Pipelines

## Definição Manual de um Pipeline

### Leitura do dataset

Estamos utilizando o dataset do Titanic, disponível no [Kaggle](https://www.kaggle.com/c/titanic/data?select=train.csv).

In [None]:
!pip install gdown



In [None]:
!gdown https://drive.google.com/uc?id=1nUojuf_X8r3MMEpa60PZkes5rk1eCueI

Downloading...
From: https://drive.google.com/uc?id=1nUojuf_X8r3MMEpa60PZkes5rk1eCueI
To: /content/train_titanic.csv
  0% 0.00/61.2k [00:00<?, ?B/s]100% 61.2k/61.2k [00:00<00:00, 64.6MB/s]


In [None]:
import pandas as pd

In [None]:
# Load the downloaded Titanic training set into a DataFrame.
titanic_data = pd.read_csv('train_titanic.csv')

In [None]:
# Preview the first five rows of the raw data.
titanic_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Remover colunas inúteis

In [None]:
# Discard the columns that this tutorial does not use as features.
titanic_data = titanic_data.drop(columns=["Name", "Ticket", "Cabin"])

In [None]:
# Confirm that the dropped columns are gone.
titanic_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


### Verificar valores faltando

In [None]:
# Total number of missing cells: per-column counts summed over all columns.
print("Missing Values: {}".format(titanic_data.isna().sum().sum()))

Missing Values: 179


Listar colunas com valores faltando

In [None]:
# Boolean flag per column: True when the column has at least one missing value.
titanic_data.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Sex            False
Age             True
SibSp          False
Parch          False
Fare           False
Embarked        True
dtype: bool

Imputar valores faltantes

In [None]:
from sklearn.impute import SimpleImputer
import numpy as np

In [None]:
# Fill every missing entry with the most frequent value of its column.
# The transform returns a NumPy array, so from this cell on titanic_data is
# no longer a DataFrame and its columns are addressed by position.
# NOTE(review): the imputer is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
titanic_data = imputer.fit_transform(titanic_data)

In [None]:
# First ten rows of the imputed array (all columns).
titanic_data[:10, :]

array([[1, 0, 3, 'male', 22.0, 1, 0, 7.25, 'S'],
       [2, 1, 1, 'female', 38.0, 1, 0, 71.2833, 'C'],
       [3, 1, 3, 'female', 26.0, 0, 0, 7.925, 'S'],
       [4, 1, 1, 'female', 35.0, 1, 0, 53.1, 'S'],
       [5, 0, 3, 'male', 35.0, 0, 0, 8.05, 'S'],
       [6, 0, 3, 'male', 24.0, 0, 0, 8.4583, 'Q'],
       [7, 0, 1, 'male', 54.0, 0, 0, 51.8625, 'S'],
       [8, 0, 3, 'male', 2.0, 3, 1, 21.075, 'S'],
       [9, 1, 3, 'female', 27.0, 0, 2, 11.1333, 'S'],
       [10, 1, 2, 'female', 14.0, 1, 0, 30.0708, 'C']], dtype=object)

### Codificar classes em categorias numéricas

Vamos utilizar o pacote `category_encoders` (do projeto scikit-learn-contrib, compatível com a API do scikit-learn) para aplicar o `OneHotEncoder` em dados textuais

In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m936.7 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [None]:
from category_encoders import OneHotEncoder

In [None]:
# Single encoder instance with default settings; the following cells re-fit it
# once per categorical column.
enc = OneHotEncoder()

Vamos aplicar o encoder às colunas (variáveis independentes) que possuem dados categóricos

In [None]:
# One-hot encode column 3 (Sex): the indicator columns are appended to the
# right of the array and the original text column is then removed.
sex_column = titanic_data[:, 3]
enc.fit(sex_column)
sex_onehot = enc.transform(sex_column)
titanic_data = np.delete(np.column_stack((titanic_data, sex_onehot)), 3, axis=1)

In [None]:
# First ten rows after encoding Sex (new indicator columns are the last ones).
titanic_data[:10, :]

array([[1, 0, 3, 22.0, 1, 0, 7.25, 'S', 1, 0],
       [2, 1, 1, 38.0, 1, 0, 71.2833, 'C', 0, 1],
       [3, 1, 3, 26.0, 0, 0, 7.925, 'S', 0, 1],
       [4, 1, 1, 35.0, 1, 0, 53.1, 'S', 0, 1],
       [5, 0, 3, 35.0, 0, 0, 8.05, 'S', 1, 0],
       [6, 0, 3, 24.0, 0, 0, 8.4583, 'Q', 1, 0],
       [7, 0, 1, 54.0, 0, 0, 51.8625, 'S', 1, 0],
       [8, 0, 3, 2.0, 3, 1, 21.075, 'S', 1, 0],
       [9, 1, 3, 27.0, 0, 2, 11.1333, 'S', 0, 1],
       [10, 1, 2, 14.0, 1, 0, 30.0708, 'C', 0, 1]], dtype=object)

In [None]:
# One-hot encode column 7 (Embarked, after the Sex column was removed): the
# encoder is re-fitted on this column, its indicator columns are appended and
# the original text column is removed.
embarked_column = titanic_data[:, 7]
enc.fit(embarked_column)
embarked_onehot = enc.transform(embarked_column)
titanic_data = np.delete(np.column_stack((titanic_data, embarked_onehot)), 7, axis=1)

In [None]:
# First ten rows of the fully numeric array.
titanic_data[:10, :]

array([[1, 0, 3, 22.0, 1, 0, 7.25, 1, 0, 1, 0, 0],
       [2, 1, 1, 38.0, 1, 0, 71.2833, 0, 1, 0, 1, 0],
       [3, 1, 3, 26.0, 0, 0, 7.925, 0, 1, 1, 0, 0],
       [4, 1, 1, 35.0, 1, 0, 53.1, 0, 1, 1, 0, 0],
       [5, 0, 3, 35.0, 0, 0, 8.05, 1, 0, 1, 0, 0],
       [6, 0, 3, 24.0, 0, 0, 8.4583, 1, 0, 0, 0, 1],
       [7, 0, 1, 54.0, 0, 0, 51.8625, 1, 0, 1, 0, 0],
       [8, 0, 3, 2.0, 3, 1, 21.075, 1, 0, 1, 0, 0],
       [9, 1, 3, 27.0, 0, 2, 11.1333, 0, 1, 1, 0, 0],
       [10, 1, 2, 14.0, 1, 0, 30.0708, 0, 1, 0, 1, 0]], dtype=object)

### Criação e avaliação dos modelos de classificação

Separar em treino e teste

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target vector: column 1 (Survived), cast from object dtype to integer.
y = titanic_data[:, 1].astype(int)

In [None]:
# Feature matrix: remove the target (column 1, Survived) and also column 0
# (PassengerId), which is only a row identifier and should not be fed to the
# classifiers as a feature.
X = np.delete(titanic_data, [0, 1], axis=1)

In [None]:
# Hold out 30% of the rows for evaluation. The fixed seed makes the split, and
# therefore the scores reported below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Comparação do score de alguns modelos




In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [None]:
# Two baseline classifiers, both with default hyper-parameters.
lr, gnb = LogisticRegression(), GaussianNB()

In [None]:
# Train both classifiers on the same training split.
lr.fit(X_train, y_train)
# Last expression of the cell: its return value is what the notebook displays.
gnb.fit(X_train, y_train)

Calcular scores dos classificadores:

In [None]:
# Score each classifier on the held-out split, then report both values
# with two decimals.
lr_score = lr.score(X_test, y_test)
gnb_score = gnb.score(X_test, y_test)
print("Score Logistic Regression: {:.2f}".format(lr_score))
print("Score Naive Bayes: {:.2f}".format(gnb_score))

Score Logistic Regression: 0.79
Score Naive Bayes: 0.79


## Definindo um workflow com Scikit-learn pipeline
* A mesma análise anterior, porém incluindo cada avaliação em um objeto pipe
* Ao final, um pipeline com outros pipes é executado
* Um pipe possui uma ou mais transformações, por exemplo, normalização e codificação (objetos que possuem a função `transform()`), e, por último, um classificador (que possui a função `fit()`)

In [None]:
# Reload the raw CSV and drop the name, ticket and cabin columns in one chain.
titanic_data = (
    pd.read_csv('train_titanic.csv')
    .drop(columns=["Name", "Ticket", "Cabin"])
)

Separando dados em teste e treino

In [None]:
from sklearn.model_selection import train_test_split
# Split into training and test sets (30% held out, fixed seed).
features = titanic_data.drop(columns=['Survived'])
target = titanic_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=42
)

### Criação de pipes

Pipeline para pré-processamento das variáveis `Age` e `Fare`

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder

In [None]:
# Numeric branch: replace missing values with the column median.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

Pipeline para pré-processamento das variáveis `Sex` e `Embarked`

In [None]:
# Categorical branch: one-hot encode the text columns with default settings.
cat_transformer = Pipeline([
    ('one-hot encoder', OneHotEncoder()),
])

Concatenação de pre-processadores

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Route each group of columns to its own branch. Only the columns listed here
# reach the classifier (ColumnTransformer's default is remainder='drop').
numeric_columns = ['Age', 'Fare']
categorical_columns = ['Sex', 'Embarked']
preprocessor = ColumnTransformer([
    ('num', num_transformer, numeric_columns),
    ('cat', cat_transformer, categorical_columns),
])

Criando os modelos usando pipeline, com os classificadores `LogisticRegression` e `GaussianNB`

### Definição do pipeline completo

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [None]:
from sklearn.base import clone

# Full workflows: preprocessing followed by a classifier.
# Each pipeline receives its own unfitted copy of the preprocessor. Pipeline
# fits the step objects it is given, so sharing a single instance would mean
# that fitting one model re-fits the transformer used by the other.
model_lr = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('lr', LogisticRegression())
])

model_nb = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('nb', GaussianNB())
])


Realizar o `fit` dos modelos

In [None]:
# Fit both pipelines (preprocessing + classifier) on the training split.
model_lr.fit(X_train, y_train)
# Last expression of the cell: its return value is what the notebook displays.
model_nb.fit(X_train, y_train)

### Avaliação dos modelos definidos nos pipelines

In [None]:
# Score each pipeline on the held-out split, then report both values
# with two decimals.
lr_pipeline_score = model_lr.score(X_test, y_test)
nb_pipeline_score = model_nb.score(X_test, y_test)
print("Score Logistic Regression: {:.2f}".format(lr_pipeline_score))
print("Score Naive Bayes: {:.2f}".format(nb_pipeline_score))

Score Logistic Regression: 0.78
Score Naive Bayes: 0.71


## Exercício

### Tarefas
* Adicionar [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) para a variável `Fare`
* Adicionar [Discretizador](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html#sklearn.preprocessing.KBinsDiscretizer) para a variável `Age`