# Pipelines e Grid Search

In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn import set_config

In [None]:
# Show every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)

In [None]:
# Global plot theme: seaborn defaults with fonts scaled by 1.4.
sns.set(font_scale=1.4)
# Serif typeface, preferring Times New Roman. The concrete font is chosen via
# the matplotlib rcParams key 'font.serif'; 'fontname' is a Text property, not
# an rcParams key, so it is not valid inside a style dictionary.
sns.set_style({'font.family': 'serif',
               'font.serif': ['Times New Roman']})

In [None]:
# Default resolution (dots per inch) for every matplotlib figure created below.
mpl.rcParams['figure.dpi'] = 100

In [None]:
# Ask scikit-learn to render estimators as diagrams when a cell displays them.
set_config(display='diagram')

## 1) Base de dados: Diabetes

In [None]:
# Load the training split and preview its first rows.
df_treino = pd.read_csv(filepath_or_buffer='../input/diabetes2/Dados_Treino.csv')
df_treino.head()

In [None]:
# Load the test split and preview its first rows.
df_teste = pd.read_csv(filepath_or_buffer='../input/diabetes2/Dados_Teste.csv')
df_teste.head()

In [None]:
# Mean weight and mean age of the training frame (NaN values are skipped by
# .mean()); used afterwards as fill values for missing entries.
idade_media = df_treino.loc[:, 'age'].mean()
peso_medio = df_treino.loc[:, 'weight'].mean()

In [None]:
# Replace missing weight/age values with the means computed on the training
# frame. The same training means are applied to the test frame so that both
# frames are imputed consistently and no statistic is taken from the test data;
# otherwise the BMI fallback (weight / height^2) yields NaN for test rows with
# a missing weight.
df_treino['weight'] = df_treino['weight'].fillna(peso_medio)
df_treino['age'] = df_treino['age'].fillna(idade_media)
df_teste['weight'] = df_teste['weight'].fillna(peso_medio)
df_teste['age'] = df_teste['age'].fillna(idade_media)

In [None]:
# Divide the height column by 100 in both frames (presumably cm -> m, so the
# BMI formula weight / height^2 applies -- confirm the unit against the data).
# NOTE: re-running this cell divides again; run it once per kernel session.
for quadro in (df_treino, df_teste):
    quadro['height'] = quadro['height'] / 100

In [None]:
def setfillna_bmi(linha_df):
  """Return the row's BMI, computing it from weight and height when missing.

  Parameters:
    linha_df: mapping-like row (e.g. a pandas Series) with the keys
      'bmi', 'weight' and 'height'.

  Returns:
    The existing 'bmi' value when it is not null; otherwise
    weight / (height * height).
  """
  bmi_atual = linha_df['bmi']
  # Guard clause: keep any BMI that is already present.
  if not pd.isnull(bmi_atual):
    return bmi_atual
  peso = linha_df['weight']
  altura = linha_df['height']
  return peso / (altura * altura)

In [None]:
# Fill missing BMI values row by row in both frames; rows that already have a
# BMI keep it, the others get weight / height^2 from setfillna_bmi.
colunas_bmi = ['bmi', 'weight', 'height']
df_treino['bmi'] = df_treino[colunas_bmi].apply(setfillna_bmi, axis=1)
df_teste['bmi'] = df_teste[colunas_bmi].apply(setfillna_bmi, axis=1)

In [None]:
# Discard every training row that still contains at least one missing value.
df_treino = df_treino.dropna(axis=0, how='any')

In [None]:
# Discard every test row that still contains a missing value. The source must
# be df_teste itself: assigning df_treino.dropna() here would silently replace
# the test set with a copy of the training data.
df_teste = df_teste.dropna()

### Pipelines e Grid Search

### 1) Pré-Processamento

In [None]:
# Slice the feature columns once ('hospital_id' through
# 'solid_tumor_with_metastasis', inclusive) and split their names by dtype.
colunas_features = df_treino.loc[:, 'hospital_id':'solid_tumor_with_metastasis']
dados_numericos = colunas_features.select_dtypes(include=['int64', 'float64']).columns
dados_categoricos = colunas_features.select_dtypes(include=['object', 'bool']).columns

In [None]:
# Display the names of the int64/float64 feature columns.
dados_numericos

In [None]:
# Display the names of the object/bool feature columns.
dados_categoricos

In [None]:
# Target vector and feature matrix taken from the cleaned training frame.
y = df_treino['diabetes_mellitus']
X = df_treino.loc[:, 'hospital_id':'solid_tumor_with_metastasis']

# Hold out 25% of the rows, stratified on the target so both partitions keep
# the same class proportions; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

# Report the shape of each partition.
print('Amostras de treino:')
print(f' * X_train: {X_train.shape}')
print(f' * y_train: {y_train.shape}')

print('Amostras de teste:')
print(f' * X_test: {X_test.shape}')
print(f' * y_test: {y_test.shape}')

## 2. Pipelines

* [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)
* [MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)
* [ColumnTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html)
* [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

In [None]:
# Column-wise preprocessing: one-hot encode the categorical columns (unseen
# categories are ignored at transform time) and min-max scale the numeric ones.
transformers = [
    ('cat', OneHotEncoder(handle_unknown="ignore"), dados_categoricos),
    ('num', MinMaxScaler(), dados_numericos),
]
preprocessor = ColumnTransformer(transformers=transformers)

# Baseline classifier: a shallow entropy-based decision tree with a fixed seed.
arvore = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    min_samples_split=2,
    random_state=0,
)

# Full model = preprocessing followed by the tree; verbose reports the time
# spent fitting each step.
steps = [
    ('preprocessor', preprocessor),
    ('tree_model', arvore),
]
model = Pipeline(steps=steps, verbose=1)

In [None]:
# Display the assembled pipeline.
model

In [None]:
# Fit the pipeline on the training partition, then score it on both
# partitions (score is the classifier's default metric).
model.fit(X_train, y_train)
train_score, test_score = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)

In [None]:
# Score of the fitted pipeline on the training partition.
train_score

In [None]:
# Score of the fitted pipeline on the held-out partition.
test_score

## 3. Pipelines e validação cruzada

* KFold
* cross_val_score

In [None]:
# 5-fold cross-validation with a fixed shuffle so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# Accuracy of the whole pipeline on each fold; the pipeline is refit per fold,
# so preprocessing is learned only from that fold's training portion.
n_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=cv,
)

In [None]:
# Summarise the per-fold accuracies: raw values, mean and standard deviation
# (both rounded to 4 decimal places).
media_acuracias = round(np.mean(n_scores), 4)
desvio_acuracias = round(np.std(n_scores), 4)

print('Validação Cruzada\n')
print(f'Acurácias: {n_scores}\n')
print(f'Acurácias (média): {media_acuracias}')
print(f'Acurácias (desvio padrão): {desvio_acuracias}')

## 4. Pipelines e Grid Search

* [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

In [None]:
%%time

# Hyper-parameter grid for the pipeline's 'tree_model' step; the
# '<step>__<param>' syntax routes each value to the decision tree.
params = {
    'tree_model__criterion': ['entropy', 'gini'],
    'tree_model__max_depth': [3, 4, 6, 8, 10],
    'tree_model__min_samples_split': [2, 4, 6, 8],
    'tree_model__random_state': [0],
}

# Exhaustive search over the grid, reusing the same KFold splitter as the
# cross-validation above.
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=cv,
)

grid.fit(X_train, y_train)

In [None]:
# Parameter combination that obtained the best mean cross-validated score.
grid.best_params_

In [None]:
# Mean cross-validated score of the best parameter combination.
grid.best_score_