#  <span style="color:blue">AutoML com Pycaret</span>

# Prática 05
Nesta aula prática vamos aprender e testar as diferentes formas de transformação de features que o Pycaret nos fornece.

#### Instalando o Pycaret.

In [None]:
pip install pycaret

#### Carregando o módulo de classificação e a função para buscar dados.

In [None]:
from pycaret.classification import *
from pycaret.datasets import get_data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Conjunto de dados utilizado.

#### Carregando os dados.

In [None]:
# Fetch the 'income' dataset through PyCaret's get_data helper.
dataset = get_data(dataset='income')

##### Adicionando a nova feature com baixa variância.

In [None]:
# Append a column holding the same value on every row (a constant feature).
dataset = dataset.assign(feature_test="comum_value")

In [None]:
# Show the first five rows, including the newly added column.
dataset.head(n=5)

##### Verificando o percentual de cada valor.

In [None]:
# Relative frequency of each distinct 'feature_test' value, as a percentage.
value_share = dataset['feature_test'].value_counts(normalize=True)
value_share.mul(100)

##### Inserindo um registro raro.

In [None]:
# Overwrite the value on the row labelled 0 so that one "rare" record exists.
dataset.loc[0, 'feature_test'] = "rare"

In [None]:
# Show the first five rows; the row labelled 0 now carries "rare".
dataset.head(n=5)

In [None]:
# Row labels of a random 10% sample of the data. The sample is seeded so the
# same rows are picked on every run, matching the fixed session_id used in
# setup().
index_rare = dataset["feature_test"].sample(frac=0.10, random_state=123).index

In [None]:
# Mark every sampled row as "rare". `.at` is a scalar accessor (one row label,
# one column label); `.loc` is the accessor for assigning through a collection
# of row labels.
dataset.loc[index_rare, 'feature_test'] = "rare"

# Configurando o ambiente com a função Setup.

A função `setup` inicializa o ambiente no PyCaret, cria o pipeline de transformação e prepara os dados para modelagem e deploy.

In [None]:
# Initialise the PyCaret classification experiment on `dataset`.
# ignore_low_variance=True is the option being exercised in this section;
# session_id=123 keeps the run reproducible, and silent/verbose suppress the
# interactive confirmation and the printed summary.
exp_clf01 = setup(
    data=dataset,
    target='income >50K',
    session_id=123,
    ignore_low_variance=True,
    silent=True,
    verbose=False,
)

##### Armazenando o Dataframe transformado.

In [None]:
# Retrieve the "X_train" entry of the experiment created by setup().
df_transformed = get_config(variable="X_train")

In [None]:
# Show the first five rows of the transformed training frame.
df_transformed.head(n=5)

In [None]:
# Display the column labels of the transformed frame.
df_transformed.columns

##### Verificando se a feature existe

In [None]:
# Print each transformed column whose name contains "feature_test";
# nothing is printed when no column matches.
matching_columns = [name for name in df_transformed.columns if "feature_test" in name]
for name in matching_columns:
    print(name)

# Trabalhando com registros categóricos raros.

##### Listando os registros e suas distribuições na feature native-country.

In [27]:
# Number of rows per country, listed from the rarest to the most common.
country_counts = dataset["native-country"].value_counts(ascending=True)
country_counts

Holand-Netherlands                1
Scotland                         12
Honduras                         13
Hungary                          13
Outlying-US(Guam-USVI-etc)       14
Yugoslavia                       16
Laos                             18
Thailand                         18
Cambodia                         19
Trinadad&Tobago                  19
Hong                             20
Ireland                          24
Ecuador                          28
Greece                           29
France                           29
Peru                             31
Nicaragua                        34
Portugal                         37
Iran                             43
Haiti                            44
Taiwan                           51
Columbia                         59
Poland                           60
Japan                            62
Guatemala                        64
Vietnam                          67
Dominican-Republic               70
Italy                       

##### Calculando o percentil 25 (primeiro quartil) das contagens por país.

In [29]:
# Keep only the per-country row counts (ascending) as a NumPy array.
lista = dataset["native-country"].value_counts(ascending=True).to_numpy()

In [30]:
# Display the array of per-country counts.
lista

array([    1,    12,    13,    13,    14,    16,    18,    18,    19,
          19,    20,    24,    28,    29,    29,    31,    34,    37,
          43,    44,    51,    59,    60,    62,    64,    67,    70,
          73,    75,    80,    81,    90,    95,   100,   106,   114,
         121,   137,   198,   643, 29170])

In [31]:
# 25th percentile of the per-country counts.
np.percentile(lista, q=25)

20.0

#### Definindo o Pipeline com a função Setup.

In [32]:
# Re-initialise the experiment, this time asking PyCaret to combine rare
# categorical levels using a rare_level_threshold of 0.25.
# session_id=123 keeps the run reproducible.
exp_clf01 = setup(
    data=dataset,
    target='income >50K',
    session_id=123,
    combine_rare_levels=True,
    rare_level_threshold=0.25,
    silent=True,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,income >50K
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(32561, 15)"
5,Missing Values,True
6,Numeric Features,4
7,Categorical Features,10
8,Ordinal Features,False
9,High Cardinality Features,False


##### Armazenando o Dataframe transformado.

In [33]:
# Retrieve the "X_train" entry of the experiment created by setup().
df_transformed = get_config(variable="X_train")

In [34]:
# Show the first five rows of the transformed training frame.
df_transformed.head(n=5)

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_others_infrequent,...,native-country_Poland,native-country_Portugal,native-country_Puerto-Rico,native-country_South,native-country_Taiwan,native-country_United-States,native-country_Vietnam,native-country_not_available,native-country_others_infrequent,feature_test_comum_value
19599,55.0,0.0,0.0,40.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
12982,49.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
15929,32.0,0.0,0.0,40.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
29192,52.0,0.0,0.0,40.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
13704,48.0,0.0,0.0,50.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


##### Verificando as features criadas.

In [35]:
# Print each transformed column whose name contains "native-country";
# nothing is printed when no column matches.
country_columns = [name for name in df_transformed.columns if "native-country" in name]
for name in country_columns:
    print(name)

native-country_Canada
native-country_China
native-country_Columbia
native-country_Cuba
native-country_Dominican-Republic
native-country_Ecuador
native-country_El-Salvador
native-country_England
native-country_France
native-country_Germany
native-country_Greece
native-country_Guatemala
native-country_Haiti
native-country_India
native-country_Iran
native-country_Italy
native-country_Jamaica
native-country_Japan
native-country_Mexico
native-country_Nicaragua
native-country_Peru
native-country_Philippines
native-country_Poland
native-country_Portugal
native-country_Puerto-Rico
native-country_South
native-country_Taiwan
native-country_United-States
native-country_Vietnam
native-country_not_available
native-country_others_infrequent
