#  <span style="color:blue">AutoML com Pycaret</span>

# Prática 02
Nesta aula prática, vamos aprender e testar as diferentes formas de transformação de features que o PyCaret nos fornece.

#### Instalando o Pycaret.

In [None]:
pip install pycaret

#### Carregando o módulo de classificação e a função para buscar dados.

In [1]:
from pycaret.classification import *
from pycaret.datasets import get_data
import pandas as pd
import numpy as np
import seaborn as sns

# Conjunto de dados utilizado.

#### Carregando os dados.

In [2]:
# Load the 'income' sample dataset through pycaret.datasets.get_data.
# The cell output shows a preview of the first rows; the target column
# is 'income >50K'.
dataset = get_data('income')

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income >50K
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


#### Listando a quantidade de valores ausentes (missing) por atributo.

In [3]:
# Count missing values per column of the raw dataset. In the output only
# workclass, occupation and native-country have missing entries.
dataset.isnull().sum()

age                  0
workclass         1836
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income >50K          0
dtype: int64

# Configurando o ambiente com a função Setup.

A função `setup` inicializa o ambiente no PyCaret, cria o pipeline de transformação e prepara os dados para modelagem e deploy.

In [16]:
# Initialise the PyCaret classification experiment on the income data.
#   data / target          : full DataFrame and the name of the label column.
#   session_id             : echoed in the setup summary; per the PyCaret docs
#                            it seeds the random state so runs are repeatable.
#   categorical_imputation : "mode" -- per the PyCaret docs, missing categorical
#                            values are filled with the most frequent category.
exp_clf01 = setup(
    data=dataset,
    target='income >50K',
    session_id=123,
    categorical_imputation="mode",
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,income >50K
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(32561, 14)"
5,Missing Values,True
6,Numeric Features,4
7,Categorical Features,9
8,Ordinal Features,False
9,High Cardinality Features,False


##### Verificando o objeto criado.

In [5]:
# Display the object returned by setup(). The (truncated) output shows a tuple
# whose first element is a set of configuration names, followed by further
# values including the preprocessing Pipeline.
exp_clf01

({'USI',
  'X',
  'X_test',
  'X_train',
  '_all_metrics',
  '_all_models',
  '_all_models_internal',
  '_available_plots',
  '_gpu_n_jobs_param',
  '_internal_pipeline',
  '_ml_usecase',
  'create_model_container',
  'data_before_preprocess',
  'display_container',
  'exp_name_log',
  'experiment__',
  'fix_imbalance_method_param',
  'fix_imbalance_param',
  'fold_generator',
  'fold_groups_param',
  'fold_groups_param_full',
  'fold_param',
  'fold_shuffle_param',
  'gpu_param',
  'html_param',
  'imputation_classifier',
  'imputation_regressor',
  'iterative_imputation_iters_param',
  'log_plots_param',
  'logging_param',
  'master_model_container',
  'n_jobs_param',
  'prep_pipe',
  'pycaret_globals',
  'seed',
  'stratify_param',
  'target_param',
  'transform_target_method_param',
  'transform_target_param',
  'y',
  'y_test',
  'y_train'},
 False,
 None,
 Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
           

##### Armazenando o Dataframe transformado.

In [17]:
# Retrieve the experiment's "X_train" entry: the training features after the
# setup() preprocessing (the preview below shows one-hot encoded columns).
df_transformed = get_config("X_train")

In [18]:
# Preview the first rows of the transformed training features.
df_transformed.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
19599,55.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12982,49.0,0.0,0.0,40.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
15929,32.0,0.0,0.0,40.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
29192,52.0,0.0,0.0,40.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
13704,48.0,0.0,0.0,50.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


##### Realizando a contagem de valores nulos por atributo após a transformação.

In [19]:
# Count null values per transformed column. Every entry visible in the
# (abbreviated) output is 0, in contrast to the raw dataset.
df_transformed.isnull().sum()

age                               0
capital-gain                      0
capital-loss                      0
hours-per-week                    0
workclass_Federal-gov             0
                                 ..
native-country_Thailand           0
native-country_Trinadad&Tobago    0
native-country_United-States      0
native-country_Vietnam            0
native-country_Yugoslavia         0
Length: 101, dtype: int64

##### Verificando as features transformadas.

In [20]:
# List the transformed columns whose names start with "occupation",
# i.e. the columns generated from the original occupation attribute.
occupation_columns = [
    name for name in df_transformed.columns if name.startswith("occupation")
]
for column in occupation_columns:
    print(column)

occupation_Adm-clerical
occupation_Armed-Forces
occupation_Craft-repair
occupation_Exec-managerial
occupation_Farming-fishing
occupation_Handlers-cleaners
occupation_Machine-op-inspct
occupation_Other-service
occupation_Priv-house-serv
occupation_Prof-specialty
occupation_Protective-serv
occupation_Sales
occupation_Tech-support
occupation_Transport-moving


In [21]:
# List the transformed columns whose names start with "native-country",
# i.e. the columns generated from the original native-country attribute.
native_country_columns = [
    name for name in df_transformed.columns if name.startswith("native-country")
]
for column in native_country_columns:
    print(column)

native-country_Cambodia
native-country_Canada
native-country_China
native-country_Columbia
native-country_Cuba
native-country_Dominican-Republic
native-country_Ecuador
native-country_El-Salvador
native-country_England
native-country_France
native-country_Germany
native-country_Greece
native-country_Guatemala
native-country_Haiti
native-country_Honduras
native-country_Hong
native-country_Hungary
native-country_India
native-country_Iran
native-country_Ireland
native-country_Italy
native-country_Jamaica
native-country_Japan
native-country_Laos
native-country_Mexico
native-country_Nicaragua
native-country_Outlying-US(Guam-USVI-etc)
native-country_Peru
native-country_Philippines
native-country_Poland
native-country_Portugal
native-country_Puerto-Rico
native-country_Scotland
native-country_South
native-country_Taiwan
native-country_Thailand
native-country_Trinadad&Tobago
native-country_United-States
native-country_Vietnam
native-country_Yugoslavia


##### Verificando os valores mais frequentes.

In [13]:
# Frequency of each occupation in the raw dataset, most frequent first
# (missing values are not counted). The first entry is the mode --
# presumably the value used to fill missing occupations under
# categorical_imputation="mode"; confirm against the PyCaret docs.
dataset["occupation"].value_counts()

Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: occupation, dtype: int64

In [14]:
# Frequency of each native-country value in the raw dataset, most frequent
# first; the first entry is the mode of the column.
dataset["native-country"].value_counts()

United-States                 29170
Mexico                          643
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
France                           29
Greece                      