<a href="https://colab.research.google.com/github/sirleudo/sigmoidal_data_science/blob/master/Lidando_com_vari%C3%A1veis_categ%C3%B3ricas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img alt="Colaboratory logo" width="15%" src="https://github.com/sirleudo/sigmoidal_data_science/blob/master/logo_pensatah.png?raw=true">

#### **Data Science na Prática 2.0**
*by Sirleudo Evarsito*

# Lidando com variáveis categóricas
Em machine learning, muitos modelos não conseguirão lidar diretamente com variáveis categóricas. Dessa maneira, é importante conhecer os principais métodos e saber como aplicá-los.

Nesta aula veremos como usar o LabelEncoder e OneHotEncoder. Mais que isso, vou te mostrar algumas situações onde colunas numéricas são, na verdade, variáveis categóricas.

Para exemplificar o uso dessas técnicas, vou usar o dataset de câncer de mama, disponibilizado pela UCI Machine Learning Repository.

In [14]:
# importar os pacotes necessários
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# importar arquivo csv
df = pd.read_csv("https://raw.githubusercontent.com/carlosfab/dsnp2/master/datasets/breast-cancer.data", header=None,
                 names=["class", "age", "menopause", "tumor_size",
                        "inv_nodes", "nodes-caps", "deg_malig", "breast",
                        "breast_quad", "irradiat"])

# ver as entradas
df.head()

Unnamed: 0,class,age,menopause,tumor_size,inv_nodes,nodes-caps,deg_malig,breast,breast_quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [56]:
# ver quantidade de linhas e colunas do dataframe
print(f'Número de linhas:\t {df.shape[0]}')
print(f'Número de colunas:\t {df.shape[1]}')

Número de linhas:	 286
Número de colunas:	 10


In [57]:
# verificar dados ausentes
df.isnull().sum()

class          0
age            0
menopause      0
tumor_size     0
inv_nodes      0
nodes-caps     0
deg_malig      0
breast         0
breast_quad    0
irradiat       0
dtype: int64

In [58]:
# verificar tipos de dados
df.dtypes

class          object
age            object
menopause      object
tumor_size     object
inv_nodes      object
nodes-caps     object
deg_malig       int64
breast         object
breast_quad    object
irradiat       object
dtype: object

In [64]:
# verificar quantidade de valores unicos da feature class
df['class'].value_counts()

no-recurrence-events    201
recurrence-events        85
Name: class, dtype: int64

In [68]:
# verificar valores unicos usando metodo unique
df['class'].unique()

array(['no-recurrence-events', 'recurrence-events'], dtype=object)

In [4]:
# dividir dataframe em treino e teste
X = df.drop('class', axis=1)
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
X.head()

Unnamed: 0,age,menopause,tumor_size,inv_nodes,nodes-caps,deg_malig,breast,breast_quad,irradiat
0,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [7]:
y.head()

0    no-recurrence-events
1    no-recurrence-events
2    no-recurrence-events
3    no-recurrence-events
4    no-recurrence-events
Name: class, dtype: object

# Label encoding

In [27]:
# y_train antes do encoding
y_train

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1])

In [20]:
# y_test antes do encoding
y_test

113    no-recurrence-events
23     no-recurrence-events
95     no-recurrence-events
15     no-recurrence-events
60     no-recurrence-events
               ...         
125    no-recurrence-events
1      no-recurrence-events
67     no-recurrence-events
84     no-recurrence-events
193    no-recurrence-events
Name: class, Length: 72, dtype: object

In [22]:
# codificando a variável alvo
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [23]:
# y_train depois de treinado
y_train

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1])

In [28]:
# Y_test depois do encoding
y_test

array([0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0])

In [34]:
# visualizar as classe (fase do fit)
le.classes_

array(['no-recurrence-events', 'recurrence-events'], dtype=object)

In [35]:
# recuperando e convertendo os labels
le.inverse_transform(y_train)[:5]

array(['no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events'], dtype=object)

# One-hot encoding

In [37]:
# X_train antes do OnehHotEncoder
X_train

Unnamed: 0,age,menopause,tumor_size,inv_nodes,nodes-caps,deg_malig,breast,breast_quad,irradiat
198,50-59,ge40,15-19,0-2,yes,2,left,central,yes
138,30-39,premeno,40-44,0-2,no,2,left,left_low,yes
196,50-59,premeno,10-14,3-5,no,1,right,left_up,no
200,60-69,ge40,25-29,0-2,no,3,right,left_low,no
14,40-49,premeno,30-34,0-2,no,3,left,left_up,no
...,...,...,...,...,...,...,...,...,...
80,50-59,ge40,10-14,0-2,no,2,right,left_low,no
3,60-69,ge40,15-19,0-2,no,2,right,left_up,no
273,60-69,ge40,30-34,0-2,yes,2,right,right_up,yes
127,30-39,premeno,25-29,6-8,yes,2,right,left_up,yes


In [38]:
from sklearn.preprocessing import OneHotEncoder

le = OneHotEncoder()
le.fit(X_train)
X_train_enc = le.transform(X_train)

In [40]:
X_train_enc

<214x43 sparse matrix of type '<class 'numpy.float64'>'
	with 1926 stored elements in Compressed Sparse Row format>

In [41]:
X_train_enc.toarray()

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.]])

# Dummies values

In [43]:
pd.get_dummies(df, columns=['menopause', 'breast'])

Unnamed: 0,class,age,tumor_size,inv_nodes,nodes-caps,deg_malig,breast_quad,irradiat,menopause_ge40,menopause_lt40,menopause_premeno,breast_left,breast_right
0,no-recurrence-events,30-39,30-34,0-2,no,3,left_low,no,0,0,1,1,0
1,no-recurrence-events,40-49,20-24,0-2,no,2,right_up,no,0,0,1,0,1
2,no-recurrence-events,40-49,20-24,0-2,no,2,left_low,no,0,0,1,1,0
3,no-recurrence-events,60-69,15-19,0-2,no,2,left_up,no,1,0,0,0,1
4,no-recurrence-events,40-49,0-4,0-2,no,2,right_low,no,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,30-34,0-2,no,2,left_up,no,0,0,1,1,0
282,recurrence-events,30-39,20-24,0-2,no,3,left_up,yes,0,0,1,1,0
283,recurrence-events,60-69,20-24,0-2,no,1,left_up,no,1,0,0,0,1
284,recurrence-events,40-49,30-34,3-5,no,3,left_low,no,1,0,0,1,0


In [44]:
pd.get_dummies(df)

Unnamed: 0,deg_malig,class_no-recurrence-events,class_recurrence-events,age_20-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,menopause_ge40,menopause_lt40,menopause_premeno,tumor_size_0-4,tumor_size_10-14,tumor_size_15-19,tumor_size_20-24,tumor_size_25-29,tumor_size_30-34,tumor_size_35-39,tumor_size_40-44,tumor_size_45-49,tumor_size_5-9,tumor_size_50-54,inv_nodes_0-2,inv_nodes_12-14,inv_nodes_15-17,inv_nodes_24-26,inv_nodes_3-5,inv_nodes_6-8,inv_nodes_9-11,nodes-caps_?,nodes-caps_no,nodes-caps_yes,breast_left,breast_right,breast_quad_?,breast_quad_central,breast_quad_left_low,breast_quad_left_up,breast_quad_right_low,breast_quad_right_up,irradiat_no,irradiat_yes
0,3,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0
1,2,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0
2,2,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0
3,2,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0
4,2,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,2,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0
282,3,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1
283,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0
284,3,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0


In [54]:
df_enc = pd.get_dummies(df)

In [55]:
print(f'Linhas: {df_enc.shape[0]}')
print(f'Colunas: {df_enc.shape[1]}')

Linhas: 286
Colunas: 43
