# Tutorial sobre Pre-Processamento

In [1]:
import pandas as pd

In [17]:
df = pd.read_csv('exemplo.csv')

In [18]:
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,
3,,,,
4,2.0,3.0,1.0,5.0


## Checagem de valores nulos / faltantes

In [19]:
# df.isnull()
df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True
3,True,True,True,True
4,False,False,False,False


In [20]:
df.isna().sum()

A    1
B    1
C    2
D    2
dtype: int64

## Eliminar amostras com valores faltantes

In [21]:
#df.dropna(axis=0, inplace=True) # inplace altera o df original
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
4,2.0,3.0,1.0,5.0


## Eliminar features com valores faltantes

In [22]:
df.dropna(axis=1)

0
1
2
3
4


In [23]:
# elimina linha com todos os valores faltantes
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,
4,2.0,3.0,1.0,5.0


In [25]:
df.dropna(thresh=2)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,
4,2.0,3.0,1.0,5.0


In [26]:
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,
4,2.0,3.0,1.0,5.0


In [29]:
df.drop(labels=[0, 1])

Unnamed: 0,A,B,C,D
2,10.0,11.0,12.0,
3,,,,
4,2.0,3.0,1.0,5.0


In [30]:
df.drop(labels=['A', 'C'], axis=1)

Unnamed: 0,B,D
0,2.0,4.0
1,6.0,8.0
2,11.0,
3,,
4,3.0,5.0


## Assinalar valores aos faltantes

In [31]:
from sklearn.preprocessing import Imputer

In [33]:
df_ = df.dropna(how='all')

In [36]:
# aqui o eixo 0 é o eixo das colunas
prep = Imputer(missing_values='NaN', strategy='mean', axis=0)
prep = prep.fit(df_)
dados = prep.transform(df_)
dados

array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 5.        ,  6.        ,  5.33333333,  8.        ],
       [10.        , 11.        , 12.        ,  5.66666667],
       [ 2.        ,  3.        ,  1.        ,  5.        ]])

In [40]:
# Se desejar recriar um dataframe
# pd.DataFrame(dados, columns=df_.columns)

/bin/sh: conda: command not found


In [109]:
df2 = pd.read_excel('planilha1.xlsx')

In [110]:
df2

Unnamed: 0,color,size,price,label
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [111]:
tam = { 'M': 1, 'L': 2, 'XL': 3 }

In [112]:
# criando um dicionário invertido em Python
tam_inv = { v: k for k, v in tam.items() }
tam_inv

{1: 'M', 2: 'L', 3: 'XL'}

In [113]:
df2['size'] = df2['size'].map(tam)

In [114]:
df2

Unnamed: 0,color,size,price,label
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [115]:
df2['size'].map(tam_inv)

0     M
1     L
2    XL
Name: size, dtype: object

In [116]:
df2['label'].unique()

array(['class1', 'class2'], dtype=object)

In [117]:
for i, o in enumerate(df2['label'].unique()):
    print(i, o)

0 class1
1 class2


In [118]:
labels_num = { v:k for k, v in enumerate(df2['label'].unique()) }
labels_num

{'class1': 0, 'class2': 1}

In [119]:
df2['label'] = df2['label'].map(labels_num)
df2

Unnamed: 0,color,size,price,label
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [120]:
pd.get_dummies(df2['color'], drop_first=True)

Unnamed: 0,green,red
0,1,0
1,0,1
2,0,0


In [125]:
df2 = pd.get_dummies(df2, drop_first=True)

In [126]:
df2

Unnamed: 0,size,price,label,color_green,color_red
0,1,10.1,0,1,0
1,2,13.5,1,0,1
2,3,15.3,0,0,0


## Usando o scikit para transformar features categórica em numérica

In [122]:
from sklearn.preprocessing import LabelEncoder

In [127]:
df3 = pd.read_excel('planilha1.xlsx')
df3

Unnamed: 0,color,size,price,label
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [128]:
le = LabelEncoder()
le.fit_transform(df3['label'])

array([0, 1, 0])

In [131]:
df3['label'] = le.fit_transform(df3['label'])
df3['label']

0    0
1    1
2    0
Name: label, dtype: int64

In [132]:
le.inverse_transform(df3['label'])

array([0, 1, 0])

In [133]:
df3

Unnamed: 0,color,size,price,label
0,green,M,10.1,0
1,red,L,13.5,1
2,blue,XL,15.3,0


## Fazendo one-hot-encoding com scikit

In [92]:
from sklearn.preprocessing import OneHotEncoder

In [134]:
le = LabelEncoder()
df3['color'] = le.fit_transform(df3['color'])
df3

Unnamed: 0,color,size,price,label
0,1,M,10.1,0
1,2,L,13.5,1
2,0,XL,15.3,0


In [137]:
df3['size'] = df3['size'].map(tam)

In [148]:
import sklearn
sklearn.__version__

'0.20.2'

In [149]:
ohe = OneHotEncoder(categorical_features=[0], sparse=False)
ohe.fit_transform(df3)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[ 0. ,  1. ,  0. ,  1. , 10.1,  0. ],
       [ 0. ,  0. ,  1. ,  2. , 13.5,  1. ],
       [ 1. ,  0. ,  0. ,  3. , 15.3,  0. ]])