In [1]:
import pandas as pd

In [2]:
# Read the census CSV (path is relative to the notebook's working directory)
# and preview the first rows.
# NOTE(review): the raw arrays printed later in this notebook show string values
# with a leading space (' State-gov', ' <=50K'); keep that in mind before comparing
# against literals — `skipinitialspace=True` may be worth considering.
census_csv = '../data/census.csv'
df = pd.read_csv(census_csv)
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Inspect the dtype of every column; the `object` columns are the ones selected
# for encoding further down.
df.dtypes

age                int64
workclass         object
final-weight       int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loos       int64
hour-per-week      int64
native-country    object
income            object
dtype: object

## Separando previsores e classe

In [4]:
# Features (previsores): every column except the last one.
# Target (classe): the last column only.
# `.values` converts each selection to a NumPy array.
previsores = df.iloc[:, :-1].values
classe = df.iloc[:, -1].values

# Sanity check: first two feature rows, then the first two labels on a new line.
print(previsores[:2], classe[:2], sep='\n')

[[39 ' State-gov' 77516 ' Bachelors' 13 ' Never-married' ' Adm-clerical'
  ' Not-in-family' ' White' ' Male' 2174 0 40 ' United-States']
 [50 ' Self-emp-not-inc' 83311 ' Bachelors' 13 ' Married-civ-spouse'
  ' Exec-managerial' ' Husband' ' White' ' Male' 0 0 13 ' United-States']]
[' <=50K' ' <=50K']


## Variáveis categóricas

In [5]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [6]:
# Positions of the feature columns (target column excluded) whose dtype is
# `object`; these are the columns that get label-encoded.
feature_dtypes = df.dtypes[:-1]
indexes = [pos for pos, col_dtype in enumerate(feature_dtypes) if col_dtype == object]
indexes

[1, 3, 5, 6, 7, 8, 9, 13]

In [7]:
# Label-encode every categorical feature column in place.
#
# A separate LabelEncoder is fitted per column and stored in
# `labelencoders_previsores` (keyed by column position). Re-fitting a single
# shared encoder would keep only the last column's `classes_`, making it
# impossible to `inverse_transform` the other columns or to encode new rows
# consistently.
labelencoders_previsores = {}

# Kept for backward compatibility: as before, after the loop this name refers
# to the encoder fitted on the last column in `indexes` (and it stays bound
# even when `indexes` is empty).
labelencoder_previsores = LabelEncoder()

for i in indexes:
    labelencoder_previsores = LabelEncoder()
    previsores[:, i] = labelencoder_previsores.fit_transform(previsores[:, i])
    labelencoders_previsores[i] = labelencoder_previsores

In [8]:
# Check the result of the label encoding: the categorical positions should now
# hold integer codes instead of strings.
previsores[:5]

array([[39, 7, 77516, 9, 13, 4, 1, 1, 4, 1, 2174, 0, 40, 39],
       [50, 6, 83311, 9, 13, 2, 4, 0, 4, 1, 0, 0, 13, 39],
       [38, 4, 215646, 11, 9, 0, 6, 1, 4, 1, 0, 0, 40, 39],
       [53, 4, 234721, 1, 7, 2, 6, 0, 2, 1, 0, 0, 40, 39],
       [28, 4, 338409, 9, 13, 2, 10, 5, 2, 0, 0, 0, 40, 5]], dtype=object)

In [9]:
# Remove 'education' from the list of columns intended for one-hot encoding.
# NOTE(review): presumably because 'education-num' already carries the same
# information as an ordinal value — confirm.
#
# Filtering by the column's position (instead of `indexes.pop(1)`) makes this
# cell idempotent: re-running it no longer drops one more column each time, and
# the column being removed is named explicitly rather than implied by a
# positional magic number. Note that `indexes` is rebound to a new list.
education_idx = df.columns.get_loc('education')
indexes = [i for i in indexes if i != education_idx]
indexes

[1, 5, 6, 7, 8, 9, 13]

In [10]:
# Optional experiment (disabled): one-hot encode the columns listed in `indexes`.
# NOTE(review): `categorical_features` was deprecated in scikit-learn 0.20 and
# removed in 0.22, so these lines fail on current versions if re-enabled as-is;
# the documented replacement is to wrap OneHotEncoder in
# `sklearn.compose.ColumnTransformer`.
# onehotencoder = OneHotEncoder(categorical_features=indexes)
# previsores = onehotencoder.fit_transform(previsores).toarray()

In [11]:
# Preview after the (currently disabled) one-hot step; with that step commented
# out the rows are still the label-encoded values.
previsores[:2]

array([[39, 7, 77516, 9, 13, 4, 1, 1, 4, 1, 2174, 0, 40, 39],
       [50, 6, 83311, 9, 13, 2, 4, 0, 4, 1, 0, 0, 13, 39]], dtype=object)

## Escalonamento

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
# Optional experiment (disabled): standardise all feature columns.
# NOTE(review): as written, the scaler would be fitted on the full dataset before
# the train/test split performed later, so test-set statistics would leak into
# the scaling — if re-enabled, consider fitting on the training partition only.
# scaler = StandardScaler()
# previsores = scaler.fit_transform(previsores)

In [14]:
# Preview after the (currently disabled) scaling step; with that step commented
# out the values are unchanged.
previsores[:1]

array([[39, 7, 77516, 9, 13, 4, 1, 1, 4, 1, 2174, 0, 40, 39]],
      dtype=object)

## Separando a base em treinamento e teste

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 15% of the rows for testing; the fixed seed keeps the split reproducible.
(
    previsores_treinamento,
    previsores_teste,
    classe_treinamento,
    classe_teste,
) = train_test_split(
    previsores,
    classe,
    test_size=0.15,
    random_state=0,
)

## Random Forest (floresta aleatória)

In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
# Random forest with 30 trees using the entropy split criterion; the fixed seed
# makes training reproducible. `fit` returns the estimator itself, so
# construction and training are chained into one expression.
classificador = RandomForestClassifier(
    n_estimators=30,
    criterion='entropy',
    random_state=0,
).fit(previsores_treinamento, classe_treinamento)

# Predicted class for every row of the held-out test partition.
previsoes = classificador.predict(previsores_teste)

## Métricas

In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [20]:
# Accuracy on the held-out set, and the confusion matrix
# (rows: true class, columns: predicted class).
acuracia = accuracy_score(classe_teste, previsoes)
matriz_confusao = confusion_matrix(classe_teste, previsoes)

print('Previsão', acuracia)
print('Confusão', matriz_confusao, sep='\n')

Previsão 0.8491299897645854
Confusão
[[3432  261]
 [ 476  716]]


Acurácia por número de árvores (`n_estimators`): <br>
0.8413 - 10 árvores <br>
0.8429 - 15 árvores <br>
0.8485 - 30 árvores <br>
0.8472 - 40 árvores <br>

Acurácia por tipo de pré-processamento: <br>
0.8491 - labelencoder <br>
0.8483 - labelencoder + onehot <br>
0.8487 - labelencoder + escalonamento <br>