In [1]:
import pandas as pd

In [2]:
# Load the census extract; the path is relative to the notebook's working directory.
census_csv = '../data/census.csv'
df = pd.read_csv(census_csv)
# Bare last expression so the notebook renders the first five rows.
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

age                int64
workclass         object
final-weight       int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loos       int64
hour-per-week      int64
native-country    object
income            object
dtype: object

## Separando previsores e classe

In [4]:
# Features are every column except the last one; the target is the last column (income).
previsores = df.iloc[:, :-1].values
classe = df.iloc[:, -1].values
# Peek at the first two rows of each array, one array per line.
print(previsores[:2])
print(classe[:2])

[[39 ' State-gov' 77516 ' Bachelors' 13 ' Never-married' ' Adm-clerical'
  ' Not-in-family' ' White' ' Male' 2174 0 40 ' United-States']
 [50 ' Self-emp-not-inc' 83311 ' Bachelors' 13 ' Married-civ-spouse'
  ' Exec-managerial' ' Husband' ' White' ' Male' 0 0 13 ' United-States']]
[' <=50K' ' <=50K']


## Variáveis categóricas

In [5]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [6]:
# Positions (not names) of the object-dtype feature columns.
# The final column is the target, so it is left out of the scan.
feature_dtypes = df.dtypes.iloc[:-1]
indexes = [pos for pos, col_dtype in enumerate(feature_dtypes) if col_dtype == object]
indexes

[1, 3, 5, 6, 7, 8, 9, 13]

In [7]:
# A single LabelEncoder is refitted on each categorical column in turn, so after
# the loop it only remembers the classes of the last column it saw.
labelencoder_previsores = LabelEncoder()

for col_idx in indexes:
    # Replace the string labels of this column, in place, with integer codes.
    encoded_column = labelencoder_previsores.fit_transform(previsores[:, col_idx])
    previsores[:, col_idx] = encoded_column

In [8]:
# Preview the first five rows after label encoding.
previsores[:5]

array([[39, 7, 77516, 9, 13, 4, 1, 1, 4, 1, 2174, 0, 40, 39],
       [50, 6, 83311, 9, 13, 2, 4, 0, 4, 1, 0, 0, 13, 39],
       [38, 4, 215646, 11, 9, 0, 6, 1, 4, 1, 0, 0, 40, 39],
       [53, 4, 234721, 1, 7, 2, 6, 0, 2, 1, 0, 0, 40, 39],
       [28, 4, 338409, 9, 13, 2, 10, 5, 2, 0, 0, 0, 40, 5]], dtype=object)

In [9]:
# Exclude column 3 (`education`) from the columns that will be one-hot encoded.
# Filtering by column value instead of `indexes.pop(1)` keeps the cell idempotent:
# re-running it no longer removes one more column each time.
# NOTE(review): presumably excluded because `education-num` already carries the
# same information in ordinal form; confirm.
EDUCATION_COL = 3
indexes = [col for col in indexes if col != EDUCATION_COL]
indexes

[1, 5, 6, 7, 8, 9, 13]

In [10]:
# `OneHotEncoder(categorical_features=...)` was deprecated in scikit-learn 0.20 and
# removed in 0.22; column selection is now done with ColumnTransformer.
from sklearn.compose import ColumnTransformer

# One-hot encode the columns listed in `indexes`. Every other column is passed
# through unchanged and placed after the encoded block, which is the same layout
# the old `categorical_features` argument produced.
# sparse_threshold=0 forces a dense result, replacing the former `.toarray()`.
onehotencoder = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(), indexes)],
    remainder='passthrough',
    sparse_threshold=0,
)
# The input array has dtype=object, so cast the stacked result to float explicitly.
previsores = onehotencoder.fit_transform(previsores).astype(float)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [11]:
# Preview the first two rows after one-hot encoding.
previsores[:2]

array([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.00

## Escalonamento

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
# Standardise every column of the feature matrix (one-hot indicator columns included).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split further down, so test rows contribute to the fitted statistics. Consider
# fitting on the training portion only.
scaler = StandardScaler()
scaler.fit(previsores)
previsores = scaler.transform(previsores)

In [14]:
# Preview the first row after standardisation.
previsores[:1]

array([[-0.2444502 , -0.17429511, -0.26209736, -0.01466381, -1.5167923 ,
        -0.18838933, -0.29093568,  4.90769968, -0.02073999, -0.39750806,
        -0.02658695, -0.92284068, -0.11403678,  1.43105786, -0.1802846 ,
        -0.17735813, -0.24494366,  2.76348874, -0.01662771, -0.37949517,
        -0.37774555, -0.17745022, -0.20957797, -0.25595432, -0.33554133,
        -0.06780164, -0.38166338, -0.14260848, -0.35531609, -0.17127887,
        -0.22710355, -0.82533335,  1.70899099, -0.17624972, -0.42934582,
        -0.34403232, -0.22492681, -0.09820087, -0.18155194, -0.32576824,
        -0.09161163,  0.4130197 , -0.70307135,  0.70307135, -0.13502327,
        -0.02416321, -0.06107342, -0.0480488 , -0.04260602, -0.05409379,
        -0.04641598, -0.02933708, -0.05714946, -0.05264698, -0.02985682,
        -0.06500204, -0.02985682, -0.04437806, -0.03678503, -0.00554189,
        -0.01998525, -0.02479131, -0.01998525, -0.05550333, -0.03636406,
        -0.02715919, -0.04740236, -0.04993839, -0.0

## Separando base para treinamento

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 15% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
(
    previsores_treinamento,
    previsores_teste,
    classe_treinamento,
    classe_teste,
) = train_test_split(
    previsores,
    classe,
    test_size=.15,
    random_state=0,
)

## kNN

In [17]:
from sklearn.neighbors import KNeighborsClassifier

In [18]:
# k-nearest neighbours with k=5 and the Minkowski metric at p=2 (Euclidean distance).
knn_params = {'n_neighbors': 5, 'metric': 'minkowski', 'p': 2}
classificador = KNeighborsClassifier(**knn_params)

# Fit on the training portion, then predict the held-out rows.
classificador.fit(previsores_treinamento, classe_treinamento)
previsoes = classificador.predict(previsores_teste)

## Métricas

In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [20]:
# Score the predictions against the held-out labels.
acuracia = accuracy_score(classe_teste, previsoes)
matriz_confusao = confusion_matrix(classe_teste, previsoes)

# Accuracy on one line; the confusion matrix under its own heading.
print('Acurácia', acuracia)
print('Confusão')
print(matriz_confusao)

Acurácia 0.8200614124872058
Confusão
[[3349  344]
 [ 535  657]]


## Baseline Classifier

In [21]:
from collections import Counter

In [22]:
# Baseline: always predict the most frequent class.
# The class is chosen from the training labels and then scored on the test
# labels, so the baseline does not peek at the labels it is evaluated on.
classe_majoritaria = Counter(classe_treinamento).most_common(1)[0][0]
# Fraction of test rows whose true label equals the majority class.
Counter(classe_teste)[classe_majoritaria] / len(classe_teste)

0.7559877175025589