In [1]:
import pandas as pd

In [2]:
# Load the census dataset from the project's data directory and preview it.
df = pd.read_csv(filepath_or_buffer='../data/census.csv')
df.head(n=5)

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Pré-processamento

In [3]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [4]:
# Inspect column dtypes; the `object` columns are the string-valued ones that
# get collected as categorical features later in the notebook.
df.dtypes

age                int64
workclass         object
final-weight       int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loos       int64
hour-per-week      int64
native-country    object
income            object
dtype: object

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,age,final-weight,education-num,capital-gain,capital-loos,hour-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [6]:
# Split the frame column-wise: every column but the last is a predictor,
# the last column (`income`) is the target class.
previsores = df.iloc[:, :-1].values
classe = df.iloc[:, -1].values
print(previsores[:2], classe[:2], sep='\n')

[[39 ' State-gov' 77516 ' Bachelors' 13 ' Never-married' ' Adm-clerical'
  ' Not-in-family' ' White' ' Male' 2174 0 40 ' United-States']
 [50 ' Self-emp-not-inc' 83311 ' Bachelors' 13 ' Married-civ-spouse'
  ' Exec-managerial' ' Husband' ' White' ' Male' 0 0 13 ' United-States']]
[' <=50K' ' <=50K']


In [7]:
# One encoder instance, re-fitted for each column that is encoded afterwards.
labelencoder = LabelEncoder()

# Positional indices of every string-typed (`object`) column of the frame.
categ = [idx for idx, dtype in enumerate(df.dtypes) if dtype == object]
# The last `object` column is the target (`income`), not a predictor: drop it.
del categ[-1]
categ

[1, 3, 5, 6, 7, 8, 9, 13]

In [8]:
# Encode the target labels as integers.
classe = labelencoder.fit_transform(classe)

# Replace every categorical predictor column, in place, with its integer codes.
# The same encoder object is re-fitted on each column, so afterwards it only
# remembers the classes of the last column processed.
for col in categ:
    codes = labelencoder.fit_transform(previsores[:, col])
    previsores[:, col] = codes

print(previsores[:2])

[[39 7 77516 9 13 4 1 1 4 1 2174 0 40 39]
 [50 6 83311 9 13 2 4 0 4 1 0 0 13 39]]


In [9]:
# Exclude `education` from the columns that will be one-hot encoded.
# NOTE(review): presumably because `education-num` already carries the same
# information as an ordinal value -- confirm.
#
# The column is removed by name rather than with `categ.pop(1)`: popping by
# position is not idempotent (re-running the cell would silently drop another
# column), whereas this filter gives the same list however often it is run.
education_idx = df.columns.get_loc('education')
categ = [idx for idx in categ if idx != education_idx]
categ

[1, 5, 6, 7, 8, 9, 13]

In [10]:
# One-hot encode the categorical columns listed in `categ`.
#
# `OneHotEncoder(categorical_features=...)` was deprecated in scikit-learn 0.20
# and removed in 0.22; selecting columns is now done with ColumnTransformer.
# The resulting layout matches the legacy behaviour: the one-hot columns come
# first, followed by the untouched remaining columns in their original order.
from sklearn.compose import ColumnTransformer

onehot = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(), categ)],
    remainder='passthrough',  # keep the non-categorical columns as they are
    sparse_threshold=0,       # always return a dense array (replaces .toarray())
)
# `previsores` is an object-dtype array at this point; cast the result so that
# downstream steps receive a plain float matrix, as the old `.toarray()` gave.
previsores = onehot.fit_transform(previsores).astype(float)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [11]:
# Standardise every column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed further down, so test rows influence the scaling statistics.
scaler = StandardScaler()
scaler.fit(previsores)
previsores = scaler.transform(previsores)

In [12]:
# Peek at the first two rows after one-hot encoding and standardisation.
previsores[:2]

array([[-0.2444502 , -0.17429511, -0.26209736, -0.01466381, -1.5167923 ,
        -0.18838933, -0.29093568,  4.90769968, -0.02073999, -0.39750806,
        -0.02658695, -0.92284068, -0.11403678,  1.43105786, -0.1802846 ,
        -0.17735813, -0.24494366,  2.76348874, -0.01662771, -0.37949517,
        -0.37774555, -0.17745022, -0.20957797, -0.25595432, -0.33554133,
        -0.06780164, -0.38166338, -0.14260848, -0.35531609, -0.17127887,
        -0.22710355, -0.82533335,  1.70899099, -0.17624972, -0.42934582,
        -0.34403232, -0.22492681, -0.09820087, -0.18155194, -0.32576824,
        -0.09161163,  0.4130197 , -0.70307135,  0.70307135, -0.13502327,
        -0.02416321, -0.06107342, -0.0480488 , -0.04260602, -0.05409379,
        -0.04641598, -0.02933708, -0.05714946, -0.05264698, -0.02985682,
        -0.06500204, -0.02985682, -0.04437806, -0.03678503, -0.00554189,
        -0.01998525, -0.02479131, -0.01998525, -0.05550333, -0.03636406,
        -0.02715919, -0.04740236, -0.04993839, -0.0

## Separando base para treinamento

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 15% of the rows for testing; the fixed seed keeps the split reproducible.
(previsores_treinamento, previsores_teste,
 classe_treinamento, classe_teste) = train_test_split(
    previsores,
    classe,
    test_size=0.15,
    random_state=0,
)

## Regressão Logística

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
# Fit a logistic-regression classifier with default settings on the training
# portion, then predict the classes of the held-out test rows.
classificador = LogisticRegression()
classificador.fit(previsores_treinamento, classe_treinamento)  # fits in place
previsoes = classificador.predict(previsores_teste)



## Métricas

In [17]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [18]:
# Evaluate the predictions on the test set: overall accuracy, then the
# confusion matrix (scikit-learn puts true classes on rows, predictions on columns).
acuracia = accuracy_score(classe_teste, previsoes)
matriz_confusao = confusion_matrix(classe_teste, previsoes)

print('Previsão', acuracia)
print('Confusão', matriz_confusao, sep='\n')

Previsão 0.8479017400204708
Confusão
[[3413  280]
 [ 463  729]]
