# Lendo Valores

In [2]:
import pandas
import numpy as np
import time

full_data = pandas.read_csv('census.csv')
""" Remove linhas com NaN """
data = full_data.dropna(axis=0).values
print (data.shape)
columns = list(full_data.columns.values)

(5532, 14)


In [3]:
np.unique(full_data.values[:, 5])

array([' Adm-clerical', ' Armed-Forces', ' Craft-repair',
       ' Exec-managerial', ' Farming-fishing', ' Handlers-cleaners',
       ' Machine-op-inspct', ' Other-service', ' Priv-house-serv',
       ' Prof-specialty', ' Protective-serv', ' Sales', ' Tech-support',
       ' Transport-moving'], dtype=object)

In [4]:
full_data

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K
5,37,Private,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,United-States,<=50K
6,49,Private,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0.0,0.0,16.0,Jamaica,<=50K
7,52,Self-emp-not-inc,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,45.0,United-States,>50K
8,31,Private,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,14084.0,0.0,50.0,United-States,>50K
9,42,Private,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178.0,0.0,40.0,United-States,>50K


# Codificando todas as colunas string

In [5]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
encoded_data = data
column_index = 0
for i in data[0,:]:
    if (type(i) == str):
        encoded_data[:,column_index] = le.fit_transform(encoded_data[:, column_index])
    column_index +=1

# Separando Dados Dos Rótulos de Classificação

In [6]:
X = encoded_data[:, :13].astype('float64')
y = encoded_data[:, 13].astype('float64')

# Particionando dados para treinamento e testes

In [7]:
from sklearn.model_selection import train_test_split
""" stratify garante mesma proporção de classes de teste e treinamento"""
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=0.2,
                                                   random_state=0,
                                                   stratify=y)

# Standardized scaling de todos os Dados

In [8]:
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

In [9]:
X_train_std.dtype

dtype('float64')

# SVM usando std

In [15]:
from sklearn.svm import SVC


svm = SVC(kernel='rbf', random_state=1, gamma=0.019, C=10.0)
svm.fit(X_train_std, y_train)

y_pred = svm.predict(X_test_std)
from sklearn.metrics import accuracy_score
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
from sklearn.metrics import log_loss
print('Log Loss: %.2f' % log_loss(y_test, y_pred))


Accuracy: 0.84
Log Loss: 5.40


# Descision Tree usando std

In [16]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='gini',
                              max_depth=6,
                              random_state=1)
tree.fit(X_train, y_train)

y_pred = tree.predict(X_test_std)
from sklearn.metrics import accuracy_score
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
from sklearn.metrics import log_loss
print('Log Loss: %.2f' % log_loss(y_test, y_pred))

Accuracy: 0.75
Log Loss: 8.70


# MLP Usando std

In [17]:
from sklearn.neural_network import MLPClassifier

start_time = time.time()

clf = MLPClassifier(solver='adam', alpha=0.9, random_state=1,
                verbose=False, max_iter=1000, shuffle= False)
clf.fit(X_train_std, y_train)

print("--- %s Segundos ---" % (time.time() - start_time))

y_pred = clf.predict(X_test_std)
from sklearn.metrics import accuracy_score
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
from sklearn.metrics import log_loss
print('Log Loss: %.2f' % log_loss(y_test, y_pred))

--- 0.5670127868652344 Segundos ---
Accuracy: 0.84
Log Loss: 5.58


# MLP usando pipeline com PCA e grid search

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
pipe_svc = make_pipeline(StandardScaler(),
                         PCA(n_components=10),
                         MLPClassifier(solver='adam', random_state=1))
param_grid = [
              {'mlpclassifier__alpha': [0.1, 0.5, 1],
               'mlpclassifier__momentum': [0.1, 0.5, 0.9],
               'mlpclassifier__max_iter': [500, 1000]}]
gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=4)
start_time = time.time()
gs = gs.fit(X_train, y_train)
print("--- %s Segundos ---" % (time.time() - start_time))
print(gs.best_score_)
print(gs.best_params_)

y_pred = gs.predict(X_test)
from sklearn.metrics import accuracy_score
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
from sklearn.metrics import log_loss
print('Log Loss: %.2f' % log_loss(y_test, y_pred))

--- 106.58850073814392 Segundos ---
0.826666666667
{'mlpclassifier__alpha': 1, 'mlpclassifier__max_iter': 500, 'mlpclassifier__momentum': 0.1}
Accuracy: 0.83
Log Loss: 5.80


# SVM usnado pipeline e grid search

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
pipe_svc = make_pipeline(StandardScaler(),
                        SVC(random_state=1))
param_range = [0.0001, 0.001, 0.01, 0.1,
               1.0]
param_grid = [{'svc__C': param_range,
               'svc__kernel': ['linear']},
              {'svc__C': [10],
               'svc__gamma': param_range,
               'svc__kernel': ['rbf']}]
gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=4)
start_time = time.time()
gs = gs.fit(X_train, y_train)
print("--- %s Segundos ---" % (time.time() - start_time))
print(gs.best_score_)
print(gs.best_params_)

y_pred = gs.predict(X_test)
from sklearn.metrics import accuracy_score
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
from sklearn.metrics import log_loss
print('Log Loss: %.2f' % log_loss(y_test, y_pred))


--- 24.028803825378418 Segundos ---
0.825536723164
{'svc__C': 10, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}
Accuracy: 0.83
Log Loss: 5.77


# Outro SVM com pipeline e Grid Search

In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
pipe_svc = make_pipeline(StandardScaler(),
                        SVC(random_state=1))
param_range = [
               0.018, 0.019, 0.02, 0.021, 0.022, 0.023, 0.024, 0.025
               ]
c_param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
param_grid = [
              {'svc__C': c_param_range,
               'svc__gamma': param_range,
               'svc__kernel': ['rbf']}]
gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                
                  n_jobs=4)
start_time = time.time()
gs = gs.fit(X_train, y_train)
print("--- %s Segundos ---" % (time.time() - start_time))
print(gs.best_score_)
print(gs.best_params_)

y_pred = gs.predict(X_test)
from sklearn.metrics import accuracy_score
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
from sklearn.metrics import log_loss
print('Log Loss: %.2f' % log_loss(y_test, y_pred))

--- 60.23681640625 Segundos ---
0.828248587571
{'svc__C': 14, 'svc__gamma': 0.02, 'svc__kernel': 'rbf'}
Accuracy: 0.84
Log Loss: 5.40


# Comparação de Vários Classificadores ao mesmo Tempo

## Baseado em http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#sphx-glr-auto-examples-classification-plot-classifier-comparison-py

In [21]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", 
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]
"""   "Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0)),"""
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

# iterate over classifiers
for name, clf in zip(names, classifiers):
    start_time = time.time()
    clf.fit(X_train_std, y_train)
    print(name)
    print("   ", "--- %s Segundos ---" % (time.time() - start_time))
    y_pred = clf.predict(X_test_std)
    from sklearn.metrics import accuracy_score
    print("   ", 'Accuracy: %.2f' % accuracy_score(y_test, y_pred))
    from sklearn.metrics import log_loss
    print("   ", 'Log Loss: %.2f' % log_loss(y_test, y_pred))

Nearest Neighbors
    --- 0.018294095993041992 Segundos ---
    Accuracy: 0.79
    Log Loss: 7.18
Linear SVM
    --- 0.19736409187316895 Segundos ---
    Accuracy: 0.82
    Log Loss: 6.12
RBF SVM
    --- 0.7928297519683838 Segundos ---
    Accuracy: 0.77
    Log Loss: 8.11
Decision Tree
    --- 0.0039556026458740234 Segundos ---
    Accuracy: 0.83
    Log Loss: 5.71
Random Forest
    --- 0.012891769409179688 Segundos ---
    Accuracy: 0.82
    Log Loss: 6.12
Neural Net
    --- 0.651043176651001 Segundos ---
    Accuracy: 0.84
    Log Loss: 5.55
AdaBoost
    --- 0.11301350593566895 Segundos ---
    Accuracy: 0.84
    Log Loss: 5.58
Naive Bayes
    --- 0.0013251304626464844 Segundos ---
    Accuracy: 0.82
    Log Loss: 6.15
QDA
    --- 0.0019719600677490234 Segundos ---
    Accuracy: 0.82
    Log Loss: 6.12


# Tentativa com AdaBoost

In [41]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
pipe = make_pipeline(StandardScaler(),
                        AdaBoostClassifier(random_state=1))
param_grid = [
              {
                  'adaboostclassifier__n_estimators': [50, 100, 200, 300], 
                  'adaboostclassifier__learning_rate': [0.1, 0.5, 1, 1.5, 1.7, 1.8, 1.9, 2],
                  'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R']
                    
              }]
gs = GridSearchCV(estimator=pipe,
                  param_grid=param_grid,
                  scoring='accuracy',
                
                  n_jobs=4)
start_time = time.time()
gs = gs.fit(X_train, y_train)
print("--- %s Segundos ---" % (time.time() - start_time))
print(gs.best_score_)
print(gs.best_params_)

y_pred = gs.predict(X_test)
from sklearn.metrics import accuracy_score
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
from sklearn.metrics import log_loss
print('Log Loss: %.2f' % log_loss(y_test, y_pred))

--- 28.3014178276062 Segundos ---
0.860790960452
{'adaboostclassifier__algorithm': 'SAMME.R', 'adaboostclassifier__learning_rate': 1.7, 'adaboostclassifier__n_estimators': 300}
Accuracy: 0.85
Log Loss: 5.18
