# Importando Bibliotecas

In [63]:
import pickle
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold

# Avaliação dos Algoritimos
- Naïve Bayes: 93.80
- Árvore de Decisão: 98.20
- Regras: 97.40
- Regressão Logística: 94.60
- SVM: 98.80
- Redes Neurais: 99.60

## Turning dos Parâmetros com GridSearch

### Preparação dos Dados

In [25]:
with open('credit.pkl', 'rb') as f:
    x_credit_treinamento, y_credit_treinamento, x_credit_teste, y_credit_teste = pickle.load(f)

In [26]:
x_credit_treinamento.shape, y_credit_treinamento.shape, x_credit_teste.shape, y_credit_teste.shape

((1500, 3), (1500,), (500, 3), (500,))

In [27]:
x_credit = np.concatenate((x_credit_treinamento, x_credit_teste), axis = 0)

In [28]:
x_credit.shape

(2000, 3)

In [29]:
y_credit = np.concatenate((y_credit_treinamento,y_credit_teste), axis = 0)

In [30]:
y_credit

array([0, 0, 0, ..., 0, 1, 1], dtype=int64)

### Árvore de Decisão

In [31]:
DecisionTreeClassifier()

In [32]:
parametros = {'criterion' : ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 5, 10] }

In [33]:
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=parametros)
grid_search.fit(x_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhores_resultados = grid_search.best_score_
print(melhores_parametros)
print(melhores_resultados)

{'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 5, 'splitter': 'best'}
0.983


### Random Forest 

In [34]:
parametros = {'criterion' : ['gini', 'entropy'],
              'n_estimators': [10, 40, 100, 500],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 5, 10] }

In [35]:
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=parametros)
grid_search.fit(x_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhores_resultados = grid_search.best_score_
print(melhores_parametros)
print(melhores_resultados)

{'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 40}
0.9870000000000001


### Knn

In [36]:
parametros = {'n_neighbors' : [3, 5, 10, 20],
              'p': [1,2]}

In [37]:
grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=parametros)
grid_search.fit(x_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhores_resultados = grid_search.best_score_
print(melhores_parametros)
print(melhores_resultados)

{'n_neighbors': 20, 'p': 1}
0.9800000000000001


### Regressão Logística

In [40]:
parametros = {'tol' : [0.0001, 0.00001, 0.000001], 
              'C' : [1.0, 1.5, 2.0],
              'solver' : ['lbfgs', 'sag', 'saga']}

In [41]:
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=parametros)
grid_search.fit(x_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhores_resultados = grid_search.best_score_
print(melhores_parametros)
print(melhores_resultados)

{'C': 1.0, 'solver': 'lbfgs', 'tol': 0.0001}
0.9484999999999999


### SVM

In [42]:
parametros = {'tol' : [0.001, 0.0001, 0.00001],
              'C' : [1.0, 1.5, 2.0],
              'kernel' : ['rbf', 'linear', 'poly', 'sigmoid']}

In [43]:
grid_search = GridSearchCV(estimator=SVC(), param_grid=parametros)
grid_search.fit(x_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhores_resultados = grid_search.best_score_
print(melhores_parametros)
print(melhores_resultados)

{'C': 1.5, 'kernel': 'rbf', 'tol': 0.001}
0.9829999999999999


### Redes Neurais

In [44]:
parametros = {'activation' :  ['relu', 'logistic', 'tahn'],
              'solver' : ['adam', 'sgd'],
              'batch_size' : [10, 56]}

In [45]:
grid_search = GridSearchCV(estimator=MLPClassifier(), param_grid=parametros)
grid_search.fit(x_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhores_resultados = grid_search.best_score_
print(melhores_parametros)
print(melhores_resultados)

20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lucas\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lucas\anaconda3\lib\site-packages\sklearn\base.py", line 1344, in wrapper
    estimator._validate_params()
  File "c:\Users\lucas\anaconda3\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\lucas\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameter

{'activation': 'relu', 'batch_size': 10, 'solver': 'adam'}
0.9970000000000001


### Validação Cruzada

from sklearn.model_selection import cross_val_predict, KFold

In [62]:
resultados_knn = []
resultados_svm = []
resultados_arvore = []
resultados_logistica = []
resultados_rede_neural = []
resultados_random_forest = []

for i in range(30): # é utilizado 30 teste geralmente
    kfold = KFold(n_splits=10, shuffle=True, random_state=i)
    
    arvore = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=5, splitter='best')
    scores = cross_val_score(arvore, x_credit, y_credit, cv = kfold)
    #print(scores) 
    #print(scores.mean())
    resultados_arvore.append(scores.mean())
    
    random_forest = RandomForestClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=5, n_estimators=10)
    score = cross_val_score(random_forest, x_credit, y_credit, cv=kfold)
    resultados_random_forest.append(score.mean())
    
    knn = KNeighborsClassifier()
    scores = cross_val_score(knn, x_credit, y_credit, cv = kfold)
    resultados_knn.append(scores.mean())
    
    logistica = LogisticRegression(C=1.0, solver='lbfgs', tol=0.0001)
    scores = cross_val_score(logistica, x_credit, y_credit, cv=kfold)
    resultados_logistica.append(scores.mean())
    
    svm = SVC(kernel='rbf', C=2.0)
    scores = cross_val_score(svm, x_credit, y_credit, cv=kfold)
    resultados_svm.append(scores.mean())
    
    rede_neural = MLPClassifier(activation='relu', batch_size=56, solver='adam')
    scores = cross_val_score(rede_neural, x_credit, y_credit, cv=kfold)
    resultados_rede_neural.append(scores.mean())
    



In [59]:
resultados_knn, resultados_svm, resultados_arvore, resultados_logistica, resultados_rede_neural, resultados_random_forest 

[0.9845,
 0.9804999999999999,
 0.9864999999999998,
 0.9834999999999999,
 0.984,
 0.983,
 0.9844999999999999,
 0.9879999999999999,
 0.9814999999999999,
 0.9845,
 0.9834999999999999,
 0.9814999999999999,
 0.985,
 0.983,
 0.985,
 0.9799999999999999,
 0.9810000000000001,
 0.986,
 0.9824999999999999,
 0.985,
 0.984,
 0.9825000000000002,
 0.9814999999999999,
 0.9889999999999999,
 0.9830000000000002,
 0.984,
 0.983,
 0.9810000000000001,
 0.985,
 0.9824999999999999]

In [67]:
resultados = pd.DataFrame({'Arvore': resultados_arvore,
                          'Random Forest' : resultados_random_forest,
                          'KNN' : resultados_knn,
                          'Logistica' : resultados_logistica,
                          'SVM' : resultados_svm,
                          'Rede Neural' : resultados_rede_neural})
resultados

Unnamed: 0,Arvore,Random Forest,KNN,Logistica,SVM,Rede Neural
0,0.986,0.984,0.9815,0.9475,0.9845,0.9965
1,0.985,0.983,0.98,0.9465,0.984,0.9975
2,0.9905,0.983,0.9795,0.947,0.9865,0.997
3,0.9875,0.9805,0.978,0.946,0.985,0.9965
4,0.9885,0.9825,0.982,0.9465,0.985,0.997
5,0.989,0.984,0.978,0.9465,0.9845,0.997
6,0.9885,0.9875,0.9805,0.947,0.986,0.9955
7,0.9875,0.982,0.98,0.948,0.985,0.9975
8,0.9855,0.985,0.9795,0.9465,0.984,0.996
9,0.9875,0.984,0.982,0.9465,0.9845,0.997


In [68]:
resultados.describe()

Unnamed: 0,Arvore,Random Forest,KNN,Logistica,SVM,Rede Neural
count,30.0,30.0,30.0,30.0,30.0,30.0
mean,0.987283,0.983883,0.98005,0.947,0.985083,0.997033
std,0.001612,0.00174,0.001533,0.000743,0.00128,0.000524
min,0.984,0.9805,0.977,0.9455,0.982,0.9955
25%,0.986,0.9825,0.979,0.9465,0.984125,0.996625
50%,0.9875,0.984,0.98,0.947,0.985,0.997
75%,0.9885,0.985375,0.981,0.9475,0.986375,0.9975
max,0.9905,0.9875,0.9825,0.9485,0.9875,0.9975


In [71]:
(resultados.std() / resultados.mean()) * 100

Arvore           0.163259
Random Forest    0.176892
KNN              0.156446
Logistica        0.078435
SVM              0.129977
Rede Neural      0.052569
dtype: float64