# <font color='blue'>Data Science Academy - Machine Learning</font>

# <font color='blue'>Capítulo 6 - Otimização dos Parâmetros com Randomized Search</font>

## Extremely Randomized Forest

In [1]:
# Abrir um prompt ou terminal e executar o comando abaixo (responder yes quando solicitado). Reiniciar o Kernel do Jupyter Notebook
# conda update scikit-learn

In [2]:
import sklearn as sl
sl.__version__

'0.19.1'

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Carrega o dataset
data = pd.read_excel('credit.xls', skiprows = 1)

# Variável target
target = 'default payment next month'
y = np.asarray(data[target])

# Variáveis preditoras
features = data.columns.drop(['ID', target])
X = np.asarray(data[features])

# Dataset de treino e de teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 99)

# Classificador
clf = ExtraTreesClassifier(n_estimators = 500, random_state = 99)

# Modelo
clf.fit(X_train, y_train)

# Score
scores = cross_val_score(clf, X_train, y_train, cv = 3, scoring = 'accuracy', n_jobs = -1)

# Imprimindo o resultado
print ("ExtraTreesClassifier -> Acurácia: Média = %0.3f Desvio Padrão = %0.3f" % (np.mean(scores), np.std(scores)))

# Fazendo previsões
y_pred = clf.predict(X_test)

# Confusion Matrix
confusionMatrix = confusion_matrix(y_test, y_pred)
print (confusionMatrix)

# Acurácia
accuracy_score(y_test, y_pred)


ExtraTreesClassifier -> Acurácia: Média = 0.812 Desvio Padrão = 0.002
[[6532  446]
 [1273  749]]


0.809

## Otimização dos Parâmetros com Randomized Search

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

O Randomized Search gera amostras dos parâmetros dos algoritmos a partir de uma distribuição randômica uniforme para um número fixo de interações. Um modelo é construído e testado para cada combinação de parâmetros. 

In [4]:
# Import
from sklearn.model_selection import RandomizedSearchCV

In [5]:
# Definição dos parâmetros
param_dist = {"max_depth": [1, 3, 7, 8, 12, None],
              "max_features": [8, 9, 10, 11, 16, 22],
              "min_samples_split": [8, 10, 11, 14, 16, 19],
              "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7],
              "bootstrap": [True, False]}

# Para o classificador criado na célula anterior, testamos diferentes combinações de parâmetros
rsearch = RandomizedSearchCV(clf, param_distributions = param_dist, n_iter = 25,return_train_score=True)  

# Aplicando o resultado ao conjunto de dados de treino e obtendo o score
rsearch.fit(X_train,y_train)

### ALTERACAO PARA cv_results_
rsearch.cv_results_

# Imprimindo o melhor estimador
bestclf = rsearch.best_estimator_
print (bestclf)

# Aplicando o melhor estimador para realizar as previsões
y_pred = bestclf.predict(X_test)

# Confusion Matrix
confusionMatrix = confusion_matrix(y_test, y_pred)
print(confusionMatrix)

# Acurácia
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
           max_depth=7, max_features=16, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=5, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=99, verbose=0, warm_start=False)
[[6648  330]
 [1282  740]]
0.8208888888888889


In [6]:
# Obtendo o grid com todas as combinações de parâmetros
rsearch.cv_results_

{'mean_fit_time': array([3.29193751, 2.85922114, 3.3010966 , 2.47661034, 3.2055877 ,
        5.36168869, 3.06354372, 1.59435805, 9.18926835, 7.60961016,
        4.37964145, 2.38128471, 6.71382157, 8.9254969 , 7.27037541,
        1.37884164, 1.36238233, 1.50590301, 7.16070239, 4.52097273,
        3.70752239, 1.40684374, 3.43521261, 2.40370456, 1.07045563]),
 'std_fit_time': array([0.06247686, 0.03698078, 0.03636771, 0.01724194, 0.03250518,
        0.03238586, 0.03240895, 0.02628069, 0.37217858, 0.36370598,
        0.1034462 , 0.04075574, 0.415562  , 0.77506886, 0.11254875,
        0.00327235, 0.0096132 , 0.05576319, 0.09591034, 0.0965629 ,
        0.09589124, 0.024081  , 0.04928218, 0.06192724, 0.01853477]),
 'mean_score_time': array([0.3298227 , 0.24607197, 0.22944768, 0.22220858, 0.24171853,
        0.56361159, 0.22161396, 0.15408762, 0.34588925, 0.58042844,
        0.33878636, 0.21993033, 0.36393404, 0.48102037, 0.54222409,
        0.15266681, 0.15344659, 0.15151008, 0.51952426, 0.24

## Grid Search x Randomized Search para Estimação dos Hiperparâmetros

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

O Grid Search realiza metodicamente combinações entre todos os parâmetros do algoritmo, criando um grid. 

In [7]:
import numpy as np
from time import time
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits

# Obtém o dataset
digits = load_digits()
X, y = digits.data, digits.target

# Construindo o classificador
clf = RandomForestClassifier(n_estimators = 20)

In [8]:
# Randomized Search

# Valores dos parâmetros que serão testados
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Executando o Randomized Search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, 
                                   param_distributions = param_dist, 
                                   n_iter = n_iter_search,
                                   return_train_score=True)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV executou em %.2f segundos para %d candidatos a parâmetros do modelo." 
      % ((time() - start), n_iter_search))

# Imprime as combinações dos parâmetros e susas respectivas médias de acurácia
random_search.cv_results_

RandomizedSearchCV executou em 2.41 segundos para 20 candidatos a parâmetros do modelo.


{'mean_fit_time': array([0.025997  , 0.01895261, 0.04454859, 0.03694304, 0.02779222,
        0.04843569, 0.02788146, 0.02397148, 0.04269894, 0.02700305,
        0.02497411, 0.02621333, 0.05310337, 0.03996555, 0.01983738,
        0.02846948, 0.02138734, 0.02985454, 0.03924926, 0.03750134]),
 'std_fit_time': array([1.85677947e-03, 2.09600534e-05, 3.80369976e-03, 1.10094156e-03,
        4.87736627e-04, 5.67221634e-04, 1.33355995e-03, 1.53452700e-03,
        4.53628145e-04, 3.87492077e-04, 6.64855448e-04, 2.12620216e-04,
        9.07902297e-04, 5.77102479e-04, 6.23293231e-05, 2.46241523e-04,
        6.79514784e-04, 6.39299966e-04, 4.74018841e-04, 2.61478776e-04]),
 'mean_score_time': array([0.00212097, 0.0021441 , 0.00305549, 0.00272965, 0.00273379,
        0.00263031, 0.00293668, 0.0023191 , 0.00292031, 0.0023044 ,
        0.00233976, 0.00214895, 0.00267529, 0.00294749, 0.00199493,
        0.00222349, 0.00210849, 0.00270677, 0.0029285 , 0.00271932]),
 'std_score_time': array([6.99362068e-

In [9]:
# Grid Search

# Usando um grid completo de todos os parâmetros
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Executando o Grid Search
grid_search = GridSearchCV(clf, param_grid = param_grid,return_train_score=True)
start = time()
grid_search.fit(X, y)

print("GridSearchCV executou em %.2f segundos para todas as combinações de candidatos a parâmetros do modelo."
      % (time() - start))
grid_search.cv_results_

GridSearchCV executou em 8.12 segundos para todas as combinações de candidatos a parâmetros do modelo.


{'mean_fit_time': array([0.02213097, 0.02012539, 0.01995786, 0.02182643, 0.02129404,
        0.02104203, 0.02608569, 0.02680564, 0.02644976, 0.02633532,
        0.02449195, 0.02380371, 0.03284423, 0.02901332, 0.02707124,
        0.04441253, 0.04130101, 0.03762666, 0.02146371, 0.02194699,
        0.01996875, 0.02333681, 0.02206421, 0.02241365, 0.02925666,
        0.02901014, 0.02981242, 0.03129538, 0.02481103, 0.02272852,
        0.0363876 , 0.0318466 , 0.02889625, 0.05388204, 0.04913823,
        0.04254039, 0.01939599, 0.01916567, 0.0191075 , 0.0217607 ,
        0.02163339, 0.02203409, 0.02977689, 0.03015606, 0.03008763,
        0.02986264, 0.02491315, 0.02156274, 0.03587437, 0.033132  ,
        0.02908405, 0.05806271, 0.06475592, 0.04742972, 0.01979065,
        0.01940823, 0.01972707, 0.02311095, 0.022753  , 0.02304657,
        0.03239171, 0.03270674, 0.03212253, 0.03470929, 0.02644038,
        0.02243884, 0.04171546, 0.03639539, 0.03174829, 0.06781006,
        0.06177402, 0.05251861]

In [10]:
grid_search.cv_results_

{'mean_fit_time': array([0.02213097, 0.02012539, 0.01995786, 0.02182643, 0.02129404,
        0.02104203, 0.02608569, 0.02680564, 0.02644976, 0.02633532,
        0.02449195, 0.02380371, 0.03284423, 0.02901332, 0.02707124,
        0.04441253, 0.04130101, 0.03762666, 0.02146371, 0.02194699,
        0.01996875, 0.02333681, 0.02206421, 0.02241365, 0.02925666,
        0.02901014, 0.02981242, 0.03129538, 0.02481103, 0.02272852,
        0.0363876 , 0.0318466 , 0.02889625, 0.05388204, 0.04913823,
        0.04254039, 0.01939599, 0.01916567, 0.0191075 , 0.0217607 ,
        0.02163339, 0.02203409, 0.02977689, 0.03015606, 0.03008763,
        0.02986264, 0.02491315, 0.02156274, 0.03587437, 0.033132  ,
        0.02908405, 0.05806271, 0.06475592, 0.04742972, 0.01979065,
        0.01940823, 0.01972707, 0.02311095, 0.022753  , 0.02304657,
        0.03239171, 0.03270674, 0.03212253, 0.03470929, 0.02644038,
        0.02243884, 0.04171546, 0.03639539, 0.03174829, 0.06781006,
        0.06177402, 0.05251861]

### Fim

### Obrigado - Data Science Academy - <a href="http://facebook.com/dsacademybr">facebook.com/dsacademybr</a>