# <font color='blue'>Data Science Academy - Machine Learning</font>

# <font color='blue'>Capítulo 6 - Otimização dos Parâmetros com Randomized Search</font>

****** Este Jupyter Notebook foi atualizado para a versão 3.6.1. da Linguagem Python em 05/07/2017 ******

## Extremely Randomized Forest

In [None]:
# Abrir um prompt ou terminal e executar o comando abaixo (responder yes quando solicitado). Reiniciar o Kernel do Jupyter Notebook
# conda update scikit-learn

In [4]:
import sklearn as sl
sl.__version__

'0.19.1'

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
#from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_score
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [6]:
# Carrega o dataset
data = pd.read_excel('credit.xls', skiprows = 1)

# Variável target
target = 'default payment next month'
y = np.asarray(data[target])

# Variáveis preditoras
features = data.columns.drop(['ID', target])
X = np.asarray(data[features])

# Dataset de treino e de teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 99)

# Classificador
clf = ExtraTreesClassifier(n_estimators = 500, random_state = 99)

# Modelo
clf.fit(X_train, y_train)

# Score
scores = cross_val_score(clf, X_train, y_train, cv = 3, scoring = 'accuracy', n_jobs = -1)

# Imprimindo o resultado
print ("ExtraTreesClassifier -> Acurácia: Média = %0.3f Desvio Padrão = %0.3f" % (np.mean(scores), np.std(scores)))

# Fazendo previsões
y_pred = clf.predict(X_test)

# Confusion Matrix
confusionMatrix = confusion_matrix(y_test, y_pred)
print (confusionMatrix)

# Acurácia
accuracy_score(y_test, y_pred)


ExtraTreesClassifier -> Acurácia: Média = 0.812 Desvio Padrão = 0.002
[[6532  446]
 [1273  749]]


0.80900000000000005

## Otimização dos Parâmetros com Randomized Search

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

O Randomized Search gera amostras dos parâmetros dos algoritmos a partir de uma distribuição randômica uniforme para um número fixo de interações. Um modelo é construído e testado para cada combinação de parâmetros. 

In [7]:
# Import
from sklearn.model_selection import RandomizedSearchCV

In [8]:
# Definição dos parâmetros
param_dist = {"max_depth": [1, 3, 7, 8, 12, None],
              "max_features": [8, 9, 10, 11, 16, 22],
              "min_samples_split": [8, 10, 11, 14, 16, 19],
              "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7],
              "bootstrap": [True, False]}

# Para o classificador criado na célula anterior, testamos diferentes combinações de parâmetros
rsearch = RandomizedSearchCV(clf, param_distributions = param_dist, n_iter = 25)  

# Aplicando o resultado ao conjunto de dados de treino e obtendo o score
rsearch.fit(X_train,y_train)
rsearch.grid_scores_

# Imprimindo o melhor estimador
bestclf = rsearch.best_estimator_
print (bestclf)

# Aplicando o melhor estimador para realizar as previsões
y_pred = bestclf.predict(X_test)

# Confusion Matrix
confusionMatrix = confusion_matrix(y_test, y_pred)
print(confusionMatrix)

# Acurácia
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)



ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
           max_depth=7, max_features=16, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=6, min_samples_split=11,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=99, verbose=0, warm_start=False)
[[6649  329]
 [1285  737]]
0.820666666667


In [None]:
# Obtendo o grid com todas as combinações de parâmetros
rsearch.grid_scores_

## Grid Search x Randomized Search para Estimação dos Hiperparâmetros

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

O Grid Search realiza metodicamente combinações entre todos os parâmetros do algoritmo, criando um grid. 

In [9]:
import numpy as np
from time import time
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits

# Obtém o dataset
digits = load_digits()
X, y = digits.data, digits.target

# Construindo o classificador
clf = RandomForestClassifier(n_estimators = 20)

In [10]:
# Randomized Search

# Valores dos parâmetros que serão testados
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Executando o Randomized Search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions = param_dist, n_iter = n_iter_search)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV executou em %.2f segundos para %d candidatos a parâmetros do modelo." 
      % ((time() - start), n_iter_search))

# Imprime as combinações dos parâmetros e susas respectivas médias de acurácia
random_search.grid_scores_

RandomizedSearchCV executou em 9.77 segundos para 20 candidatos a parâmetros do modelo.




[mean: 0.81747, std: 0.02540, params: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 9, 'min_samples_leaf': 1},
 mean: 0.81247, std: 0.02371, params: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 6, 'min_samples_leaf': 4},
 mean: 0.92432, std: 0.00643, params: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 4, 'min_samples_leaf': 3},
 mean: 0.82193, std: 0.01151, params: {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 1, 'min_samples_leaf': 10},
 mean: 0.91319, std: 0.01129, params: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 6, 'min_samples_leaf': 8},
 mean: 0.80412, std: 0.01100, params: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 4, 'min_samples_leaf': 1},
 mean: 0.90929, std: 0.00319, params: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 6, 'min_samples_leaf': 10},
 mean: 0

In [11]:
# Grid Search

# Usando um grid completo de todos os parâmetros
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Executando o Grid Search
grid_search = GridSearchCV(clf, param_grid = param_grid)
start = time()
grid_search.fit(X, y)

print("GridSearchCV executou em %.2f segundos para todas as combinações de candidatos a parâmetros do modelo."
      % (time() - start))
grid_search.grid_scores_

GridSearchCV executou em 32.73 segundos para todas as combinações de candidatos a parâmetros do modelo.




[mean: 0.72788, std: 0.01464, params: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 1, 'min_samples_leaf': 1},
 mean: 0.78019, std: 0.02424, params: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 1, 'min_samples_leaf': 3},
 mean: 0.77240, std: 0.05540, params: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 1, 'min_samples_leaf': 10},
 mean: 0.80857, std: 0.02948, params: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 3, 'min_samples_leaf': 1},
 mean: 0.79188, std: 0.00721, params: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 3, 'min_samples_leaf': 3},
 mean: 0.82359, std: 0.01804, params: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 3, 'min_samples_leaf': 10},
 mean: 0.81024, std: 0.02556, params: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 10, 'min_samples_leaf': 1},
 mean: 0.81803, std: 0.01929, p

In [12]:
grid_search.cv_results_



{'mean_fit_time': array([ 0.07465299,  0.08824809,  0.11417476,  0.09720635,  0.12545204,
         0.11705534,  0.11880302,  0.12068423,  0.16263596,  0.1139826 ,
         0.12444997,  0.10081935,  0.10046045,  0.13650513,  0.0982097 ,
         0.17495131,  0.13918106,  0.15863967,  0.08727956,  0.0963575 ,
         0.09996088,  0.07891138,  0.10383391,  0.09927766,  0.14338438,
         0.09629067,  0.09931763,  0.13691115,  0.11967333,  0.08280977,
         0.12686849,  0.10199722,  0.1043299 ,  0.23052796,  0.18804534,
         0.15991863,  0.07048504,  0.09354305,  0.10332338,  0.08829665,
         0.10607799,  0.10816042,  0.12727841,  0.10473831,  0.10531044,
         0.13400873,  0.10780366,  0.10511891,  0.11855086,  0.12071474,
         0.10559066,  0.21064782,  0.17499876,  0.18349163,  0.08021768,
         0.09071962,  0.07138419,  0.09268467,  0.07907629,  0.07936637,
         0.10944096,  0.13968658,  0.1253744 ,  0.13751531,  0.11071698,
         0.1106379 ,  0.14653611, 

In [13]:
df_results = pd.DataFrame(grid_search.cv_results_)



In [14]:
df_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_bootstrap,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,params,...,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.074653,0.009253,0.727880,0.817729,True,gini,3,1,1,"{'bootstrap': True, 'criterion': 'gini', 'max_...",...,0.707641,0.811715,0.734558,0.803005,0.741611,0.838468,0.003225,0.000850,0.014650,0.015089
1,0.088248,0.008724,0.780189,0.840010,True,gini,3,1,3,"{'bootstrap': True, 'criterion': 'gini', 'max_...",...,0.747508,0.841841,0.787980,0.834725,0.805369,0.843464,0.009646,0.005790,0.024255,0.003795
2,0.114175,0.012993,0.772398,0.823927,True,gini,3,1,10,"{'bootstrap': True, 'criterion': 'gini', 'max_...",...,0.820598,0.837657,0.801336,0.861436,0.694631,0.772689,0.005350,0.007501,0.055346,0.037509
3,0.097206,0.005228,0.808570,0.868146,True,gini,3,3,1,"{'bootstrap': True, 'criterion': 'gini', 'max_...",...,0.828904,0.880335,0.829716,0.883139,0.766779,0.840966,0.001800,0.000899,0.029442,0.019254
4,0.125452,0.005327,0.791875,0.851963,True,gini,3,3,3,"{'bootstrap': True, 'criterion': 'gini', 'max_...",...,0.785714,0.837657,0.787980,0.865609,0.802013,0.852623,0.038727,0.000638,0.007202,0.011421
5,0.117055,0.016266,0.823595,0.882579,True,gini,3,3,10,"{'bootstrap': True, 'criterion': 'gini', 'max_...",...,0.848837,0.889540,0.808013,0.864775,0.813758,0.893422,0.049102,0.012625,0.018069,0.012689
6,0.118803,0.013989,0.810239,0.879254,True,gini,3,10,1,"{'bootstrap': True, 'criterion': 'gini', 'max_...",...,0.835548,0.880335,0.819699,0.889816,0.775168,0.867610,0.020162,0.006631,0.025541,0.009098
7,0.120684,0.008681,0.818030,0.872559,True,gini,3,10,3,"{'bootstrap': True, 'criterion': 'gini', 'max_...",...,0.823920,0.871967,0.838063,0.865609,0.791946,0.880100,0.025801,0.003396,0.019263,0.005931
8,0.162636,0.006864,0.808013,0.864211,True,gini,3,10,10,"{'bootstrap': True, 'criterion': 'gini', 'max_...",...,0.823920,0.863598,0.791319,0.857262,0.808725,0.871774,0.021501,0.000994,0.013336,0.005940
9,0.113983,0.006228,0.892599,1.000000,True,gini,,1,1,"{'bootstrap': True, 'criterion': 'gini', 'max_...",...,0.888704,1.000000,0.888147,1.000000,0.901007,1.000000,0.016305,0.001056,0.005927,0.000000


### Fim

### Obrigado - Data Science Academy - <a href=http://facebook.com/dsacademy>facebook.com/dsacademybr</a>