# Otimização dos melhores parâmetros usando cross-validation

In [60]:
# bibliotecas
from sklearn.metrics import log_loss, accuracy_score
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from scipy.stats import randint
from sklearn.model_selection import StratifiedKFold, ParameterGrid, ParameterSampler, GridSearchCV, RandomizedSearchCV, train_test_split

In [61]:
# Load the diabetes dataset from the CSV hosted on GitHub into a DataFrame.
df_diabetes = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/abnr/ml-data/main/diabetes.csv'
)

In [62]:
# Display the loaded frame; the recorded output is truncated to the first and last rows.
df_diabetes

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,tested_positive
1,1,85,66,29,0,26.6,0.351,31,tested_negative
2,8,183,64,0,0,23.3,0.672,32,tested_positive
3,1,89,66,23,94,28.1,0.167,21,tested_negative
4,0,137,40,35,168,43.1,2.288,33,tested_positive
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,tested_negative
764,2,122,70,27,0,36.8,0.340,27,tested_negative
765,5,121,72,23,112,26.2,0.245,30,tested_negative
766,1,126,60,0,0,30.1,0.349,47,tested_positive


In [63]:
# Encode the target column as integers: tested_positive -> 1, tested_negative -> 0.
# Series.map turns every value that is missing from the mapping into NaN, so
# re-running this cell on the already-encoded 0/1 column would wipe the target.
# Mapping only while the column is still non-numeric makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df_diabetes['class']):
    df_diabetes['class'] = df_diabetes['class'].map({'tested_positive': 1, 'tested_negative': 0})
df_diabetes.head(5)

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [64]:
# Hold out 15% of the rows as a test set, stratified on the target column;
# the fixed seed makes the split reproducible.
df_train, df_test = train_test_split(
    df_diabetes,
    test_size=0.15,
    stratify=df_diabetes['class'],
    random_state=123,
)

# Separate the features (every column except 'class') from the target.
X_train_cv = df_train.drop(columns='class')
y_train_cv = df_train['class']
X_test_cv = df_test.drop(columns='class')
y_test_cv = df_test['class']

In [65]:
# Two-step pipeline: min-max scale the features, then fit a k-nearest-neighbours
# classifier. The step names are the prefixes used by the search spaces below
# (e.g. 'model__n_neighbors').
pipe_knn = Pipeline(
    steps=[
        ('pre_processing', MinMaxScaler()),
        ('model', KNeighborsClassifier()),
    ]
)
pipe_knn


Pipeline(steps=[('pre_processing', MinMaxScaler()),
                ('model', KNeighborsClassifier())])

In [66]:
# Cross-validation scheme used by both searches: 10 stratified folds,
# shuffled with a fixed seed so the fold assignment is reproducible.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=123,
)

## Grid Search

In [67]:
# Search space for the grid search; keys address the 'model' step of pipe_knn.
# NOTE(review): with KNeighborsClassifier's default p=2, 'minkowski' is the same
# distance as 'euclidean', so a third of the candidates are duplicates -- confirm
# whether that is intended.
# NOTE(review): the recorded best n_neighbors is 30, the upper edge of this range;
# consider widening it (the random search below samples up to 50).
param_grid = {
    'model__metric': ['minkowski', 'euclidean', 'manhattan'],
    'model__n_neighbors': [*range(1, 31)],
    'model__weights': ['uniform', 'distance'],
}

In [68]:
# Exhaustive search over param_grid, evaluated on the skf folds and scored by
# negated log loss (closer to 0 is better).
grid_search = GridSearchCV(
    estimator=pipe_knn,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=skf,
)

In [69]:
# Run the grid search on the training partition only; the test set stays untouched.
grid_search.fit(X=X_train_cv, y=y_train_cv)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=123, shuffle=True),
             estimator=Pipeline(steps=[('pre_processing', MinMaxScaler()),
                                       ('model', KNeighborsClassifier())]),
             param_grid={'model__metric': ['minkowski', 'euclidean',
                                           'manhattan'],
                         'model__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                                11, 12, 13, 14, 15, 16, 17, 18,
                                                19, 20, 21, 22, 23, 24, 25, 26,
                                                27, 28, 29, 30],
                         'model__weights': ['uniform', 'distance']},
             scoring='neg_log_loss')

In [70]:
# Mean cross-validated score of the best candidate. The scoring is 'neg_log_loss',
# so this is the negated log loss (values closer to 0 are better).
grid_search.best_score_

-0.6016263310913537

In [71]:
# Best hyper-parameter combination found by the grid search.
grid_search.best_params_

{'model__metric': 'minkowski',
 'model__n_neighbors': 30,
 'model__weights': 'distance'}

In [72]:
# Keep the pipeline configured with the best grid-search parameters
# (refit by the search on the data passed to fit, since refit defaults to True).
pipe_knn_grid_search = grid_search.best_estimator_
pipe_knn_grid_search 

Pipeline(steps=[('pre_processing', MinMaxScaler()),
                ('model',
                 KNeighborsClassifier(n_neighbors=30, weights='distance'))])

In [73]:
# Log loss of the grid-search pipeline on the training data.
# NOTE(review): the selected model uses weights='distance'; each training row is
# presumably its own zero-distance neighbour, which would explain the ~1e-15 value
# recorded below -- do not read it as a measure of fit quality.
log_loss(
    y_train_cv,
    pipe_knn_grid_search.predict_proba(X_train_cv),
)

9.992007221626413e-16

In [74]:
# Log loss of the grid-search pipeline on the held-out test set.
log_loss(
    y_test_cv,
    pipe_knn_grid_search.predict_proba(X_test_cv),
)

0.47427429851146835

In [75]:
# Random search method

In [76]:
# Search space for the random search; keys address the 'model' step of pipe_knn.
# randint(1, 51) draws integers uniformly from 1 to 50 (the upper bound is exclusive).
param_random = {
    'model__metric': ['minkowski', 'euclidean', 'manhattan'],
    'model__n_neighbors': randint(1, 51),
    'model__weights': ['uniform', 'distance'],
}
param_random

{'model__metric': ['minkowski', 'euclidean', 'manhattan'],
 'model__n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen at 0x1dc184436c8>,
 'model__weights': ['uniform', 'distance']}

In [77]:
# Randomised search: 50 candidates sampled from param_random (seeded), evaluated on
# the skf folds and scored by negated log loss; training-fold scores are kept too.
random_search = RandomizedSearchCV(
    estimator=pipe_knn,
    param_distributions=param_random,
    n_iter=50,
    scoring='neg_log_loss',
    cv=skf,
    return_train_score=True,
    random_state=123,
)

In [78]:
# Run the random search on the training partition only; the test set stays untouched.
random_search.fit(X=X_train_cv, y=y_train_cv)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=123, shuffle=True),
                   estimator=Pipeline(steps=[('pre_processing', MinMaxScaler()),
                                             ('model',
                                              KNeighborsClassifier())]),
                   n_iter=50,
                   param_distributions={'model__metric': ['minkowski',
                                                          'euclidean',
                                                          'manhattan'],
                                        'model__n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001DC184436C8>,
                                        'model__weights': ['uniform',
                                                           'distance']},
                   random_state=123, return_train_score=True,
                   scoring='neg_log_loss')

In [79]:
# Mean cross-validated score of the best sampled candidate. The scoring is
# 'neg_log_loss', so this is the negated log loss (values closer to 0 are better).
random_search.best_score_

-0.5101715402858514

In [80]:
# Best hyper-parameter combination found by the random search.
random_search.best_params_

{'model__metric': 'manhattan',
 'model__n_neighbors': 46,
 'model__weights': 'uniform'}

In [81]:
# Keep the pipeline configured with the best random-search parameters
# (refit by the search on the data passed to fit, since refit defaults to True).
pipe_knn_random_search = random_search.best_estimator_
pipe_knn_random_search 

Pipeline(steps=[('pre_processing', MinMaxScaler()),
                ('model',
                 KNeighborsClassifier(metric='manhattan', n_neighbors=46))])

In [82]:
# Log loss of the random-search pipeline on the training data.
log_loss(
    y_train_cv,
    pipe_knn_random_search.predict_proba(X_train_cv),
)

0.49196942503329305

In [83]:
# Log loss of the random-search pipeline on the held-out test set.
log_loss(
    y_test_cv,
    pipe_knn_random_search.predict_proba(X_test_cv),
)

0.46797691683589904