## Overview
This notebook builds a simple example of Bayesian optimization, using the same dataset as GridSearchKnn. The goal is to find the value of k (the number of neighbors in a k-nearest-neighbors classifier) that gives the best score on the test set.

## Imports

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# Apply seaborn's default theme to the figures drawn in this notebook.
sns.set()

# Seed NumPy's global random number generator to help make this exercise and
# its solutions reproducible (NB: done here for teaching purposes).
# Note that the train/test split and the optimizer below pass their own
# explicit random_state values.
random_seed_number = 42
np.random.seed(random_seed_number)

## Read data

In [2]:
# Load the diabetes dataset; the path is relative to the notebook's working directory.
diabetes_data = pd.read_csv('data/diabetes.csv')
# Preview the first rows to check the columns that were loaded.
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Feature Engineering

In [3]:
# Measurement columns in which a recorded 0 is treated as a missing value.
str_cols = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]
# Swap those zeros for NaN so the missing entries can be imputed afterwards.
diabetes_data[str_cols] = diabetes_data[str_cols].replace(to_replace=0.0, value=np.nan)

In [4]:
# Impute the NaN placeholders: the mean for Glucose and BloodPressure, the
# median for SkinThickness, Insulin and BMI.
# Each result is assigned back to its column instead of calling
# `fillna(..., inplace=True)` on `diabetes_data[col]`: that chained in-place
# form acts on an intermediate Series, raises a FutureWarning on pandas 2.x and
# leaves `diabetes_data` unchanged under Copy-on-Write (the pandas 3.0 default).
# NOTE(review): the statistics are computed on the full dataset, i.e. before
# the train/test split further down — confirm that this is acceptable here.
diabetes_data['Glucose'] = diabetes_data['Glucose'].fillna(diabetes_data['Glucose'].mean())
diabetes_data['BloodPressure'] = diabetes_data['BloodPressure'].fillna(diabetes_data['BloodPressure'].mean())
diabetes_data['SkinThickness'] = diabetes_data['SkinThickness'].fillna(diabetes_data['SkinThickness'].median())
diabetes_data['Insulin'] = diabetes_data['Insulin'].fillna(diabetes_data['Insulin'].median())
diabetes_data['BMI'] = diabetes_data['BMI'].fillna(diabetes_data['BMI'].median())

## Split train and test

In [5]:
# Features: every column of the frame except the label column.
X = diabetes_data.drop(columns='Outcome')
# Label: the 'Outcome' column.
y = diabetes_data['Outcome']

In [6]:
# Hold out 30% of the rows as the test set; random_state=0 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=0)

## Scale data

In [7]:
# Standardize the features. The scaler's statistics are learned from the
# training split only and then reused, unchanged, on the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Bayesian Optimization

In [17]:
from bayes_opt import BayesianOptimization

def objective(nn):
    """Score a k-nearest-neighbours classifier for one candidate ``nn``.

    Parameters
    ----------
    nn : float
        Candidate neighbour count proposed by the optimizer. It is truncated
        with ``int()`` before use, so e.g. 13.96 is evaluated as k = 13.

    Returns
    -------
    float
        The result of ``score(X_test, y_test)`` for a classifier fitted on the
        module-level ``X_train`` / ``y_train``.

    NOTE(review): the score is computed on the test split, so the k chosen by
    the optimizer is tuned against that split — confirm this is intended.
    """
    k = int(nn)
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    return knn.score(X_test, y_test)

In [18]:
# (lower, upper) bounds the optimizer may explore for each objective parameter.
search_space = dict(nn=(1, 50))

In [19]:
# Optimizer that maximizes `objective` over the bounds in `search_space`;
# random_state is fixed so the sequence of probed points is repeatable.
optimizer = BayesianOptimization(
    f=objective,
    pbounds=search_space,
    random_state=123,
)

In [20]:
# Run the optimization with the library's default settings (no arguments are
# passed); every probed `nn` and its score is logged in the table below.
optimizer.maximize()

|   iter    |  target   |    nn     |
-------------------------------------
| [0m 1       [0m | [0m 0.7576  [0m | [0m 35.13   [0m |
| [95m 2       [0m | [95m 0.7922  [0m | [95m 15.02   [0m |
| [0m 3       [0m | [0m 0.7749  [0m | [0m 12.12   [0m |
| [0m 4       [0m | [0m 0.7706  [0m | [0m 28.01   [0m |
| [0m 5       [0m | [0m 0.7532  [0m | [0m 36.25   [0m |
| [0m 6       [0m | [0m 0.7619  [0m | [0m 18.15   [0m |
| [95m 7       [0m | [95m 0.7965  [0m | [95m 13.96   [0m |
| [0m 8       [0m | [0m 0.7532  [0m | [0m 49.99   [0m |
| [0m 9       [0m | [0m 0.71    [0m | [0m 1.005   [0m |
| [0m 10      [0m | [0m 0.7662  [0m | [0m 43.48   [0m |
| [0m 11      [0m | [0m 0.7619  [0m | [0m 23.75   [0m |
| [0m 12      [0m | [0m 0.7835  [0m | [0m 31.19   [0m |
| [0m 13      [0m | [0m 0.7835  [0m | [0m 7.393   [0m |
| [0m 14      [0m | [0m 0.7619  [0m | [0m 40.34   [0m |
| [0m 15      [0m | [0m 0.7922  [0m | [0m 9.1

In [21]:
# Best observation found: its score ('target') and the parameters that produced it.
# `nn` is reported as a float; `objective` truncates it with int(), so the k actually used is int(nn).
optimizer.max

{'target': 0.7965367965367965, 'params': {'nn': 13.957486648689146}}