# Bayesian Optimization

f(x)는 accuracy or r^2 등

![1_1.png](./materials/1_3.png)

![1_1.png](./materials/1_4.png)

하이퍼파라미터와 그에 따라 fitting된 f(x)가 있으면, 그 f(x)의 형태를 가지고 확률적 추정을 하는 것.<br>
결론적으로는 가우시안 프로세스 방식으로 Surrogate를 만드는 것. 

Acuisition Function이 높은 곳을 바탕으로 next 하이퍼파라미터를 선정. 

![1_1.png](./materials/1_5.png)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

## 1) Data

In [5]:
# 데이터 불러오기
df_raw = pd.read_csv("./data/HMEQ.csv")
df_raw

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1700,30548,40320.0,HomeImp,Other,9.0,0,0.0,101.466002,1.0,8,37.113614
1,1,1800,28502,43034.0,HomeImp,Other,11.0,0,0.0,88.766030,0.0,8,36.884894
2,0,2300,102370,120953.0,HomeImp,Office,2.0,0,0.0,90.992533,0.0,13,31.588503
3,1,2400,34863,47471.0,HomeImp,Mgr,12.0,0,0.0,70.491080,1.0,21,38.263601
4,0,2400,98449,117195.0,HomeImp,Office,4.0,0,0.0,93.811775,0.0,13,29.681827
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3743,0,88900,57264,90185.0,DebtCon,Other,16.0,0,0.0,221.808717,0.0,16,36.112347
3744,0,89000,54576,92937.0,DebtCon,Other,16.0,0,0.0,208.692070,0.0,15,35.859971
3745,0,89200,54045,92924.0,DebtCon,Other,15.0,0,0.0,212.279697,0.0,15,35.556590
3746,0,89800,50370,91861.0,DebtCon,Other,14.0,0,0.0,213.892709,0.0,16,34.340882


In [6]:
df_raw.isnull().sum()

BAD          0
LOAN         0
MORTDUE      0
VALUE       14
REASON       0
JOB         81
YOJ        205
DEROG        0
DELINQ      30
CLAGE       37
NINQ        56
CLNO         0
DEBTINC      0
dtype: int64

In [7]:
df_raw["JOB"].fillna("Other", inplace = True)
df_raw.fillna(df_raw.mean(), inplace=True)

df_raw_dummy = pd.get_dummies(df_raw)

df_raw_dummy 

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC,REASON_DebtCon,REASON_HomeImp,JOB_Mgr,JOB_Office,JOB_Other,JOB_ProfExe,JOB_Sales,JOB_Self
0,1,1700,30548,40320.0,9.0,0,0.0,101.466002,1.0,8,37.113614,0,1,0,0,1,0,0,0
1,1,1800,28502,43034.0,11.0,0,0.0,88.766030,0.0,8,36.884894,0,1,0,0,1,0,0,0
2,0,2300,102370,120953.0,2.0,0,0.0,90.992533,0.0,13,31.588503,0,1,0,1,0,0,0,0
3,1,2400,34863,47471.0,12.0,0,0.0,70.491080,1.0,21,38.263601,0,1,1,0,0,0,0,0
4,0,2400,98449,117195.0,4.0,0,0.0,93.811775,0.0,13,29.681827,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3743,0,88900,57264,90185.0,16.0,0,0.0,221.808717,0.0,16,36.112347,1,0,0,0,1,0,0,0
3744,0,89000,54576,92937.0,16.0,0,0.0,208.692070,0.0,15,35.859971,1,0,0,0,1,0,0,0
3745,0,89200,54045,92924.0,15.0,0,0.0,212.279697,0.0,15,35.556590,1,0,0,0,1,0,0,0
3746,0,89800,50370,91861.0,14.0,0,0.0,213.892709,0.0,16,34.340882,1,0,0,0,1,0,0,0


In [8]:
df_raw.isnull().sum()

BAD        0
LOAN       0
MORTDUE    0
VALUE      0
REASON     0
JOB        0
YOJ        0
DEROG      0
DELINQ     0
CLAGE      0
NINQ       0
CLNO       0
DEBTINC    0
dtype: int64

In [9]:
df_raw_x = df_raw_dummy.drop("BAD", axis = 1, inplace = False)
df_raw_y = df_raw_dummy["BAD"] 

df_train_x, df_test_x, df_train_y, df_test_y = train_test_split(df_raw_x, df_raw_y, test_size = 0.3, random_state = 1234)

print('df_train_x',df_train_x.shape)
print('df_train_y',df_train_y.shape)
print('df_test_x',df_test_x.shape)
print('df_test_y',df_test_y.shape)

df_train_x (2623, 18)
df_train_y (2623,)
df_test_x (1125, 18)
df_test_y (1125,)


In [10]:
# 변수명 저장
v_feature_names = df_train_x.columns
# StandardScaler 적용
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_raw_x)
df_scaled = pd.DataFrame(df_scaled, columns=v_feature_names)
df_scaled.head()

Unnamed: 0,LOAN,MORTDUE,VALUE,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC,REASON_DebtCon,REASON_HomeImp,JOB_Mgr,JOB_Office,JOB_Other,JOB_ProfExe,JOB_Sales,JOB_Self
0,-1.559587,-0.993626,-1.22491,-0.017563,-0.267462,-0.342558,-0.967342,-0.018776,-1.42835,0.356347,-1.548549,1.548549,-0.381873,-0.446497,1.207355,-0.574064,-0.131804,-0.189556
1,-1.550817,-1.03924,-1.174897,0.251809,-0.267462,-0.342558,-1.122053,-0.666629,-1.42835,0.329025,-1.548549,1.548549,-0.381873,-0.446497,1.207355,-0.574064,-0.131804,-0.189556
2,-1.506968,0.60757,0.260973,-0.960364,-0.267462,-0.342558,-1.09493,-0.666629,-0.905133,-0.303684,-1.548549,1.548549,-0.381873,2.239654,-0.828257,-0.574064,-0.131804,-0.189556
3,-1.498198,-0.897428,-1.093133,0.386495,-0.267462,-0.342558,-1.344679,-0.018776,-0.067985,0.493725,-1.548549,1.548549,2.618672,-0.446497,-0.828257,-0.574064,-0.131804,-0.189556
4,-1.498198,0.520156,0.191721,-0.690992,-0.267462,-0.342558,-1.060586,-0.666629,-0.905133,-0.531456,-1.548549,1.548549,-0.381873,2.239654,-0.828257,-0.574064,-0.131804,-0.189556


In [11]:
# 데이터 분할
df_scaled_train_x, df_scaled_test_x = train_test_split(df_scaled, test_size = 0.3,random_state = 1234)
print("train data X size : {}".format(df_scaled_train_x.shape))
print("test data Y size : {}".format(df_scaled_test_x.shape))

train data X size : (2623, 18)
test data Y size : (1125, 18)


## 2) Bayesian Optimization

[Package Reference](https://github.com/fmfn/BayesianOptimization)

※Library 설치 명령어  
pip install bayesian-optimization  
conda install -c conda-forge bayesian-optimization

In [13]:
def train_acc(C_value, gamma_value):
    
    svc_final = SVC(kernel= 'rbf' ,gamma = C_value , C = gamma_value , random_state=1234)
    svc_final.fit(df_scaled_train_x, df_train_y)
    
   # Validation
    y_pred = svc_final.predict(df_scaled_test_x)
    acc= svc_final.score(df_scaled_test_x, df_test_y)
    
    return acc

In [14]:
from bayes_opt import BayesianOptimization

pbounds = {'C_value': (0.01, 10) , 'gamma_value': (0.01, 10) } # 범위 설정

optimizer = BayesianOptimization(
    f=train_acc ,
    pbounds=pbounds ,
    random_state=1)

optimizer.maximize(init_points=5 , n_iter=25 )  #n_iter 초기 point를 제외하고 시행할 횟수 purples are maximize value.-> we can use final value.

|   iter    |  target   |  C_value  | gamma_... |
-------------------------------------------------
| [0m 1       [0m | [0m 0.9164  [0m | [0m 4.176   [0m | [0m 7.206   [0m |
| [95m 2       [0m | [95m 0.9271  [0m | [95m 0.01114 [0m | [95m 3.03    [0m |
| [0m 3       [0m | [0m 0.9164  [0m | [0m 1.476   [0m | [0m 0.9325  [0m |
| [0m 4       [0m | [0m 0.9191  [0m | [0m 1.871   [0m | [0m 3.462   [0m |
| [0m 5       [0m | [0m 0.9164  [0m | [0m 3.974   [0m | [0m 5.393   [0m |
| [0m 6       [0m | [0m 0.9271  [0m | [0m 0.01    [0m | [0m 3.658   [0m |
| [0m 7       [0m | [0m 0.9076  [0m | [0m 10.0    [0m | [0m 0.01    [0m |
| [95m 8       [0m | [95m 0.9324  [0m | [95m 0.01    [0m | [95m 10.0    [0m |
| [0m 9       [0m | [0m 0.9316  [0m | [0m 0.01    [0m | [0m 7.907   [0m |
| [0m 10      [0m | [0m 0.9129  [0m | [0m 10.0    [0m | [0m 10.0    [0m |
| [0m 11      [0m | [0m 0.9236  [0m | [0m 1.337   [0m | [0m 10.0 

이전보다 더 큰 경우 보라색으려 표현된다. 우리는 맨 마지막 보라색을 쓰는 조합을 보면 된다. 

In [16]:
print(optimizer.max)
for i, res in enumerate(optimizer.res):
    print("Iteration {}: \n\t{}".format(i, res))

{'target': 0.9591111111111111, 'params': {'C_value': 0.1318346971689784, 'gamma_value': 9.308701736904252}}
Iteration 0: 
	{'target': 0.9164444444444444, 'params': {'C_value': 4.176049826978714, 'gamma_value': 7.20604168948716}}
Iteration 1: 
	{'target': 0.9271111111111111, 'params': {'C_value': 0.011142604425275418, 'gamma_value': 3.0303024005920793}}
Iteration 2: 
	{'target': 0.9164444444444444, 'params': {'C_value': 1.4760913492629593, 'gamma_value': 0.9324625617402901}}
Iteration 3: 
	{'target': 0.9191111111111111, 'params': {'C_value': 1.8707395116629324, 'gamma_value': 3.462151663160047}}
Iteration 4: 
	{'target': 0.9164444444444444, 'params': {'C_value': 3.9737070675643924, 'gamma_value': 5.392779172693536}}
Iteration 5: 
	{'target': 0.9271111111111111, 'params': {'C_value': 0.01, 'gamma_value': 3.6578842338729056}}
Iteration 6: 
	{'target': 0.9075555555555556, 'params': {'C_value': 10.0, 'gamma_value': 0.01}}
Iteration 7: 
	{'target': 0.9324444444444444, 'params': {'C_value': 0

## 3) 최종 모델 선택

In [17]:
# 최종 모델
svc_final = SVC(kernel= 'rbf',  gamma = 0.1501,  C = 8.906, random_state=1234)
svc_final.fit(df_scaled_train_x, df_train_y)

# Validation
y_pred = svc_final.predict(df_scaled_test_x)

# train 데이터 셋 정확도
print("Train Accuracy: {:.3f}".format(svc_final.score(df_scaled_train_x, df_train_y)))
# test 데이터 셋 정확도
print("Test Accuracy: {:.3f}\n".format(svc_final.score(df_scaled_test_x, df_test_y)))
# confusion matrix
print("Confusion matrix: \n{}".format(confusion_matrix(df_test_y, y_pred)))

Train Accuracy: 0.993
Test Accuracy: 0.959

Confusion matrix: 
[[1020    1]
 [  45   59]]


https://github.com/fmfn/BayesianOptimization

In [41]:
B = np.array([[2, 3],
             [2, 1]])
B

array([[2, 3],
       [2, 1]])

In [42]:
w, v = np.linalg.eig(B)

In [43]:
w, v

(array([ 4., -1.]),
 array([[ 0.83205029, -0.70710678],
        [ 0.5547002 ,  0.70710678]]))

In [44]:
B = 3*np.array([[2, 3],
             [2, 1]])
B

array([[6, 9],
       [6, 3]])

In [45]:
w, v = np.linalg.eig(B)
w, v

(array([12., -3.]),
 array([[ 0.83205029, -0.70710678],
        [ 0.5547002 ,  0.70710678]]))