# Bayesian Optimization

![1_1.png](./materials/1_3.png)

![1_1.png](./materials/1_4.png)

![1_1.png](./materials/1_5.png)

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

## 1) Data

* HMEQ Data : 대출 심사 모델을 위한 데이터셋
* Samples : 3,748 개
* Columns : 13 개  
 -BAD : 신용상태 Good(0), BAD(1)  
 -LOAN: 대출금액  
 -MORTDUE : 저당금액  
 -VALUE : 현재의 자산가치
 -REASON : 대출사유  
 -JOB : 대출자 직업   
 -YOJ : 직장 근속연수  
 -DEROG : 주요 부실거래 건수  
 -DELINQ : 신용불량자 횟 수  
 -CLAGE : 가장 오래된 거래선 사용기간  
 -NINQ : 최근 신용상태 조회수  
 -CLNO : 금융거래 수  
 -DEBTINC : 수입대비 부채비율  

In [4]:
# 데이터 불러오기
df_raw = pd.read_csv("./HMEQ.csv")
df_raw

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1700,30548,40320.0,HomeImp,Other,9.0,0,0.0,101.466002,1.0,8,37.113614
1,1,1800,28502,43034.0,HomeImp,Other,11.0,0,0.0,88.766030,0.0,8,36.884894
2,0,2300,102370,120953.0,HomeImp,Office,2.0,0,0.0,90.992533,0.0,13,31.588503
3,1,2400,34863,47471.0,HomeImp,Mgr,12.0,0,0.0,70.491080,1.0,21,38.263601
4,0,2400,98449,117195.0,HomeImp,Office,4.0,0,0.0,93.811775,0.0,13,29.681827
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3743,0,88900,57264,90185.0,DebtCon,Other,16.0,0,0.0,221.808718,0.0,16,36.112347
3744,0,89000,54576,92937.0,DebtCon,Other,16.0,0,0.0,208.692070,0.0,15,35.859971
3745,0,89200,54045,92924.0,DebtCon,Other,15.0,0,0.0,212.279697,0.0,15,35.556590
3746,0,89800,50370,91861.0,DebtCon,Other,14.0,0,0.0,213.892709,0.0,16,34.340882


In [5]:
df_raw.isnull().sum()

BAD          0
LOAN         0
MORTDUE      0
VALUE       14
REASON       0
JOB         81
YOJ        205
DEROG        0
DELINQ      30
CLAGE       37
NINQ        56
CLNO         0
DEBTINC      0
dtype: int64

In [6]:
df_raw["JOB"].fillna("Other", inplace = True)
df_raw.fillna(df_raw.mean(), inplace=True)

df_raw_dummy = pd.get_dummies(df_raw)

df_raw_dummy 

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC,REASON_DebtCon,REASON_HomeImp,JOB_Mgr,JOB_Office,JOB_Other,JOB_ProfExe,JOB_Sales,JOB_Self
0,1,1700,30548,40320.0,9.0,0,0.0,101.466002,1.0,8,37.113614,0,1,0,0,1,0,0,0
1,1,1800,28502,43034.0,11.0,0,0.0,88.766030,0.0,8,36.884894,0,1,0,0,1,0,0,0
2,0,2300,102370,120953.0,2.0,0,0.0,90.992533,0.0,13,31.588503,0,1,0,1,0,0,0,0
3,1,2400,34863,47471.0,12.0,0,0.0,70.491080,1.0,21,38.263601,0,1,1,0,0,0,0,0
4,0,2400,98449,117195.0,4.0,0,0.0,93.811775,0.0,13,29.681827,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3743,0,88900,57264,90185.0,16.0,0,0.0,221.808718,0.0,16,36.112347,1,0,0,0,1,0,0,0
3744,0,89000,54576,92937.0,16.0,0,0.0,208.692070,0.0,15,35.859971,1,0,0,0,1,0,0,0
3745,0,89200,54045,92924.0,15.0,0,0.0,212.279697,0.0,15,35.556590,1,0,0,0,1,0,0,0
3746,0,89800,50370,91861.0,14.0,0,0.0,213.892709,0.0,16,34.340882,1,0,0,0,1,0,0,0


In [12]:
df_raw.isnull().sum()


BAD        0
LOAN       0
MORTDUE    0
VALUE      0
REASON     0
JOB        0
YOJ        0
DEROG      0
DELINQ     0
CLAGE      0
NINQ       0
CLNO       0
DEBTINC    0
dtype: int64

In [13]:
df_raw_x = df_raw_dummy.drop("BAD", axis = 1, inplace = False)
df_raw_y = df_raw_dummy["BAD"] 

df_train_x, df_test_x, df_train_y, df_test_y = train_test_split(df_raw_x, df_raw_y, test_size = 0.3, random_state = 1234)

print('df_train_x',df_train_x.shape)
print('df_train_y',df_train_y.shape)
print('df_test_x',df_test_x.shape)
print('df_test_y',df_test_y.shape)

df_train_x (2623, 18)
df_train_y (2623,)
df_test_x (1125, 18)
df_test_y (1125,)


In [14]:
# 변수명 저장
v_feature_names = df_train_x.columns
# StandardScaler 적용
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_raw_x)
df_scaled = pd.DataFrame(df_scaled, columns=v_feature_names)
df_scaled.head()

Unnamed: 0,LOAN,MORTDUE,VALUE,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC,REASON_DebtCon,REASON_HomeImp,JOB_Mgr,JOB_Office,JOB_Other,JOB_ProfExe,JOB_Sales,JOB_Self
0,-1.559587,-0.993626,-1.22491,-0.017563,-0.267462,-0.342558,-0.967342,-0.018776,-1.42835,0.356347,-1.548549,1.548549,-0.381873,-0.446497,1.207355,-0.574064,-0.131804,-0.189556
1,-1.550817,-1.03924,-1.174897,0.251809,-0.267462,-0.342558,-1.122053,-0.666629,-1.42835,0.329025,-1.548549,1.548549,-0.381873,-0.446497,1.207355,-0.574064,-0.131804,-0.189556
2,-1.506968,0.60757,0.260973,-0.960364,-0.267462,-0.342558,-1.09493,-0.666629,-0.905133,-0.303684,-1.548549,1.548549,-0.381873,2.239654,-0.828257,-0.574064,-0.131804,-0.189556
3,-1.498198,-0.897428,-1.093133,0.386495,-0.267462,-0.342558,-1.344679,-0.018776,-0.067985,0.493725,-1.548549,1.548549,2.618672,-0.446497,-0.828257,-0.574064,-0.131804,-0.189556
4,-1.498198,0.520156,0.191721,-0.690992,-0.267462,-0.342558,-1.060586,-0.666629,-0.905133,-0.531456,-1.548549,1.548549,-0.381873,2.239654,-0.828257,-0.574064,-0.131804,-0.189556


In [16]:
# 데이터 분할
df_scaled_train_x, df_scaled_test_x = train_test_split(df_scaled, test_size = 0.3,random_state = 1234)
print("train data X size : {}".format(df_scaled_train_x.shape))
print("test data Y size : {}".format(df_scaled_test_x.shape))

train data X size : (2623, 18)
test data Y size : (1125, 18)


## 2) Bayesian Optimization

※Library 설치 명령어  
pip install bayesian-optimization  
conda install -c conda-forge bayesian-optimization

In [9]:
def train_acc(C_value, gamma_value):
    
    svc_final = SVC(kernel= 'rbf' ,gamma = gamma_value , C = C_value , random_state=1234)
    svc_final.fit(df_scaled_train_x, df_train_y)
    
   # Validation
    y_pred = svc_final.predict(df_scaled_test_x)
    acc= svc_final.score(df_scaled_test_x, df_test_y)
    
    return acc

In [39]:
from bayes_opt import BayesianOptimization

pbounds = {'C_value': (0.01, 10) , 'gamma_value': (0.01, 10) } # 범위 설정

optimizer = BayesianOptimization(
    f=train_acc ,
    pbounds=pbounds ,
    random_state=1)

optimizer.maximize(init_points=5 , n_iter= 20 )  #n_iter 초기 point를 제외하고 시행할 횟수 purples are maximize value.-> we can use final value.



|   iter    |  target   |  C_value  | gamma_... |
-------------------------------------------------
| [0m 1       [0m | [0m 0.9147  [0m | [0m 4.176   [0m | [0m 7.206   [0m |
| [0m 2       [0m | [0m 0.9076  [0m | [0m 0.01114 [0m | [0m 3.03    [0m |
| [95m 3       [0m | [95m 0.9227  [0m | [95m 1.476   [0m | [95m 0.9325  [0m |
| [0m 4       [0m | [0m 0.9164  [0m | [0m 1.871   [0m | [0m 3.462   [0m |
| [0m 5       [0m | [0m 0.9147  [0m | [0m 3.974   [0m | [0m 5.393   [0m |
| [0m 6       [0m | [0m 0.9147  [0m | [0m 1.097   [0m | [0m 3.695   [0m |
| [95m 7       [0m | [95m 0.9307  [0m | [95m 2.175   [0m | [95m 0.5718  [0m |
| [0m 8       [0m | [0m 0.9244  [0m | [0m 3.138   [0m | [0m 0.01    [0m |
| [0m 9       [0m | [0m 0.9227  [0m | [0m 2.866   [0m | [0m 1.28    [0m |
| [95m 10      [0m | [95m 0.9342  [0m | [95m 2.051   [0m | [95m 0.03491 [0m |
| [95m 11      [0m | [95m 0.9422  [0m | [95m 10.0    [0m | [9

In [40]:
print(optimizer.max)
for i, res in enumerate(optimizer.res):
    print("Iteration {}: \n\t{}".format(i, res))

{'target': 0.9591111111111111, 'params': {'C_value': 8.905647043146002, 'gamma_value': 0.15014249686739925}}
Iteration 0: 
	{'target': 0.9146666666666666, 'params': {'C_value': 4.176049826978714, 'gamma_value': 7.20604168948716}}
Iteration 1: 
	{'target': 0.9075555555555556, 'params': {'C_value': 0.011142604425275418, 'gamma_value': 3.0303024005920793}}
Iteration 2: 
	{'target': 0.9226666666666666, 'params': {'C_value': 1.4760913492629593, 'gamma_value': 0.9324625617402901}}
Iteration 3: 
	{'target': 0.9164444444444444, 'params': {'C_value': 1.8707395116629324, 'gamma_value': 3.462151663160047}}
Iteration 4: 
	{'target': 0.9146666666666666, 'params': {'C_value': 3.9737070675643924, 'gamma_value': 5.392779172693536}}
Iteration 5: 
	{'target': 0.9146666666666666, 'params': {'C_value': 1.0966753844799642, 'gamma_value': 3.6951029080346163}}
Iteration 6: 
	{'target': 0.9306666666666666, 'params': {'C_value': 2.1752548332322, 'gamma_value': 0.5717789996674969}}
Iteration 7: 
	{'target': 0.9

## 3) 최종 모델 선택

In [17]:
# 최종 모델
svc_final = SVC(kernel= 'rbf',  gamma = 0.1501 ,  C = 8.906, random_state=1234)
svc_final.fit(df_scaled_train_x, df_train_y)

# Validation
y_pred = svc_final.predict(df_scaled_test_x)

# train 데이터 셋 정확도
print("Train Accuracy: {:.3f}".format(svc_final.score(df_scaled_train_x, df_train_y)))
# test 데이터 셋 정확도
print("Test Accuracy: {:.3f}\n".format(svc_final.score(df_scaled_test_x, df_test_y)))
# confusion matrix
print("Confusion matrix: \n{}".format(confusion_matrix(df_test_y, y_pred)))

Train Accuracy: 0.993
Test Accuracy: 0.959

Confusion matrix: 
[[1020    1]
 [  45   59]]


https://github.com/fmfn/BayesianOptimization