# 서포트벡터머신 개념 정리

**서포트벡터머신** : 머신러닝 기법의 하나로 패턴인식,이진분류를 위한 비확률적 지도학습 모델
    (주로 회귀와 분류문제 해결에 사용)
- 장점
    - 회귀와 분류에 모두 적용할 수 있음.
    - 라벨을 직접 추정하기 때문에 조건부 확률로 예측하는 모형에 비해 예측력이 높음.
    - 과적합의 가능성이 낮음.

- 단점
    - 데이터 스케일링에 민감함.
    - 고차원으로 갈수록 여러 연산이 필요해서 학습속도가 느림.
    - 다양한 하이퍼파라미터(커널,C,gamma)값을 여러 번 조합해 최적의 모델을 구해야함.
    - 모형이 복잡해 결과에 대한 설명이 어려움.
    

**중요 하이퍼 파라미터**
* Kernel : 주어진 데이터를 공간 상에서 분류하는 함수를 지정(linear, poly, RBF가 있음. 선형으로 분리되지 않는 데이터는 선형 커널로는 예측력이 낮음)
* C : 오분류에 대한 벌점(규제 강도)을 설정. 결과적으로 서포트벡터와 결정경계 사이의 마진(거리)이 결정됨(C가 클수록 오차를 덜 허용해 마진이 좁아짐, 하드마진에 가까움)
* gamma : 비선형 커널에서 결정경계의 곡률을 설정(높을수록 곡률이 큼,오차를 적게 허용)

C와 gamma를 높게 설정할수록 오류를 덜 허용하여 과적합될 수도 있음

## 전처리

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder # for encoding
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler #for standardization
from sklearn.model_selection import train_test_split
import time

In [None]:
def preprocessing(df_scaling, scaled_form = 'MinMaxScaler()'):
    """Clean a raw feature (or label) frame before scaling and modelling.

    Steps, in order:
      1. Drop the ``ID`` column when it is present.
      2. If only one column is left, return the frame as is.
      3. Drop the ``oral`` column.
      4. Label-encode ``gender`` and ``tartar``.
      5. Map the value 2 to 0 in ``hearing(left)`` / ``hearing(right)``;
         every other value is left untouched.

    Args:
        df_scaling: pandas DataFrame to clean. It is not modified; a new
            frame is returned.
        scaled_form: Unused. Kept so existing callers keep working.

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: If a multi-column frame lacks ``oral``, ``gender``,
            ``tartar`` or one of the ``hearing`` columns.
    """
    # Previously everything below was nested under "if 'ID' in df_scaling",
    # so a frame without an ID column silently produced None.
    # drop() always hands back a new frame, so the caller's data stays intact.
    df_scaling = df_scaling.drop(columns=["ID"], errors="ignore")

    # A single remaining column needs no feature engineering.
    if len(df_scaling.columns) == 1:
        return df_scaling

    df_scaling = df_scaling.drop('oral', axis = 1)

    # Label-encode the categorical features (one encoder per column).
    for column in ("gender", "tartar"):
        df_scaling[column] = LabelEncoder().fit_transform(df_scaling[column])

    # hearing: 2 -> 0, anything else (including NaN) stays as it is.
    for column in ('hearing(left)', 'hearing(right)'):
        hearing = df_scaling[column]
        df_scaling[column] = hearing.mask(hearing == 2, 0)

    # NOTE: two derived features were tried and are currently disabled:
    #   bmi = weight(kg) / (height(cm) * 0.01) ** 2
    #   wwi = waist(cm) / sqrt(weight(kg))
    return df_scaling

In [None]:
def scaling(train_data, test_data, scaled_form = 'MinMaxScaler()'):
    """Preprocess both splits and scale their non-categorical columns.

    Both frames go through ``preprocessing``. The scaler is fitted on the
    training split only and then applied to both splits. The categorical
    columns are left unscaled and appended after the scaled columns.

    Args:
        train_data: Raw training features (pandas DataFrame).
        test_data: Raw test features (pandas DataFrame).
        scaled_form: ``'StandardScaler()'`` or ``'RobustScaler()'``; any
            other value selects ``MinMaxScaler``.

    Returns:
        Tuple ``(train_scaled, test_scaled)`` of DataFrames.
    """
    categorical = ['gender', 'tartar', 'hearing(right)', 'hearing(left)', 'dental caries']
    scaler_types = {
        'StandardScaler()': StandardScaler,
        'RobustScaler()': RobustScaler,
    }
    # Unknown names fall back to MinMaxScaler, as before.
    scaler = scaler_types.get(scaled_form, MinMaxScaler)()

    train_data = preprocessing(train_data)
    test_data = preprocessing(test_data)

    tr_numeric = train_data.drop(categorical, axis=1)
    ts_numeric = test_data.drop(categorical, axis=1)

    # Fit on the training split only, then transform both splits.
    scaler.fit(tr_numeric)

    def _reassemble(source, numeric):
        # Keep the source index: the column assignment below aligns on index,
        # so a fresh RangeIndex would misalign (NaN) any frame whose index is
        # not 0..n-1, e.g. after a shuffle or a filter.
        scaled = pd.DataFrame(
            scaler.transform(numeric),
            columns=numeric.columns,
            index=numeric.index,
        )
        scaled[categorical] = source[categorical]
        return scaled

    return _reassemble(train_data, tr_numeric), _reassemble(test_data, ts_numeric)

In [None]:
# competition_format: load the four pre-split CSV files (features and labels).
x_train, x_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/Smoking_raw/competition_format/x_train.csv',
        './data/Smoking_raw/competition_format/x_test.csv',
        './data/Smoking_raw/competition_format/y_train.csv',
        './data/Smoking_raw/competition_format/y_test.csv',
    )
)

In [None]:
scaled = scaling(x_train, x_test,scaled_form='StandardScaler()')

In [None]:
# Unpack the (train, test) pair produced by scaling() and run the label
# frames through preprocessing() as well.
x_train_s, x_test_s = (pd.DataFrame(part) for part in (scaled[0], scaled[1]))
y_train_s, y_test_s = (preprocessing(labels) for labels in (y_train, y_test))

In [None]:
x_train_s.head()

In [None]:
x_train_s.info()

In [None]:
x_test_s.head()

## 모델링

### Standard Scaler

In [None]:
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix, precision_recall_curve
from sklearn.metrics import roc_curve,precision_score, recall_score, roc_auc_score
from sklearn.svm import SVC


### 선형 커널(linear)

C만 설정

In [None]:
# Linear-kernel SVM with C=1.0: fit on the scaled training split, then
# report mean accuracy on both splits.
model1_s = SVC(kernel='linear', C=1.0)
model1_s.fit(x_train_s, y_train_s)
score1 = model1_s.score(x_train_s, y_train_s)
score1_t = model1_s.score(x_test_s, y_test_s)
print(
    'model 훈련 점수 : {: .3f}'.format(score1),
    'model 테스트 점수 : {: .3f}'.format(score1_t),
    sep='\n',
)

In [None]:
# Linear-kernel SVM with a smaller penalty, C=0.1.
model2_s = SVC(kernel='linear', C=0.1)
model2_s.fit(x_train_s, y_train_s)
score2 = model2_s.score(x_train_s, y_train_s)
score2_t = model2_s.score(x_test_s, y_test_s)
print(
    'model 훈련 점수 : {: .3f}'.format(score2),
    'model 테스트 점수 : {: .3f}'.format(score2_t),
    sep='\n',
)

### RBF 커널(RBF)

C와 gamma 설정

In [None]:
# RBF-kernel SVM with C=5, gamma=3.
model3_s = SVC(kernel='rbf', C=5, gamma=3)
model3_s.fit(x_train_s, y_train_s)
score3 = model3_s.score(x_train_s, y_train_s)
score3_t = model3_s.score(x_test_s, y_test_s)
print(
    'model 훈련 점수 : {: .3f}'.format(score3),
    'model 테스트 점수 : {: .3f}'.format(score3_t),
    sep='\n',
)

In [None]:
# RBF-kernel SVM with C=1.0, gamma=3.
model5_s = SVC(kernel='rbf', C=1.0, gamma=3)
model5_s.fit(x_train_s, y_train_s)
score5 = model5_s.score(x_train_s, y_train_s)
score5_t = model5_s.score(x_test_s, y_test_s)

print(
    'model 훈련 점수 : {: .3f}'.format(score5),
    'model 테스트 점수 : {: .3f}'.format(score5_t),
    sep='\n',
)

## 성능비교

### 하드마진

### 비교적 성능(연산속도 and 정확도) 좋음(C=1,gamma=3)

In [None]:
# Wall-clock time needed to fit the RBF SVM with C=1.0, gamma=3.
start = time.time()
model5_s = SVC(kernel='rbf', C=1.0, gamma=3)
model5_s.fit(x_train_s, y_train_s)
elapsed = time.time() - start
print("time :", elapsed)

In [None]:
# Score model5_s on both splits and report how long the scoring took.
start = time.time()
for template, features, target in (
    ('model 훈련 점수 : {: .3f}', x_train_s, y_train_s),
    ('model 테스트 점수 : {: .3f}', x_test_s, y_test_s),
):
    print(template.format(model5_s.score(features, target)))
print("time :", time.time() - start)

### 성능(연산속도 or 정확도) 낮음

In [None]:
# Wall-clock time needed to fit the RBF SVM with C=5, gamma=3.
start = time.time()
model3_s = SVC(kernel='rbf', C=5, gamma=3)
model3_s.fit(x_train_s, y_train_s)
elapsed = time.time() - start
print("time :", elapsed)

In [None]:
# Score model3_s on both splits and report how long the scoring took.
start = time.time()
for template, features, target in (
    ('model 훈련 점수 : {: .3f}', x_train_s, y_train_s),
    ('model 테스트 점수 : {: .3f}', x_test_s, y_test_s),
):
    print(template.format(model3_s.score(features, target)))
print("time :", time.time() - start)

### 소프트마진

In [None]:
# Wall-clock time needed to fit the RBF SVM with C=0.5, gamma=1.
start = time.time()
model4_s = SVC(kernel='rbf', C=0.5, gamma=1)
model4_s.fit(x_train_s, y_train_s)
elapsed = time.time() - start
print("time :", elapsed)

In [None]:
# Score model4_s on both splits and report how long the scoring took.
start = time.time()
for template, features, target in (
    ('model 훈련 점수 : {: .3f}', x_train_s, y_train_s),
    ('model 테스트 점수 : {: .3f}', x_test_s, y_test_s),
):
    print(template.format(model4_s.score(features, target)))
print("time :", time.time() - start)

# 모델 평가(모든 피처 포함)

하이퍼파라미터: kernel=rbf, C=1, gamma=3

In [None]:
model5_s

In [None]:
import seaborn as sns

In [None]:
# Confusion matrix of the all-feature model (model5_s) on the test split.
# The matrix is computed here because nothing earlier in the notebook defines
# confusion_m, so a top-to-bottom run used to stop with a NameError.
pred_all = model5_s.predict(x_test_s)
confusion_m = confusion_matrix(y_test_s, pred_all)

# confusion_matrix() puts the true labels on the rows and the predictions on
# the columns, in sorted label order, so the frame is labelled accordingly.
# Assumes the target takes the values 0 and 1 - TODO confirm.
cm_t = pd.DataFrame(data=confusion_m, columns=['Predict Negative:0', 'Predict Positive:1'],
                                 index=['Actual Negative:0', 'Actual Positive:1'])
cm_t

In [None]:
sns.heatmap(cm_t, annot=True, fmt='d', cmap='BuGn_r')

In [None]:
# Test-split metrics of the all-feature model (model5_s), in the order
# accuracy, precision, recall, F1, ROC AUC. They are computed here because
# acc/pre/rec/f1/auc are only defined further down the notebook, so a
# top-to-bottom run used to stop with a NameError.
pred_all = model5_s.predict(x_test_s)
score = [
    accuracy_score(y_test_s, pred_all),
    precision_score(y_test_s, pred_all),
    recall_score(y_test_s, pred_all),
    f1_score(y_test_s, pred_all),
    # ROC AUC ranks samples by a continuous score, so use the decision
    # function rather than the hard class predictions.
    roc_auc_score(y_test_s, model5_s.decision_function(x_test_s)),
]

In [None]:
pd.DataFrame(score,index=['정확도','정밀도','재현율','F1_score','ROC_auc_score'],columns=['summary'])

# 피처선택 후 모델링

In [None]:
# 기본 모델에서 가장 성능이 좋았던 하이퍼 파라미터
svm=SVC(kernel='rbf',C=1.0,gamma=3)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [None]:
# accuracy_list_train = []
# k=np.arange(1,24,1)
# for each in k:
#     # 1~23개의 피처를 이용해  피처 개수마다 서포트벡터머신 적용
#     x_new = SelectKBest(f_classif, k=each).fit_transform(x_train_s, y_train_s)
#     svm.fit(x_new,y_train_s)
#     # 1~23개 이용할 때 마다 계산된 정확도를 저장
#     accuracy_list_train.append(svm.score(x_new,y_train_s))   

## 피처데이터 개수에 따른 정확도

In [None]:
#select=pd.DataFrame({'features':k,'train accuracy':accuracy_list_train})

In [None]:
#select.to_csv('./select.csv')

In [None]:
select_f=pd.read_csv('./select.csv',index_col=0)

### train data에서 feature 9개만으로 정확도가 90%를 넘고, 17개에서 100%

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
select_f

In [None]:
# Train accuracy as a function of the number of selected features.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(select_f['features'], select_f['train accuracy'])
ax.set_xlabel("features")
ax.set_ylabel("train accuracy")
ax.legend(['train'])
plt.show()

## 변수선택 함수

In [None]:
def sel_f(x_train,y_train,k=None):
    """Return the names of the k best columns of x_train by ANOVA F-score.

    Args:
        x_train: Feature DataFrame; its column labels are what is returned.
        y_train: Target values aligned with ``x_train``.
        k: Number of features to keep, or ``'all'``. ``None`` (the default)
            keeps every feature.

    Returns:
        Array of the selected column names, in ``x_train`` column order.
    """
    # SelectKBest rejects k=None, so treat the default as "keep everything".
    selector = SelectKBest(f_classif, k='all' if k is None else k)
    # Only the support mask is needed, so fit without transforming.
    selector.fit(x_train, y_train)
    return x_train.columns.values[selector.get_support()]


In [None]:
f9=sel_f(x_train_s,y_train_s,k=9)

## 모델링 및 정확도 계산 함수

In [None]:
def modeling(x_tr,y_tr,x_te,y_te,C=None,gamma=None):
    """Fit an RBF-kernel SVC and print its fit time, accuracies and scoring time.

    Args:
        x_tr: Training features.
        y_tr: Training labels.
        x_te: Test features.
        y_te: Test labels.
        C: Penalty parameter passed to ``SVC``. ``None`` uses 1.0.
        gamma: RBF kernel coefficient passed to ``SVC``. ``None`` uses
            ``'scale'``.

    Returns:
        Tuple ``(model, score_tr, score_te)``: the fitted ``SVC`` and its
        mean accuracy on the training and test data.
    """
    # SVC does not accept None for C or gamma; fall back to its own defaults.
    C = 1.0 if C is None else C
    gamma = 'scale' if gamma is None else gamma

    start=time.time()
    model=SVC(kernel='rbf',C=C,gamma=gamma).fit(x_tr,y_tr)
    # Seconds spent fitting. The original line here was a syntax error.
    print(time.time()-start)

    start2=time.time()
    score_tr=model.score(x_tr, y_tr)
    score_te=model.score(x_te, y_te)
    print('train accuracy : {: .3f}'.format(score_tr))
    print('test accuracy : {: .3f}'.format(score_te))
    # Seconds spent scoring both splits.
    print(time.time()-start2)
    return model,score_tr,score_te

## 정확도 계산 예시

### k=16

In [None]:
f16=sel_f(x_train_s,y_train_s,k=16)

In [None]:
f16

In [None]:
sel_x_tr=x_train_s[f16]
sel_x_te=x_test_s[f16]

In [None]:
sel_x_tr.head()

In [None]:
sel_x_te.head()

In [None]:
model,score_tr,score_te=modeling(sel_x_tr,y_train_s,sel_x_te,y_test_s,C=1,gamma=3)

In [None]:
print(model,score_tr,score_te)

In [None]:
print('{:0.3f}'.format(score_tr))

In [None]:
print('{}'.format(score_tr))

## 평가지표

In [None]:
# Predict the test split with the k=16 model, build the confusion matrix,
# and report how long both steps took.
start = time.time()
pred_t = model.predict(sel_x_te)
confusion_m = confusion_matrix(y_true=y_test_s, y_pred=pred_t)
elapsed = time.time() - start
print("time :", elapsed)

In [None]:
import seaborn as sns

In [None]:
# confusion_matrix() puts the true labels on the rows and the predictions on
# the columns, in sorted label order, so the frame is labelled accordingly.
# Assumes the target takes the values 0 and 1 - TODO confirm.
cm_t = pd.DataFrame(data=confusion_m, columns=['Predict Negative:0', 'Predict Positive:1'],
                                 index=['Actual Negative:0', 'Actual Positive:1'])
cm_t

In [None]:
sns.heatmap(cm_t, annot=True, fmt='d', cmap='BuGn_r')

### 정밀도 재현율 f1score

In [None]:
# Test-split metrics for the k=16 model.
acc=accuracy_score(y_test_s,pred_t)
pre=precision_score(y_test_s , pred_t)
rec=recall_score(y_test_s , pred_t)
f1=f1_score(y_test_s,pred_t)
# ROC AUC ranks samples by a continuous score; hard 0/1 predictions reduce
# the curve to a single operating point. Use the SVM decision function.
auc=roc_auc_score(y_test_s, model.decision_function(sel_x_te))

In [None]:
# Print accuracy, precision, recall, F1 and ROC AUC with four decimals.
for template, value in (
    ('정확도 : {:.4f}', acc),
    ('정밀도 : {:.4f}', pre),
    ('재현율 : {:.4f}', rec),
    ('F1_score : {:.4f}', f1),
    ('ROC_AUC_score : {:.4f}', auc),
):
    print(template.format(value))

In [None]:
import joblib

In [None]:
joblib.dump(model, './model_k16.pkl')

In [None]:
dir(model)

In [None]:
model.feature_names_in_

### k=17

In [None]:
f17=sel_f(x_train_s,y_train_s,k=17)

In [None]:
f17

In [None]:
sel_x_tr=x_train_s[f17]
sel_x_te=x_test_s[f17]

In [None]:
sel_x_tr.head()

In [None]:
sel_x_tr.shape

In [None]:
sel_x_te.head()

In [None]:
sel_x_te.shape

In [None]:
model2,score_tr2,score_te2=modeling(sel_x_tr,y_train_s,sel_x_te,y_test_s,C=1,gamma=3)

In [None]:
print(model2,score_tr2,score_te2)

In [None]:
model2.feature_names_in_

## 평가지표

In [None]:
# Predict the test split with the k=17 model, build the confusion matrix,
# and report how long both steps took.
start = time.time()
pred_t2 = model2.predict(sel_x_te)
confusion_m2 = confusion_matrix(y_true=y_test_s, y_pred=pred_t2)
elapsed = time.time() - start
print("time :", elapsed)

In [None]:
import seaborn as sns

In [None]:
# confusion_matrix() puts the true labels on the rows and the predictions on
# the columns, in sorted label order, so the frame is labelled accordingly.
# Assumes the target takes the values 0 and 1 - TODO confirm.
cm_t2 = pd.DataFrame(data=confusion_m2, columns=['Predict Negative:0', 'Predict Positive:1'],
                                 index=['Actual Negative:0', 'Actual Positive:1'])
cm_t2

In [None]:
sns.heatmap(cm_t2, annot=True, fmt='d', cmap='BuGn_r')

### 정밀도 재현율 f1score

In [None]:
# Test-split metrics for the k=17 model.
acc2=accuracy_score(y_test_s,pred_t2)
pre2=precision_score(y_test_s , pred_t2)
rec2=recall_score(y_test_s , pred_t2)
f12=f1_score(y_test_s,pred_t2)
# ROC AUC ranks samples by a continuous score; hard 0/1 predictions reduce
# the curve to a single operating point. Use the SVM decision function.
auc2=roc_auc_score(y_test_s, model2.decision_function(sel_x_te))

In [None]:
# Print accuracy, precision, recall, F1 and ROC AUC with four decimals.
for template, value in (
    ('정확도 : {:.4f}', acc2),
    ('정밀도 : {:.4f}', pre2),
    ('재현율 : {:.4f}', rec2),
    ('F1_score : {:.4f}', f12),
    ('ROC_AUC_score : {:.4f}', auc2),
):
    print(template.format(value))

In [None]:
joblib.dump(model2, './model_k17.pkl')

### k=9

In [None]:
sel_x_tr=x_train_s[f9]
sel_x_te=x_test_s[f9]


In [None]:
sel_x_tr.head()

In [None]:
sel_x_te.head()

In [None]:
modeling(sel_x_tr,y_train_s,sel_x_te,y_test_s,C=1,gamma=3)

## gender 제거 후 모델링

In [None]:
x_train_g=x_train_s.drop('gender',axis=1)

In [None]:
x_train_g

In [None]:
x_test_g=x_test_s.drop('gender',axis=1)

In [None]:
x_test_g

In [None]:
# RBF SVM (C=1, gamma=3) trained on the feature set without 'gender',
# scored on both splits.
model_ex = SVC(kernel='rbf', C=1, gamma=3)
model_ex.fit(x_train_g, y_train_s)
score_tr_g = model_ex.score(x_train_g, y_train_s)
score_te_g = model_ex.score(x_test_g, y_test_s)
print(
    'model 훈련 점수 : {: .3f}'.format(score_tr_g),
    'model 테스트 점수 : {: .3f}'.format(score_te_g),
    sep='\n',
)

In [None]:
model_ex.feature_names_in_

In [None]:
score_tr_g

In [None]:
score_te_g