## Support Vector Machine
- 데이터 크기가 중간 크기 이하 여러 변수를 기준으로 분류하는 다소 복잡한 과제에 적합한 ML 기법 
- 레이블 범주를 선형적으로 구분하거나 비선형적으ㅗㄹ 분류하는 선 혹은 초평면을 찾는 것이 핵심

### Classification

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd 
data = pd.read_csv("../Data/breast-cancer-wisconsin.csv", encoding="utf-8")
X = data.iloc[:, 1:10]
y = data[["Class"]]

In [2]:
from sklearn.model_selection import * 
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=410)

from sklearn.preprocessing import * 
minmax = MinMaxScaler()
minmax.fit(X_train)
X_scaled_train = minmax.transform(X_train)
X_scaled_test = minmax.transform(X_test)

In [3]:
import sklearn
help(sklearn)

Help on package sklearn:

NAME
    sklearn

DESCRIPTION
    Machine learning module for Python
    
    sklearn is a Python module integrating classical machine
    learning algorithms in the tightly-knit world of scientific Python
    packages (numpy, scipy, matplotlib).
    
    It aims to provide simple and efficient solutions to learning problems
    that are accessible to everybody and reusable in various contexts:
    machine-learning as a versatile tool for science and engineering.
    
    See http://scikit-learn.org for complete documentation.

PACKAGE CONTENTS
    __check_build (package)
    _build_utils (package)
    _config
    _distributor_init
    _isotonic
    _loss (package)
    _min_dependencies
    base
    calibration
    cluster (package)
    compose (package)
    conftest
    covariance (package)
    cross_decomposition (package)
    datasets (package)
    decomposition (package)
    discriminant_analysis
    dummy
    ensemble (package)
    exceptions
    experime

In [4]:
from sklearn.svm import *
dir(sklearn.svm)

['LinearSVC',
 'LinearSVR',
 'NuSVC',
 'NuSVR',
 'OneClassSVM',
 'SVC',
 'SVR',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_base',
 '_bounds',
 '_classes',
 '_liblinear',
 '_libsvm',
 '_libsvm_sparse',
 'l1_min_c']

In [5]:
model = SVC() 
model.fit(X_scaled_train, y_train)
pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, y_train)

0.9765625

In [6]:
pred_test = model.predict(X_scaled_test)
model.score(X_scaled_test, y_test)

0.9707602339181286

In [7]:
from sklearn.metrics import * 
con_train = confusion_matrix(y_train, pred_train)
print(con_train)

[[325   8]
 [  4 175]]


In [8]:
report_train = classification_report(y_train, pred_train)
print(report_train)

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       333
           1       0.96      0.98      0.97       179

    accuracy                           0.98       512
   macro avg       0.97      0.98      0.97       512
weighted avg       0.98      0.98      0.98       512



In [9]:
con_test = confusion_matrix(y_test, pred_test)
print(con_test)

[[107   4]
 [  1  59]]


In [10]:
report_test = classification_report(y_test, pred_test)
print(report_test)

              precision    recall  f1-score   support

           0       0.99      0.96      0.98       111
           1       0.94      0.98      0.96        60

    accuracy                           0.97       171
   macro avg       0.96      0.97      0.97       171
weighted avg       0.97      0.97      0.97       171



In [11]:
help(SVC)

Help on class SVC in module sklearn.svm._classes:

class SVC(sklearn.svm._base.BaseSVC)
 |  SVC(*, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)
 |  
 |  C-Support Vector Classification.
 |  
 |  The implementation is based on libsvm. The fit time scales at least
 |  quadratically with the number of samples and may be impractical
 |  beyond tens of thousands of samples. For large datasets
 |  consider using :class:`~sklearn.svm.LinearSVC` or
 |  :class:`~sklearn.linear_model.SGDClassifier` instead, possibly after a
 |  :class:`~sklearn.kernel_approximation.Nystroem` transformer.
 |  
 |  The multiclass support is handled according to a one-vs-one scheme.
 |  
 |  For details on the precise mathematical formulation of the provided
 |  kernel functions and how `gamma`, `coef0` and `degree` affect each
 

In [12]:
gridparam = [{"kernel" : ['rbf'], "C" : [0.001, 0.01, 0.1, 1, 10, 100], 
               "gamma" : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]},
              {"kernel" : ['linear'], "C" : [0.001, 0.01, 0.1, 1, 10, 100], 
               "gamma" : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}]
gridSearch = GridSearchCV(SVC(), gridparam, cv = 5)
gridSearch.fit(X_scaled_train, y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid=[{'C': [0.001, 0.01, 0.1, 1, 10, 100],
                          'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                          'kernel': ['rbf']},
                         {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                          'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                          'kernel': ['linear']}])

In [13]:
print("Best param : {}".format(gridSearch.best_params_))
print("Best Score : {}".format(gridSearch.best_score_))
print("Test score : {}".format(gridSearch.score(X_scaled_test, y_test)))

Best param : {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
Best Score : 0.9726442033123929
Test score : 0.9824561403508771


In [14]:
from scipy.stats import randint
paramDist = {'kernel' : ['rbf'], "C" : randint(low = 0.001, high = 100),
"gamma" : randint(low = 0.0001, high=100)}

In [15]:
randSearch = RandomizedSearchCV(SVC(), paramDist, n_iter=100, cv = 5)
randSearch.fit(X_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=SVC(), n_iter=100,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x160181d90>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1601d3ee0>,
                                        'kernel': ['rbf']})

In [16]:
print("Best Param : {}".format(randSearch.best_params_))
print("Best score : {}".format(randSearch.best_score_))
print("Test score : {}".format(randSearch.score(X_scaled_test, y_test)))

Best Param : {'C': 76, 'gamma': 6, 'kernel': 'rbf'}
Best score : 0.9668189605939463
Test score : 0.9649122807017544


### Regression

In [17]:
import pandas as pd 
data2 = pd.read_csv("../Data/house_price.csv", encoding="utf-8")
X = data2.iloc[:, 1:5]
y = data2[['house_value']]

from sklearn.model_selection import * 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=410)

from sklearn.preprocessing import * 
minmax = MinMaxScaler()
minmax.fit(X_train)
X_scaled_train = minmax.transform(X_train)
X_scaled_test = minmax.transform(X_test)

In [18]:
from sklearn.svm import * 
model = SVR(kernel="poly")
# kernel 종류는 linear, poly, rbf, sigmoid, precomputed가 존재
model.fit(X_scaled_train, y_train)
pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, y_train)

0.4544889128338093

In [19]:
pred_test = model.predict(X_scaled_test)
model.score(X_scaled_test, y_test)

0.45671151623706674

In [20]:
import numpy as np 
from sklearn.metrics import * 
MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)

RMSE_train = np.sqrt(MSE_train)
RMSE_test = np.sqrt(MSE_test)

print("학습 데이터 RMSE : ", RMSE_train)
print("테스트 데이터 RMSE : ", RMSE_test)

학습 데이터 RMSE :  70070.74336818761
테스트 데이터 RMSE :  71678.22245407068


In [21]:
param_grid = {"kernel" : ['poly'], "C" : [0.01, 0.1], 
"gamma" : [0.01, 0.1]}

grid_search = GridSearchCV(SVC(), param_grid, cv = 5)
grid_search.fit(X_scaled_train, y_train)
# 시간이 너무 오래 걸려서 못함

In [None]:
print("Best Param : {}".format(grid_search.best_params_))
print("Best Score : {}".format(grid_search.best_score_))
print("Test score : {}".format(grid_search.score(X_scaled_test, y_test)))

In [None]:
param_dist = {'kernel' : ['poly'], "C" : randint(low = 0.01, high = 10), 
"gamma" : randint(low = 0.01, high = 10)}
randomSearch = RandomizedSearchCV(SVR(kernel="poly"), param_dist, n_iter=100, cv = 5)
randomSearch.fit(X_scaled_train, y_train)

In [None]:
print("Best Param : {}".format(randomSearch.best_params_))
print("Best Score : {}".format(randomSearch.best_score_))
print("Test score : {}".format(randomSearch.score(X_scaled_test, y_test)))