### SVM (Support Vecotr Machine)

- Wine data

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

In [3]:
wine = load_wine()
df = pd.DataFrame(wine.data, columns=wine.feature_names)
df['target'] = wine.target
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [4]:
df.shape

(178, 14)

In [5]:
df.target.value_counts()

1    71
0    59
2    48
Name: target, dtype: int64

In [6]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

- Feature 표준화

In [10]:
from sklearn.preprocessing import StandardScaler
wine_std = StandardScaler().fit_transform(wine.data)
wine_std

array([[ 1.51861254, -0.5622498 ,  0.23205254, ...,  0.36217728,
         1.84791957,  1.01300893],
       [ 0.24628963, -0.49941338, -0.82799632, ...,  0.40605066,
         1.1134493 ,  0.96524152],
       [ 0.19687903,  0.02123125,  1.10933436, ...,  0.31830389,
         0.78858745,  1.39514818],
       ...,
       [ 0.33275817,  1.74474449, -0.38935541, ..., -1.61212515,
        -1.48544548,  0.28057537],
       [ 0.20923168,  0.22769377,  0.01273209, ..., -1.56825176,
        -1.40069891,  0.29649784],
       [ 1.39508604,  1.58316512,  1.36520822, ..., -1.52437837,
        -1.42894777, -0.59516041]])

- Train/Test dataset 분리

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    wine_std, wine.target, test_size=0.2, stratify=wine.target, random_state=2023
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((142, 13), (36, 13), (142,), (36,))

- SVM 하이퍼파라미터

In [13]:
from sklearn.svm import SVC

svc = SVC(random_state=2023)
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 2023,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [14]:
params = {'C': [0.01, 0.1, 1, 10, 100]}

In [15]:
from sklearn.model_selection import GridSearchCV
grid_svc = GridSearchCV(
    svc, params, scoring='accuracy', cv=5
)
grid_svc.fit(X_train, y_train)

In [17]:
grid_svc.best_params_

{'C': 0.1}

In [18]:
params = {'C': [0.05, 0.08, 0.1, 0.2, 0.3, 0.5]}

In [19]:
grid_svc = GridSearchCV(
    svc, params, scoring='accuracy', cv=5
)
grid_svc.fit(X_train, y_train)

In [20]:
grid_svc.best_params_

{'C': 0.2}

In [22]:
grid_svc.best_score_

0.9790640394088671

In [23]:
params = {'C': [0.1, 0.12, 0.14, 0.16, 0.18, 0.2]}

In [24]:
grid_svc = GridSearchCV(
    svc, params, scoring='accuracy', cv=5
)
grid_svc.fit(X_train, y_train)

In [25]:
grid_svc.best_params_

{'C': 0.16}

In [27]:
best_svc = grid_svc.best_estimator_
best_svc.score(X_test, y_test)

0.9722222222222222

In [28]:
best_svc.predict(X_test)

array([2, 2, 2, 0, 1, 1, 1, 0, 2, 1, 2, 0, 2, 1, 1, 1, 1, 2, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 2, 1, 1, 1, 2, 0, 0, 2, 0])

- predict_proba() method를 쓰려면, 하이퍼파라미터 probability=True 옵션 줘야함

In [30]:
svc2 = SVC(probability=True, random_state=2023)
svc2.fit(X_train, y_train)

In [31]:
svc2.predict_proba(X_test)

array([[6.87811936e-03, 7.32721083e-03, 9.85794670e-01],
       [2.91845859e-02, 5.36799685e-02, 9.17135446e-01],
       [1.15299028e-02, 2.44974259e-02, 9.63972671e-01],
       [9.93093292e-01, 1.20906203e-03, 5.69764568e-03],
       [1.34431896e-02, 9.52543784e-01, 3.40130260e-02],
       [2.74885039e-01, 6.83701749e-01, 4.14132121e-02],
       [4.36021265e-02, 7.10137583e-01, 2.46260291e-01],
       [9.55234774e-01, 2.96544551e-02, 1.51107704e-02],
       [2.74087217e-02, 1.21894107e-01, 8.50697171e-01],
       [2.12617574e-05, 9.96941351e-01, 3.03738690e-03],
       [1.86556562e-02, 2.23057832e-02, 9.59038561e-01],
       [9.79388328e-01, 1.01950584e-02, 1.04166141e-02],
       [1.62591878e-02, 2.41213507e-02, 9.59619461e-01],
       [1.22583020e-02, 9.79132627e-01, 8.60907091e-03],
       [8.79570963e-03, 9.63243964e-01, 2.79603267e-02],
       [1.51177104e-01, 8.36628272e-01, 1.21946237e-02],
       [2.08956047e-02, 9.59645813e-01, 1.94585819e-02],
       [5.88353620e-03, 5.37571

- Kaggle Red Wine Quality dataset

In [36]:
rw = pd.read_csv('data/winequality_red.csv')

In [38]:
rw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [41]:
print(rw.shape)
print(rw.quality.value_counts())

(1599, 12)
5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64


In [44]:
rw['target'] = rw.quality.apply(lambda x: 1 if x>=6 else 0)
rw

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,target
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,0
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,0
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,0
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,0


In [45]:
rw.target.value_counts()

1    855
0    744
Name: target, dtype: int64

In [47]:
X = rw.iloc[:, :-2].values
y = rw.target.values

In [48]:
# 표준화
X_std = StandardScaler().fit_transform(X)

In [49]:
X_train, X_test, y_train, y_test = train_test_split(
    X_std, y, test_size=0.2, random_state=2023, stratify=y
)

In [50]:
svc = SVC(probability=True, random_state=2023)
params = {'C': [0.01, 0.1, 0.5, 1, 5, 10, 100]}
grid_svc = GridSearchCV(svc, params, scoring='accuracy', cv=5)
grid_svc.fit(X_train, y_train)
grid_svc.best_params_

{'C': 1}

In [53]:
grid_svc.best_estimator_.score(X_test, y_test)

0.790625

In [56]:
grid_svc.best_estimator_.predict_proba(X_test)[:5]

array([[0.0399958 , 0.9600042 ],
       [0.13782246, 0.86217754],
       [0.46369644, 0.53630356],
       [0.56697554, 0.43302446],
       [0.28080769, 0.71919231]])

In [57]:
y_test[:5]

array([1, 1, 1, 0, 1], dtype=int64)