### SVM(Support Vector Machine)

- Wine data

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine

In [2]:
wine = load_wine()
df = pd.DataFrame(wine.data, columns=wine.feature_names)
df['target'] = wine.target
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [3]:
df.shape

(178, 14)

In [4]:
df.target.value_counts()

target
1    71
0    59
2    48
Name: count, dtype: int64

In [5]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

- Feature 표준화

In [7]:
from sklearn.preprocessing import StandardScaler

wine_std = StandardScaler().fit_transform(wine.data)


- Train/Test dataset 분리

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    wine_std, wine.target, stratify=wine.target, test_size=0.2, random_state=2023
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((142, 13), (36, 13), (142,), (36,))

- SVM 하이퍼 파라메터

In [13]:
# probability : False -> 0, 1로 주겠다. predict_proba 소숫점으로 나눈다.
from sklearn.svm import SVC

svc = SVC(random_state=2023)
svc.get_params()


{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 2023,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [27]:
params = {'C':[0.35, 0.4, 0.45, 0.5]}

In [28]:
from sklearn.model_selection import GridSearchCV
grid_svc = GridSearchCV(
    svc, params, scoring='accuracy', cv=5
)
grid_svc.fit(X_train, y_train)

In [29]:
grid_svc.best_params_

{'C': 0.4}

In [30]:
best_svc = grid_svc.best_estimator_
best_svc.score(X_test, y_test)

0.9722222222222222

In [31]:
best_svc.predict(X_test[:5])

array([2, 2, 2, 0, 1])

- predict_proba() method를 사용하려면 하이퍼파라메터 probability를 True로 지정해야 함

In [32]:
svc2 = SVC(probability=True, random_state=2023)
svc2.fit(X_train, y_train)


In [33]:
svc2.predict_proba(X_test[:5])

array([[0.00687812, 0.00732721, 0.98579467],
       [0.02918459, 0.05367997, 0.91713545],
       [0.0115299 , 0.02449743, 0.96397267],
       [0.99309329, 0.00120906, 0.00569765],
       [0.01344319, 0.95254378, 0.03401303]])

- Kaggle Red Wine Quality dataset

In [37]:
rw = pd.read_csv('../data/winequality-red.csv')
rw.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [35]:
rw.shape, rw.quality.value_counts()

((1599, 12),
 quality
 5    681
 6    638
 7    199
 4     53
 8     18
 3     10
 Name: count, dtype: int64)

In [38]:
rw.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [39]:
# Good / Poor 2진 등급으로 분류
rw['target'] = rw.quality.apply(lambda x: 1 if x >= 6 else 0)
rw.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,target
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0


In [40]:
X = rw.iloc[:, :-2].values
y = rw.target.values

In [41]:
# 표준화 
X_std = StandardScaler().fit_transform(X)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(
    X_std, y, stratify=y, test_size=0.2, random_state=2023
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1279, 11), (320, 11), (1279,), (320,))

In [46]:
svc = SVC(probability=True, random_state=2023)
params = {'C': [0.9, 1, 1.1]}
grid_svc = GridSearchCV(svc, params, scoring='accuracy', cv=5)
grid_svc.fit(X_train, y_train)
grid_svc.best_params_

{'C': 1}

In [47]:
grid_svc.best_estimator_.score(X_test, y_test)

0.790625

In [48]:
grid_svc.best_estimator_.predict_proba(X_test[:5])

array([[0.0399958 , 0.9600042 ],
       [0.13782247, 0.86217753],
       [0.46369645, 0.53630355],
       [0.56697554, 0.43302446],
       [0.2808077 , 0.7191923 ]])

In [49]:
y_test[:5]

array([1, 1, 1, 0, 1], dtype=int64)