## 2.Load_wine

### (1) 데이터준비

In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [2]:
# 데이터 로드
wine = load_wine()
print(dir(wine))

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']


In [3]:
#정보확인
wine.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

In [4]:
#breast_cancer 의 feature, label Target names 지정
#feature
wine_data = wine.data

#label
wine_label= wine.target

print(wine_data.shape)
print(wine_label.shape)

(178, 13)
(178,)


In [5]:
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

### (2) train, test 데이터 분리

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(wine_data,
                                                    wine_label,
                                                    test_size=0.2,
                                                    random_state=7)
print('X_train 개수: ',len(X_train), ',X_test 개수:', len(X_test))

X_train 개수:  142 ,X_test 개수: 36


In [7]:
X_train.shape, y_train.shape

((142, 13), (142,))

In [8]:
y_test

array([0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 2, 0, 1, 0, 0, 0, 2, 1, 2, 2,
       0, 0, 1, 1, 1, 2, 2, 1, 0, 2, 2, 1, 0, 2])

### (3) 다양한 모델로 학습 및 평가

- Decision Tree 사용해 보기 

In [9]:
#모델 학습
decision_tree = DecisionTreeClassifier(random_state=32) 
decision_tree.fit(X_train, y_train) 
y_pred = decision_tree.predict(X_test)


# 평가
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.81      1.00      0.90        13
           1       1.00      0.71      0.83        14
           2       0.90      1.00      0.95         9

    accuracy                           0.89        36
   macro avg       0.90      0.90      0.89        36
weighted avg       0.91      0.89      0.88        36

[[13  0  0]
 [ 3 10  1]
 [ 0  0  9]]


- Random Forest 사용해 보기

In [10]:
random_forest = RandomForestClassifier(random_state=32) #요기변경
random_forest.fit(X_train, y_train)#요기변경
y_pred = random_forest.predict(X_test)#요기변경

# 평가
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      0.93      0.96        14
           2       0.90      1.00      0.95         9

    accuracy                           0.97        36
   macro avg       0.97      0.98      0.97        36
weighted avg       0.98      0.97      0.97        36

[[13  0  0]
 [ 0 13  1]
 [ 0  0  9]]


- SVM 사용해 보기

In [11]:
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# 평가
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.92      0.85      0.88        13
           1       0.56      0.64      0.60        14
           2       0.25      0.22      0.24         9

    accuracy                           0.61        36
   macro avg       0.58      0.57      0.57        36
weighted avg       0.61      0.61      0.61        36

[[11  0  2]
 [ 1  9  4]
 [ 0  7  2]]


- SGD Classifier 사용해 보기

In [12]:
sgd_model = SGDClassifier()
sgd_model.fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)

# 평가
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.77      0.87        13
           1       1.00      0.14      0.25        14
           2       0.38      1.00      0.55         9

    accuracy                           0.58        36
   macro avg       0.79      0.64      0.56        36
weighted avg       0.84      0.58      0.55        36

[[10  0  3]
 [ 0  2 12]
 [ 0  0  9]]


- Logistic Regression 사용해 보기

In [13]:
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

# 평가
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        13
           1       0.93      0.93      0.93        14
           2       0.90      1.00      0.95         9

    accuracy                           0.94        36
   macro avg       0.94      0.95      0.95        36
weighted avg       0.95      0.94      0.94        36

[[12  1  0]
 [ 0 13  1]
 [ 0  0  9]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### (4) 모델을 평가해 보기

- 총 178개의 데이터에서, 13개의 특성을 가지고 0,1,2번 와인을 구분하는 문제이다.
- Random Forest의 진단이 확률이 100%로 가장 좋다.
- SGD Classifier, SVM의 확률은 90%가 되지 않는다.
- SVM은 선형적으로 구분되지 않는 데이터를 선형으로 구분하기 위해서 고차원으로 투영하는 방법인데 선형이기 때문에 데이터가 변화되어 오히려 정확도가 떨어진 듯하다.