In [1]:
# (1) 필요한 모듈 import

from sklearn.datasets import load_digits                  # 손글씨 데이터
from sklearn.model_selection import train_test_split      # 훈련용데이터, 검증용데이터 분리
from sklearn.metrics import classification_report         # 지표 확인

In [2]:
# (2) 데이터 준비

digits = load_digits()

In [3]:
# (3) 데이터 이해하기

digits_data = digits.data        # 학습지
digits_label = digits.target     # 정답지

digits.target_names    # 정답지: 0 ~ 9

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [4]:
digits_data.shape, digits_label.shape    # 정보가 1797개 담겨있다는 사실 확인할 수 있음

((1797, 64), (1797,))

In [5]:
import pandas as pd

digits_df = pd.DataFrame(data = digits.data, columns = digits.feature_names)    # 8 X 8 이미지의 데이터
digits_df.describe()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
count,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,...,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0
mean,0.0,0.30384,5.204786,11.835838,11.84808,5.781859,1.36227,0.129661,0.005565,1.993879,...,3.725097,0.206455,0.000556,0.279354,5.557596,12.089037,11.809126,6.764051,2.067891,0.364496
std,0.0,0.907192,4.754826,4.248842,4.287388,5.666418,3.325775,1.037383,0.094222,3.19616,...,4.919406,0.984401,0.02359,0.934302,5.103019,4.374694,4.933947,5.900623,4.090548,1.860122
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,11.0,10.0,0.0,0.0,0.0
50%,0.0,0.0,4.0,13.0,13.0,4.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,4.0,13.0,14.0,6.0,0.0,0.0
75%,0.0,0.0,9.0,15.0,15.0,11.0,0.0,0.0,0.0,3.0,...,7.0,0.0,0.0,0.0,10.0,16.0,16.0,12.0,2.0,0.0
max,0.0,8.0,16.0,16.0,16.0,16.0,16.0,15.0,2.0,16.0,...,16.0,13.0,1.0,9.0,16.0,16.0,16.0,16.0,16.0,16.0


In [6]:
# (4) train, test 데이터 분리

X_train, X_test, y_train, y_test = train_test_split(digits_data, digits_label, test_size=0.2, random_state=11)

### 1) Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(random_state=20)

decision_tree.fit(X_train, y_train)    # 학습시키기

DecisionTreeClassifier(random_state=20)

In [8]:
y_pred = decision_tree.predict(X_test)    # test데이터로 예측하기

In [9]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.87      0.93        38
           1       0.86      0.86      0.86        37
           2       0.94      0.82      0.88        39
           3       0.80      0.80      0.80        41
           4       0.85      0.83      0.84        41
           5       0.81      0.96      0.88        27
           6       0.94      0.97      0.95        30
           7       0.91      0.83      0.87        36
           8       0.66      0.79      0.72        34
           9       0.76      0.78      0.77        37

    accuracy                           0.85       360
   macro avg       0.85      0.85      0.85       360
weighted avg       0.86      0.85      0.85       360



### 2) Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [11]:
random_forest = RandomForestClassifier(random_state = 20)

random_forest.fit(X_train, y_train)

RandomForestClassifier(random_state=20)

In [12]:
y_pred = random_forest.predict(X_test)

In [13]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99        38
           1       1.00      1.00      1.00        37
           2       0.97      1.00      0.99        39
           3       1.00      0.93      0.96        41
           4       0.98      0.98      0.98        41
           5       0.93      1.00      0.96        27
           6       1.00      0.97      0.98        30
           7       0.92      0.97      0.95        36
           8       0.94      0.97      0.96        34
           9       0.97      0.95      0.96        37

    accuracy                           0.97       360
   macro avg       0.97      0.97      0.97       360
weighted avg       0.97      0.97      0.97       360



### 3) SVM

In [14]:
from sklearn import svm
svm_model = svm.SVC()

svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        38
           1       1.00      1.00      1.00        37
           2       1.00      1.00      1.00        39
           3       1.00      0.93      0.96        41
           4       1.00      1.00      1.00        41
           5       0.96      1.00      0.98        27
           6       1.00      1.00      1.00        30
           7       0.95      0.97      0.96        36
           8       0.97      1.00      0.99        34
           9       0.97      0.97      0.97        37

    accuracy                           0.99       360
   macro avg       0.99      0.99      0.99       360
weighted avg       0.99      0.99      0.99       360



### 4) SGD Classifier

In [15]:
from sklearn.linear_model import SGDClassifier
sgd_model = SGDClassifier()
sgd_model.fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        38
           1       1.00      0.84      0.91        37
           2       1.00      1.00      1.00        39
           3       1.00      0.95      0.97        41
           4       1.00      0.98      0.99        41
           5       0.84      1.00      0.92        27
           6       1.00      0.97      0.98        30
           7       0.85      0.97      0.91        36
           8       0.87      0.97      0.92        34
           9       1.00      0.89      0.94        37

    accuracy                           0.96       360
   macro avg       0.96      0.96      0.95       360
weighted avg       0.96      0.96      0.96       360



### 5) Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        38
           1       1.00      0.95      0.97        37
           2       1.00      1.00      1.00        39
           3       1.00      0.93      0.96        41
           4       1.00      0.98      0.99        41
           5       0.93      1.00      0.96        27
           6       1.00      0.97      0.98        30
           7       0.95      0.97      0.96        36
           8       0.84      0.91      0.87        34
           9       0.95      0.97      0.96        37

    accuracy                           0.97       360
   macro avg       0.97      0.97      0.97       360
weighted avg       0.97      0.97      0.97       360



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


첫 번째로 사용했던 Decision Tree 모델은 정확도가 85%로 다른 모델보다 현저하게 떨어지고,  
특히 숫자 8의 경우 정밀도가 66%로 매우 떨어짐을 확인할 수 있다.  
그 외의 숫자에서도 심심치 않게 70~80%대의 정밀도와 재현율을 확인할 수 있어 적합지 못한 모델이라고 판단할 수 있다.  

세 번째로 사용했던 SVM 모델은 정확도가 99%로 아주 높게 나타났다.  
정밀도와 재현율을 각각 살펴보아도 100%를 기록한 부분이 많았고, 거의 모두 90% 중후반대를 기록하고 있어 적합한 모델이라고 볼 수 있다.

나머지 모델들 또한 정확도 면에서 각각 97%, 96%, 97%를 기록하여 상당히 높은 점수를 받았지만,  
SGD Classifier나 Logistic Regression에서는 정밀도와 재현율에서 80%대를 기록한 부분이 있어 적절치 못하다.  

손글씨 분류 문제는 실제 손글씨를 입력했을 때 실제 손글씨가 뜻하는 바를 잘못 인지하는 비율이 낮아야 한다.  
따라서 양성을 음성으로 판단하는 비율이 적어야 한다. 이때에는 재현율을 기준으로 평가하는 것이 좋다.  

recall의 weighted avg를 기준으로 보면 1)85%  2)97%  3)99%  4)96%  5)97%로 세 번째 모델(SVM)을 선택하는 것이 가장 좋다.