In [19]:
# (1) 필요한 모듈 import하기
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

# (2) 데이터 준비
digits = load_digits()

# (3) 데이터 이해하기
X = digits.data # Feature Data 지정하기
print("Data, Feature :", X.shape)
y = digits.target # Label Data 지정하기
print(f"Target Names : {digits.target_names}")  # Target Names 출력

digits_df = pd.DataFrame(X)
digits_df.describe() # 데이타 Describe 

# (4) train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Data, Feature : (1797, 64)
Target Names : [0 1 2 3 4 5 6 7 8 9]


데이터셋 특징 : 
1. digits 데이터셋의 경우 8x8 픽셀 이미지를 1차원으로 펼친 64개의 특성을 가짐  
2. 각 샘플이 0부터 9까지의 숫자 중 하나로 레이블링됨

# (5)(6) 개별 모델 학습 및 평가

In [39]:
# 1. Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
print("\nDecision Tree 모델 평가 결과:")
print(classification_report(y_test, dt_pred, digits=3))


Decision Tree 모델 평가 결과:
              precision    recall  f1-score   support

           0      0.967     0.879     0.921        33
           1      0.846     0.786     0.815        28
           2      0.857     0.727     0.787        33
           3      0.763     0.853     0.806        34
           4      0.840     0.913     0.875        46
           5      0.889     0.851     0.870        47
           6      0.970     0.914     0.941        35
           7      0.816     0.912     0.861        34
           8      0.750     0.700     0.724        30
           9      0.750     0.825     0.786        40

    accuracy                          0.842       360
   macro avg      0.845     0.836     0.838       360
weighted avg      0.846     0.842     0.842       360



In [40]:
# 2.Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print("\nRandom Forest 모델 평가 결과:")
print(classification_report(y_test, rf_pred, digits=3))


Random Forest 모델 평가 결과:
              precision    recall  f1-score   support

           0      1.000     0.970     0.985        33
           1      0.966     1.000     0.982        28
           2      1.000     1.000     1.000        33
           3      1.000     0.941     0.970        34
           4      0.979     1.000     0.989        46
           5      0.938     0.957     0.947        47
           6      0.971     0.971     0.971        35
           7      0.971     0.971     0.971        34
           8      0.967     0.967     0.967        30
           9      0.950     0.950     0.950        40

    accuracy                          0.972       360
   macro avg      0.974     0.973     0.973       360
weighted avg      0.973     0.972     0.972       360



In [41]:
# 3. SVM
svm = SVC(random_state=42)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
print("\nSVM 모델 평가 결과:")
print(classification_report(y_test, svm_pred, digits=3))


SVM 모델 평가 결과:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000        33
           1      1.000     1.000     1.000        28
           2      1.000     1.000     1.000        33
           3      1.000     1.000     1.000        34
           4      1.000     1.000     1.000        46
           5      0.979     0.979     0.979        47
           6      0.972     1.000     0.986        35
           7      0.971     0.971     0.971        34
           8      1.000     0.967     0.983        30
           9      0.950     0.950     0.950        40

    accuracy                          0.986       360
   macro avg      0.987     0.987     0.987       360
weighted avg      0.986     0.986     0.986       360



In [43]:
# 4. SGD Classifier
sgd = SGDClassifier(random_state=42)
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_test)
print("\nSGD 모델 평가 결과:")
print(classification_report(y_test, sgd_pred, digits=3))


SGD 모델 평가 결과:
              precision    recall  f1-score   support

           0      1.000     0.970     0.985        33
           1      0.931     0.964     0.947        28
           2      0.971     1.000     0.985        33
           3      1.000     0.971     0.985        34
           4      1.000     1.000     1.000        46
           5      0.935     0.915     0.925        47
           6      0.944     0.971     0.958        35
           7      0.971     0.971     0.971        34
           8      0.897     0.867     0.881        30
           9      0.902     0.925     0.914        40

    accuracy                          0.956       360
   macro avg      0.955     0.955     0.955       360
weighted avg      0.956     0.956     0.956       360



In [42]:
# 5. Logistic Regression
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
print("\nLogistic Regression 모델 평가 결과:")
print(classification_report(y_test, lr_pred, digits=3))


Logistic Regression 모델 평가 결과:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000        33
           1      0.966     1.000     0.982        28
           2      0.971     1.000     0.985        33
           3      0.971     0.971     0.971        34
           4      1.000     0.957     0.978        46
           5      0.917     0.936     0.926        47
           6      0.944     0.971     0.958        35
           7      1.000     0.971     0.985        34
           8      0.967     0.967     0.967        30
           9      0.974     0.950     0.962        40

    accuracy                          0.969       360
   macro avg      0.971     0.972     0.971       360
weighted avg      0.970     0.969     0.970       360



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
# 모델 성능 비교 프로세스
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
models = {'Decision Tree': dt,'Random Forest': rf,'SVM': svm,'SGD': sgd,'Logistic Regression': lr}
results = []
for name, model in models.items():
    pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted') 
    results.append({'Model': name, 'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1-score': f1})
results_df = pd.DataFrame(results, index=range(1, 6))
print("\n손글씨 데이터 모델별 성능 비교:")
print(results_df.round(3))



손글씨 데이터 모델별 성능 비교:
                 Model  Accuracy  Precision  Recall  F1-score
1        Decision Tree     0.842      0.846   0.842     0.842
2        Random Forest     0.972      0.973   0.972     0.972
3                  SVM     0.986      0.986   0.986     0.986
4                  SGD     0.956      0.956   0.956     0.956
5  Logistic Regression     0.969      0.970   0.969     0.970


![image.png](attachment:image.png)

# 평가 비교

### 평가지표 선택 = Accuracy
- 손글씨 숫자 특성상 비교적 균형잡힌 분포의 데이타셋이고   
- 모든 클래스가(0~9)가 비슷한 중요도를 가짐짐
- 단순히 맞냐 틀리냐로 평가하는 것이 중요하기 때문에 

### 평가모델 선택 = SVM
- 데이터셋이 64차원(8x8)인데 SVM은 고차원 데이타에 강하다.  
- 모든 평가지표에서 일관되게 높은 성능을 보였다.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

models = {
    'Decision Tree': dt,
    'Random Forest': rf,
    'SVM': svm,
    'SGD': sgd,
    'Logistic Regression': lr
}

results = []
for name, model in models.items():
    pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    
    results.append({
        'Model': name,
        'Accuracy': f'{accuracy*100:.1f}%',
        'Precision': f'{precision*100:.1f}%',
        'Recall': f'{recall*100:.1f}%',
        'F1-score': f'{f1*100:.1f}%'
    })

results_df = pd.DataFrame(results, index=range(1, 6))

numeric_columns = ['Accuracy', 'Precision', 'Recall', 'F1-score']
for col in numeric_columns:
    results_df[col] = results_df[col].str.rstrip('%').astype(float)

plt.figure(figsize=(12, 6))

x = np.arange(len(results_df))
width = 0.2

plt.bar(x - width*1.5, results_df['Accuracy'], width, label='Accuracy', color='skyblue')
plt.bar(x - width/2, results_df['Precision'], width, label='Precision', color='lightgreen')
plt.bar(x + width/2, results_df['Recall'], width, label='Recall', color='salmon')
plt.bar(x + width*1.5, results_df['F1-score'], width, label='F1-score', color='orange')

plt.xlabel('Models', fontsize=12)
plt.ylabel('Performance (%)', fontsize=12)
plt.title('1.Digits_Model Performance Comparison', fontsize=18, pad=20)
plt.xticks(x, results_df['Model'], rotation=0)
plt.legend(fontsize=14)  # 범례 폰트 크기 조정
plt.tick_params(axis='both', labelsize=12)  # x, y축 눈금 라벨 크기 조정
plt.grid(True, alpha=0.3)

plt.ylim(80, 100)

plt.tight_layout()

plt.show()