# 성능지표 - 분류
- 정확도 (Accuracy) : 전체 예측 중에서 정답(실제)과 일치한 예측의 비율
- 정밀도 (Precision) : 모델(예측) 기준으로 모델이 True라고 한 것이 정답(실제)도 True인 것
- 재현율 (Recall) : 정답(실제) 기준으로 정답(실제)이 True 라고 한 것이 모델(예측)도 True인 것
- F1-Score : 정밀도와 재현율의 조화평균으로, 두 지표를 하나로 조합한 점수
- 오차행렬 (Confusion Matrix) : 정답과 예측값의 관계를 나타내는 표

## [1] 모듈 로딩 및 데이터 준비

In [59]:
# 모듈 로딩
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score # sklearn에서 제공하는 모델 성능평가 관련 모듈

import pandas as pd
import numpy as np

In [60]:
# Load the digits dataset; as_frame=True is requested so the data and target
# can be handled as pandas objects in the following cells.
digit_bunch = load_digits(as_frame=True)

## [2] 피쳐와 레이블 추출

In [61]:
# Feature table: pixel values of the digit images (digits 0-9).
img_df = digit_bunch['data']
img_df

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,4.0,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0


In [62]:
# Target labels (digits 0-9); show the distinct label values.
label_sr = digit_bunch['target']
label_sr.unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [63]:
# Binary-classification setup: separate the digit 7 from every other label.
mask = label_sr == 7   # True where the label is 7
not_seven = ~mask      # True where the label is anything other than 7
label_sr[not_seven]

0       0
1       1
2       2
3       3
4       4
       ..
1792    9
1793    0
1794    8
1795    9
1796    8
Name: target, Length: 1618, dtype: int32

In [91]:
# Display the boolean mask built from `label_sr == 7`.
mask

0       False
1       False
2       False
3       False
4       False
        ...  
1792    False
1793    False
1794    False
1795    False
1796    False
Name: target, Length: 1797, dtype: bool

In [64]:
# Relabel every non-7 sample as 0 (negative class).
# NOTE(review): label_sr is the same object as digit_bunch['target'], so this
# overwrites the loaded labels in place. Once the 7s are relabeled as well,
# recomputing the mask from label_sr will no longer find them; reload the
# dataset before re-running the earlier cells.
label_sr[~mask] = 0
label_sr

0       0
1       0
2       0
3       0
4       0
       ..
1792    0
1793    0
1794    0
1795    0
1796    0
Name: target, Length: 1797, dtype: int32

In [65]:
# Relabel the samples that were originally 7 as 1 (positive class).
label_sr[mask] = 1

In [66]:
# Count how many samples fall into each of the two classes.
label_sr.value_counts()

target
0    1618
1     179
Name: count, dtype: int64

In [67]:
# label_sr = label_sr
# label_sr2 = (label_sr == 7).astype(int)
# label_sr2.value_counts()

## [3] 학습용 데이터셋 준비

In [68]:
# stratify=label_sr keeps the class ratio of the labels the same in the
# training and test splits; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    img_df,
    label_sr,
    stratify=label_sr,
    random_state=1,
)

In [69]:
# Print the per-class share of each split to confirm the stratified split
# preserved the class ratio.
for tag, labels in (('[Train]', y_train), ('[Test]', y_test)):
    print(tag, labels.value_counts() / labels.shape[0])

[Train] target
0    0.90052
1    0.09948
Name: count, dtype: float64
[Test] target
0    0.9
1    0.1
Name: count, dtype: float64


## [4] 클래스 정의

In [70]:
# User-defined dummy classifier (MyClassifier): it learns nothing and always
# predicts the negative class. It exists only as a baseline for testing the
# evaluation metrics.
from sklearn.base import BaseEstimator


class MyClassifier(BaseEstimator):
    """Baseline estimator that ignores its input and predicts False for every sample."""

    def fit(self, x, y):
        """Do nothing and return the estimator.

        Args:
            x: Training features (unused).
            y: Training labels (unused).

        Returns:
            self, following the scikit-learn convention so that calls such as
            ``MyClassifier().fit(x, y).predict(x)`` can be chained.
        """
        return self

    def predict(self, x):
        """Return an all-False prediction for every row of ``x``.

        Args:
            x: Any sized collection of samples; only ``len(x)`` is used.

        Returns:
            A boolean NumPy array of shape ``(len(x), 1)`` filled with False.
        """
        return np.zeros((len(x), 1), dtype=bool)

np.zeros()를 통해 모든 것을 0으로 예측해도 정확도가 90% 

-> 신뢰 불가

## [5] 학습

In [71]:
# Instantiate the dummy classifier and run its (no-op) fit on the training split.
model = MyClassifier()
model.fit(x_train, y_train)

## [6] 예측 및 성능 평가

In [72]:
# Predict on the training features; the dummy classifier returns False for every row.
pre = model.predict(x_train)
pre


array([[False],
       [False],
       [False],
       ...,
       [False],
       [False],
       [False]])

In [73]:
# Keep the result under its own name: binding it to `accuracy_score` would
# shadow the function imported from sklearn.metrics, and running this cell a
# second time would fail because a float is not callable.
train_accuracy = accuracy_score(y_train, pre)
train_accuracy

0.9005196733481812

### [6-1] 정확도 => 불균형 데이터 경우 신뢰 불가

### [6-2] 오차행렬

In [None]:
# Confusion-matrix and report helpers from sklearn.metrics.
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Pass the true labels first and the predictions second.
# labels=[0,1] fixes the row/column order: rows are the true class and
# columns the predicted class, i.e. [[tn, fp], [fn, tp]].
confusion_matrix(y_train, pre, labels=[0,1])
# confusion_matrix: the error (confusion) matrix of true vs. predicted labels

array([[1213,    0],
       [ 134,    0]], dtype=int64)

In [79]:
# Class counts in the training labels, for comparison with the matrix rows.
y_train.value_counts()

target
0    1213
1     134
Name: count, dtype: int64

In [80]:
# Class counts in the test labels.
y_test.value_counts()

target
0    405
1     45
Name: count, dtype: int64

In [77]:
# Flatten the 2x2 confusion matrix into its four cells.
# labels=[0, 1] pins the class order explicitly, so the matrix is always 2x2
# and the unpacking receives exactly (tn, fp, fn, tp) regardless of which
# classes happen to appear in the inputs.
tn, fp, fn, tp = confusion_matrix(y_train, pre, labels=[0, 1]).ravel()
print(tn, fp, fn, tp)

1213 0 134 0


False(0)이 1213개, True(1)이 134개

#### 정밀도 (precision) 모델(예측) 중심

In [82]:

# Precision looks at the predictions: tp / (tp + fp).
from sklearn.metrics import precision_score

# `pre` contains no positive predictions, so tp + fp is 0 and precision is
# undefined; zero_division=0 makes the score 0 instead of raising
# UndefinedMetricWarning.
precision_score(y_train,pre, zero_division=0)

0.0

모델이 True로 예측한 샘플이 하나도 없어 정밀도의 분모(TP + FP)가 0이 되면 UndefinedMetricWarning 발생 -> zero_division=0 을 지정하면 경고 없이 0.0 을 반환

#### 재현율(Recall) : 정답(실제) 중심

In [83]:
# Recall looks at the actual positives: tp / (tp + fn).
from sklearn.metrics import recall_score

# None of the actual positives are predicted as positive, so recall is 0.
recall_score(y_train,pre)

0.0

----------------------------------

In [84]:
# Hand-made toy example: true labels and predictions for 20 samples.
# NOTE(review): this rebinds y_test, replacing the test labels produced by
# train_test_split earlier in the notebook.
y_test = [0,0,0,1,1, 1,0,0,0,1,  1,1,1,0,0,  0,0,0,0,0]
y_pre  = [1,1,1,1,1, 1,0,1,0,1,  1,1,1,0,0,  0,0,0,0,0]

In [86]:
# Pass the true labels and the predictions, then flatten the confusion
# matrix into its four cells.
tn, fp, fn, tp = confusion_matrix(y_test, y_pre).ravel()
print(tn, fp, fn, tp)

9 4 0 7


In [87]:
# Show the confusion matrix of the toy labels in its 2x2 form.
confusion_matrix(y_test, y_pre) 

array([[9, 4],
       [0, 7]], dtype=int64)

In [89]:
# Precision on the toy labels: tp / (tp + fp) = 7 / (7 + 4).
from sklearn.metrics import precision_score 

precision_score(y_test, y_pre)

0.6363636363636364

In [90]:
# Recall on the toy labels: tp / (tp + fn) = 7 / (7 + 0).
from sklearn.metrics import recall_score

recall_score(y_test, y_pre)

1.0

정밀도와 재현율은 트레이드오프 관계라서, 한쪽을 높이면 다른 쪽은 낮아지기 쉬움 (위 예시: 재현율 1.0, 정밀도 약 0.64)

-> F1_SCORE 확인