# 손글씨를 분류해 봅시다

### 1. 필요한 모듈 import

In [203]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### 2. 손글씨 데이터셋 준비

In [32]:
# Load scikit-learn's handwritten-digits dataset.
digits = load_digits() 

### 3. 데이터 이해하기

In [225]:
# Pull the individual pieces out of the loaded dataset for the later cells.
digits_data = digits.data  # sample data
digits_label = digits.target  # label of each sample
digits_feature = digits.feature_names  # feature names
digits_target = digits.target_names  # class names

print(f'Target Names : {digits_target}')

Target Names : [0 1 2 3 4 5 6 7 8 9]


In [119]:
print(digits.DESCR) # dataset description

# The dataset contains images of handwritten digits
# Each image is made of 8x8 pixels
# The labels cover 10 digit classes

.. _digits_dataset:

Optical recognition of handwritten digits dataset
--------------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 5620
    :Number of Attributes: 64
    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
    :Missing Attribute Values: None
    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
    :Date: July; 1998

This is a copy of the test set of the UCI ML hand-written digits datasets
https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

The data set contains images of hand-written digits: 10 classes where
each class refers to a digit.

Preprocessing programs made available by NIST were used to extract
normalized bitmaps of handwritten digits from a preprinted form. From a
total of 43 people, 30 contributed to the training set and different 13
to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of
4x4 and the number of on pixels are counted in each blo

### 4. train, test 데이터 분리

In [131]:
# Hold out 20% of the samples as the test split; random_state pins the shuffle.
(digits_X_train,
 digits_X_test,
 digits_y_train,
 digits_y_test) = train_test_split(
    digits.data,
    digits_label,
    test_size=0.2,
    random_state=7,
)

#### 모델 함수 준비

###### Decision Tree 

In [153]:
from sklearn.tree import DecisionTreeClassifier
    
def runDecisionTree(X_train, y_train, X_test):
    """Fit a decision tree on the training split and predict the test split.

    Args:
        X_train: Training features.
        y_train: Training labels aligned with ``X_train``.
        X_test: Features to predict labels for.

    Returns:
        The labels predicted for ``X_test``.
    """
    # random_state is pinned to 32, the same seed the random forest helper uses.
    classifier = DecisionTreeClassifier(random_state=32)
    return classifier.fit(X_train, y_train).predict(X_test)

###### Random Forest 사용해 보기

In [154]:
from sklearn.ensemble import RandomForestClassifier

def runRandomForest(X_train, y_train, X_test):
    """Fit a random forest on the training split and predict the test split.

    Args:
        X_train: Training features.
        y_train: Training labels aligned with ``X_train``.
        X_test: Features to predict labels for.

    Returns:
        The labels predicted for ``X_test``.
    """
    # random_state is pinned to 32, the same seed the decision tree helper uses.
    classifier = RandomForestClassifier(random_state=32)
    return classifier.fit(X_train, y_train).predict(X_test)

###### SVM 사용해 보기

In [155]:
from sklearn import svm

def runSVM(X_train, y_train, X_test):
    """Fit a default-configured ``svm.SVC`` and predict the test split.

    Args:
        X_train: Training features.
        y_train: Training labels aligned with ``X_train``.
        X_test: Features to predict labels for.

    Returns:
        The labels predicted for ``X_test``.
    """
    classifier = svm.SVC()
    return classifier.fit(X_train, y_train).predict(X_test)

###### SGD Classifier

In [156]:
from sklearn.linear_model import SGDClassifier

def runSGDClassifier(X_train, y_train, X_test, random_state=32):
    """Fit an ``SGDClassifier`` on the training split and predict the test split.

    Args:
        X_train: Training features.
        y_train: Training labels aligned with ``X_train``.
        X_test: Features to predict labels for.
        random_state: Seed forwarded to ``SGDClassifier``. Defaults to 32,
            the seed the tree-based helpers in this notebook already use.
            Pass ``None`` to get the previous unseeded behaviour.

    Returns:
        The labels predicted for ``X_test``.
    """
    # SGD shuffles the training data, so an unseeded model can give different
    # predictions (and therefore different reports) on every run.
    sgd_model = SGDClassifier(random_state=random_state)
    sgd_model.fit(X_train, y_train)
    return sgd_model.predict(X_test)

###### Logistic Regression

In [158]:
from sklearn.linear_model import LogisticRegression

def runLogisticRegression(X_train, y_train, X_test, max_iter=6000):
    """Fit an lbfgs logistic regression and predict the test split.

    Args:
        X_train: Training features.
        y_train: Training labels aligned with ``X_train``.
        X_test: Features to predict labels for.
        max_iter: Iteration cap passed to ``LogisticRegression``.

    Returns:
        The labels predicted for ``X_test``.
    """
    classifier = LogisticRegression(max_iter=max_iter, solver='lbfgs')
    return classifier.fit(X_train, y_train).predict(X_test)

### 5. 다양한 모델로 학습시켜보기

In [161]:
# Every helper receives the same (train features, train labels, test features).
_digits_split = (digits_X_train, digits_y_train, digits_X_test)

digits_y_pred_decision_tree = runDecisionTree(*_digits_split)
digits_y_pred_random_forest = runRandomForest(*_digits_split)
digits_y_pred_svm = runSVM(*_digits_split)
digits_y_pred_sgd = runSGDClassifier(*_digits_split)
digits_y_pred_logistic_regression = runLogisticRegression(*_digits_split)

### 6. 모델을 평가해보기

In [162]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Pair each display name with its predictions so every model is reported
# through the same code path. The SGD label used to be misspelled 'SDG'.
digits_reports = [
    ('Decision Tree', digits_y_pred_decision_tree),
    ('Random Forest', digits_y_pred_random_forest),
    ('SVM', digits_y_pred_svm),
    ('SGD Classifier', digits_y_pred_sgd),
    ('Logistic Regression', digits_y_pred_logistic_regression),
]

for model_name, y_pred in digits_reports:
    print(model_name)
    print(classification_report(digits_y_test, y_pred))
    print('-----------------------------------------------------------')

Decision Tree
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.81      0.81      0.81        42
           2       0.79      0.82      0.80        40
           3       0.79      0.91      0.85        34
           4       0.83      0.95      0.89        37
           5       0.90      0.96      0.93        28
           6       0.84      0.93      0.88        28
           7       0.96      0.82      0.89        33
           8       0.88      0.65      0.75        43
           9       0.78      0.78      0.78        32

    accuracy                           0.86       360
   macro avg       0.86      0.86      0.86       360
weighted avg       0.86      0.86      0.85       360

-----------------------------------------------------------
Random Forest
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.93      1.00      0.97 

In [208]:
# Confusion matrix for SVM, the model with the highest evaluation scores

confusion_matrix(digits_y_test, digits_y_pred_svm)

array([[43,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 42,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 40,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0, 34,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 37,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, 28,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0, 28,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 33,  0,  0],
       [ 0,  2,  0,  0,  0,  1,  0,  0, 40,  0],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  0, 31]])

### 평가지표
    
손글씨 문제에서는 ***Accuracy*** 평가지표가 중요합니다.

물론 데이터가 불균형한 경우에는 Accuracy에 허점이 있기 때문에

학습 데이터 분류가 어떻게 분포되어 있는지 확인하는게 좋습니다.

그래서 학습 데이터(=y_train)가 균형적으로 고루 분포되어 있는지 확인해보면, 

In [181]:
# Count the training samples per class to check how balanced the split is.
digits_y_train_df = pd.DataFrame(digits_y_train)
train_grouped = digits_y_train_df.groupby(by=0)
digits_class_counts = train_grouped.size()
print(digits_class_counts)

0
0    135
1    140
2    137
3    149
4    144
5    154
6    153
7    146
8    131
9    148
dtype: int64


가장 적은 수는 8 (131개), 가장 많은 수는 5 (154개)로서, 크게 치중되는 것 없이 고루 분포되어 있습니다. 

또한 손글씨 데이터에서는 더 중요한 것을 따로 고르기보단, 각 분류가 올바르게 맞았는지를 보기 위함이 중요하다 생각합니다. 

1. 데이터가 클래스 별 고루 분포되어 있고
2. 올바르게 잘 판단했는지 (TP, TN가 높은) 

따라서, 위의 두 기준으로 ***Accuracy*** 를 선택했습니다.

---

# 와인을 분류해 봅시다

### 1. 필요한 모듈 import하기

In [45]:
from sklearn.datasets import load_wine

### 2. 필요한 데이터셋 준비

In [46]:
# Load scikit-learn's wine dataset.
wine = load_wine()

### 3. 데이터 이해하기

In [167]:
# Pull the individual pieces out of the loaded wine dataset.
wine_feature = wine.feature_names  # feature names

wine_label = wine.target  # label of each sample

wine_target = wine.target_names  # class names
# Print the wine class names. The previous `target_names` is not defined in
# this notebook and only resolved to a stale variable left in the kernel.
print(f'Target Names : {wine_target}')

Target Names : ['class_0' 'class_1' 'class_2']


In [257]:
# Tabular view of the data: one column per feature, label appended last.

wine_data_df = pd.DataFrame(wine.data, columns=wine_feature).assign(label=wine_label)
wine_data_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,label
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [256]:
# Inspect the label data: number of samples per class
# :Class Distribution: class_0 (59), class_1 (71), class_2 (48)

wine_label_df = pd.Series(wine_label)
wine_label_df.value_counts()

1    71
0    59
2    48
dtype: int64

In [50]:
print(wine.DESCR) # dataset description

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

### 4. train, test 데이터 분리

In [83]:
# Hold out 20% of the samples as the test split; random_state pins the shuffle.
(wine_X_train,
 wine_X_test,
 wine_y_train,
 wine_y_test) = train_test_split(
    wine.data,
    wine_label,
    test_size=0.2,
    random_state=7,
)

### 5. 다양한 모델로 학습시켜보기

In [173]:
# Every helper receives the same (train features, train labels, test features).
_wine_split = (wine_X_train, wine_y_train, wine_X_test)

wine_y_pred_decision_tree = runDecisionTree(*_wine_split)
wine_y_pred_random_forest = runRandomForest(*_wine_split)
wine_y_pred_svm = runSVM(*_wine_split)
wine_y_pred_sgd = runSGDClassifier(*_wine_split)
# Logistic regression is capped at 3000 iterations for this dataset.
wine_y_pred_logistic_regression = runLogisticRegression(*_wine_split, 3000)

### 6. 모델을 평가해보기

In [174]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Pair each display name with its predictions so every model is reported
# through the same code path. The SGD label used to be misspelled 'SDG'.
wine_reports = [
    ('Decision Tree', wine_y_pred_decision_tree),
    ('Random Forest', wine_y_pred_random_forest),
    ('SVM', wine_y_pred_svm),
    ('SGD Classifier', wine_y_pred_sgd),
    ('Logistic Regression', wine_y_pred_logistic_regression),
]

for model_name, y_pred in wine_reports:
    print(model_name)
    print(classification_report(wine_y_test, y_pred))
    print('-----------------------------------------------------------')

Decision Tree
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.89      1.00      0.94        17
           2       1.00      0.83      0.91        12

    accuracy                           0.94        36
   macro avg       0.96      0.94      0.95        36
weighted avg       0.95      0.94      0.94        36

-----------------------------------------------------------
Random Forest
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00        17
           2       1.00      1.00      1.00        12

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

-----------------------------------------------------------
SVM
              precision    recall  f1-score   support

           0       0.86      0.

##### Confusion Matrix 확인

In [249]:
# Confusion matrix of the Random Forest predictions on the wine test split.
confusion_matrix(wine_y_test, wine_y_pred_random_forest)

array([[ 7,  0,  0],
       [ 0, 17,  0],
       [ 0,  0, 12]])

```
         C0  C1  C2
    C0 [[ 7,  0,  0],
    C1  [ 0, 17,  0],
    C2  [ 0,  0, 12]]
```

In [258]:
# Count the training samples per class to check how balanced the split is.
wine_y_train_df = pd.DataFrame(wine_y_train)
wine_train_grouped = wine_y_train_df.groupby(by=0)
wine_class_counts = wine_train_grouped.size()
print(wine_class_counts)

0
0    52
1    54
2    36
dtype: int64


### 평가지표 선택
    
와인 문제에서 평가지표는 정확도만 보고 판단할 수 없습니다.

와인 문제는 Class0, Class1, Class2 각 와인을 분류하는 문제이고,

특성(feature)이 다를 뿐이지 와인 분류마다 중요도 차이가 있지 않습니다.

>The data is the results of a chemical analysis of wines grown in the same region in Italy by three different cultivators. There are thirteen different measurements taken for different constituents found in the three types of wine. (DESCR에서 확인한 내용)

이러한 와인 문제는 잘 분류했는지 못했는지를 파악해야 하며, 이 경우 정확도가 중요합니다.

그러나, 

class_2 데이터가 다른 class_0, class_1에 비해 

약 31~33% 정도 데이터 갯수가 적어 (학습 데이터 기준 36개 vs 52개·54개) 클래스 간 데이터 불균형이 있는 상태입니다.

따라서, 정확도에 허점이 발생할 수 있으므로 **Precision, Recall, F1-score 등 다양한 평가지표를 살펴보는 것이 중요합니다.**



번외) 굳이 1개 평가지표를 선택한다면 

이처럼 클래스가 불균형한 경우 *F1-score* 를 보는 것이 중요하다 생각합니다.

F1-score는 Precision과 Recall의 조화평균으로, 잘못 분류된 사례를 더 잘 반영하기 때문입니다.

(관련 링크 참고 : https://medium.com/analytics-vidhya/accuracy-vs-f1-score-6258237beca2)

---

# 유방암 여부를 진단해 봅시다

### 1. 필요한 모듈 import하기

In [52]:
from sklearn.datasets import load_breast_cancer

### 2. 유방암 데이터셋 준비

In [53]:
# Load scikit-learn's breast cancer dataset.
cancer = load_breast_cancer()

### 3. 데이터 이해하기

In [222]:
# Pull the individual pieces out of the loaded breast cancer dataset.
cancer_label = cancer.target  # label of each sample
cancer_feature = cancer.feature_names  # feature names
cancer_target = cancer.target_names  # class names

In [218]:
# Tabular view of the data: one column per feature, label appended last.
cancer_df = pd.DataFrame(cancer.data, columns=cancer_feature).assign(label=cancer_label)

In [219]:
# Show the leading rows of each label group (0 and 1) side by side.
cancer_df.groupby('label').head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0
19,13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,0.1885,0.05766,...,19.26,99.7,711.2,0.144,0.1773,0.239,0.1288,0.2977,0.07259,1
20,13.08,15.71,85.63,520.0,0.1075,0.127,0.04568,0.0311,0.1967,0.06811,...,20.49,96.09,630.5,0.1312,0.2776,0.189,0.07283,0.3184,0.08183,1
21,9.504,12.44,60.34,273.9,0.1024,0.06492,0.02956,0.02076,0.1815,0.06905,...,15.66,65.13,314.9,0.1324,0.1148,0.08867,0.06227,0.245,0.07773,1
37,13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,0.1467,0.05863,...,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169,1
46,8.196,16.84,51.71,201.9,0.086,0.05943,0.01588,0.005917,0.1769,0.06503,...,21.96,57.26,242.2,0.1297,0.1357,0.0688,0.02564,0.3105,0.07409,1


In [226]:
# Show the class names, then count how many samples carry each label value.
print(f'cancer label names : {cancer_target}')

cancer_label_df = pd.Series(cancer_label)
cancer_label_df.value_counts()

cancer label names : ['malignant' 'benign']


1    357
0    212
dtype: int64

In [212]:
print(cancer.DESCR) # dataset description

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

### 4. train, test 데이터 분리

In [90]:
# Hold out 20% of the samples as the test split; random_state pins the shuffle.
(cancer_X_train,
 cancer_X_test,
 cancer_y_train,
 cancer_y_test) = train_test_split(
    cancer.data,
    cancer_label,
    test_size=0.2,
    random_state=7,
)

### 5. 다양한 모델로 학습시켜보기

In [177]:
# Every helper receives the same (train features, train labels, test features).
_cancer_split = (cancer_X_train, cancer_y_train, cancer_X_test)

cancer_y_pred_decision_tree = runDecisionTree(*_cancer_split)
cancer_y_pred_random_forest = runRandomForest(*_cancer_split)
cancer_y_pred_svm = runSVM(*_cancer_split)
cancer_y_pred_sgd = runSGDClassifier(*_cancer_split)
# Logistic regression is capped at 2000 iterations for this dataset.
cancer_y_pred_logistic_regression = runLogisticRegression(*_cancer_split, 2000)

### 6. 모델을 평가해보기

In [229]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Pair each display name with its predictions so every model is reported
# through the same code path. The SGD label used to be misspelled 'SDG'.
cancer_reports = [
    ('Decision Tree', cancer_y_pred_decision_tree),
    ('Random Forest', cancer_y_pred_random_forest),
    ('SVM', cancer_y_pred_svm),
    ('SGD Classifier', cancer_y_pred_sgd),
    ('Logistic Regression', cancer_y_pred_logistic_regression),
]

for model_name, y_pred in cancer_reports:
    print(model_name)
    # target_names labels the report rows with the class names instead of 0/1.
    print(classification_report(cancer_y_test, y_pred, target_names=cancer_target))
    print('-----------------------------------------------------------')

Decision Tree
              precision    recall  f1-score   support

   malignant       0.92      0.82      0.87        40
      benign       0.91      0.96      0.93        74

    accuracy                           0.91       114
   macro avg       0.91      0.89      0.90       114
weighted avg       0.91      0.91      0.91       114

-----------------------------------------------------------
Random Forest
              precision    recall  f1-score   support

   malignant       1.00      1.00      1.00        40
      benign       1.00      1.00      1.00        74

    accuracy                           1.00       114
   macro avg       1.00      1.00      1.00       114
weighted avg       1.00      1.00      1.00       114

-----------------------------------------------------------
SVM
              precision    recall  f1-score   support

   malignant       1.00      0.72      0.84        40
      benign       0.87      1.00      0.93        74

    accuracy                  

In [238]:
# Confusion matrix for Random Forest, the model with the highest evaluation
# scores, plus a closer look at the true and predicted label counts.

confusion = confusion_matrix(cancer_y_test, cancer_y_pred_random_forest)
cancer_y_test_df = pd.Series(cancer_y_test)
cancer_y_pred_random_forest_df = pd.Series(cancer_y_pred_random_forest)

true_label_counts = cancer_y_test_df.value_counts()
pred_label_counts = cancer_y_pred_random_forest_df.value_counts()

print(f'confusion_matrix : \n{confusion}\n')
print(f'y_test 각 label 별 개수 : \n{true_label_counts}\n')
print(f'y_pred 각 label 별 개수 : \n{pred_label_counts}')

confusion_matrix : 
[[40  0]
 [ 0 74]]

y_test 각 label 별 개수 : 
1    74
0    40
dtype: int64

y_pred 각 label 별 개수 : 
1    74
0    40
dtype: int64


##### Label 데이터 - 0, 1 확인
1. cancer.DESCR에서는 다음과 같이 작성되어 있었고,
    ```
        :Class Distribution: 212 - Malignant, 357 - Benign
    ```
2. 실제 데이터셋의 라벨 개수는 다음과 같습니다.
    ```
        cancer_label_df = pd.Series(cancer_label)
        cancer_label_df.value_counts()
        # 1    357   -> Benign
        # 0    212   -> Malignant
    ```
3. 따라서, **Malignant[0], Benign[1]**, 배열 순으로 class가 적용됨을 확인함

##### Confusion Matrix 확인
   - Malignant[0] : Positive
   - Benign[1] : Negative
   ```
            0   1
        0 [[40  0]
        1 [ 0 74]]
    ```

### 평가지표 선택
    
유방암 분류에서는 ***Recall*** 평가지표가 중요합니다. 

유방암 분류 데이터를 살펴보면,

- Malignant : 암 (Positive)
- Benign : 암이 아닌 종양 (Negative)

분류로 나뉘는데요.

암 환자에게 암이라고 정확하게 맞추는 것도 중요하지만, 

오진을 하더라도 Malignant을 Benign(**Negative**)이라 잘못 오진(**False**)을 하면

이는 가장 큰 문제를 발생시키기 때문에

유방암 문제에서는 ***FN***가 가장 낮은 지표를 찾아야 합니다.

따라서, FN이 가장 낮을 때 점수가 가장 높은 Recall(= TP / (TP + FN))을 선택했습니다.

---

# 회고

### 1. 어려웠던 점
- 이미지를 이해할 때, 데이터 포인트 하나하나가 내가 보던 그 이미지로 연결이 잘 안되어 이해가 바로 안됐던 것 같습니다.
- 데이터마다 적절한 평가지표를 선정해야하는데, 의사결정 기준을 도출하는 것부터 지표 선정까지 전반적으로 어려웠던 것 같습니다.

### 2. 알아낸 점 혹은 아직 모호한 점

##### 알아낸 점
- 라벨 배열에 입력된 순(인덱스 순서)대로 평가지표에 순서대로 나타난다는 것을 알았습니다.
- 데이터, attributes, classes 등 데이터에 관한 모든 내용들이 평가지표에 영향을 미칠 수 있다는 점을 알았습니다.

##### 모호한 점
- 이미지 픽셀 8x8을 flatten해서 64로 만들고, 이게 attributes라는 점이 잘 와닿지 않았습니다.

### 3. 시도한 것들
- LogisticRegression(solver='lbfgs', max_iter=max_iter)할 때, max_iter관련 오류가 발생했었습니다. 검색한 결과, iter를 다 돌아도 처리가 완료되지 않아 발생하는 문제였고, 조금씩 숫자를 올려보면서 적정 iter 값을 찾아 문제를 해결했습니다.
- 수업 때 배운 train data의 균형/불균형성을 확인하고 싶었습니다. 추가적으로 데이터를 살펴보고자 pandas DataFrame으로 train data의 각 class별 갯수를 groupby하여 데이터 균형/불균형성을 확인해보았습니다. 
- 평가지표를 선택할 때 0과 1이 어떤걸 지칭하는지 헷갈려서 데이터 갯수를 맞춰봤습니다. DESCR에 작성된 라벨 갯수와 pandas.Series(cancer_label)의 value_counts()로 각 라벨의 갯수를 비교하고, Confusion Matrix 갯수도 맞춰보면서 Malignant -> 0, Benign -> 1 을 눈으로 직접 확인했습니다.
- 데이터가 불균형한 경우에는 어떤 평가지표를 선택하는 것이 좋은지 리서치했습니다.

### 4. 루브릭 지표를 달성하지 못한 부분과 이유
- 루브릭 지표 모두 달성했습니다.

### 5. 자기 다짐
- 모델을 적용하고 평가지표를 도출하는 과정을 빠르게 해보긴 했지만, 사실 각 모델에 대해 명확히 이해하지 못한 건 사실입니다.
- 용어도 다 낯설고 온갖 다 처음 배우는 거라 그런걸테니.. 앞으로 하나씩 배우고 쌓아가면서 여러 모델들의 이해도를 높여야겠다 생각했습니다.