# 유방암 여부 분석
## 1) 필요 모듈 임포트

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report

import numpy as np
import pandas as pd

## 2) 데이터 준비

In [2]:
cancer = load_breast_cancer()

## 3) 데이터 이해
- Feature Data 확인
- Label Data 확인
- Target Names 출력
- Data Describe

In [3]:
cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

### Feature Data 확인

In [4]:
cancer.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [5]:
cancer_data = cancer.data

In [6]:
cancer_df = pd.DataFrame(cancer_data, columns=cancer.feature_names)
cancer_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


- 컬럼 사이의 절대적인 크기 차이가 꽤 있는 편이다.
- 정규화를 적용하기 전과 적용한 후의 결과를 확인해보자.

### Label Data 확인

In [7]:
cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

- 타겟 데이터는 악성종양(`malignant`)과 양성종양(`benign`) 두 종류이다.

In [8]:
cancer.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [9]:
np.unique(cancer.target)

array([0, 1])

### Describe 확인

In [10]:
print(cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

## 4) 데이터 전처리

### train, test 데이터 분리

In [11]:
x_train, x_test, y_train, y_test = train_test_split(cancer_data, cancer.target,
                                                   test_size=0.2,
                                                   random_state=38)
print(f'x_train : {x_train.shape}, y_train : {y_train.shape}\nx_test : {x_test.shape}, y_test : {y_test.shape}')

x_train : (455, 30), y_train : (455,)
x_test : (114, 30), y_test : (114,)


### 정규화

- 사이킷런이 제공하는 `RobustScaler`를 사용한다.
- 학습데이터를 기준으로 스케일러를 `fit()` 한 후 학습 데이터와 테스트 데이터를 스케일링한다.

In [12]:
scaler = RobustScaler()
scaler.fit(x_train)
X_train = scaler.transform(x_train)
X_test = scaler.transform(x_test)
print(X_train)

[[-0.59751973 -0.69441984 -0.58242812 ... -0.67590277 -0.74928775
  -0.59037282]
 [-0.38556933  0.98317095 -0.32332268 ...  0.29108733 -0.03276353
   0.9202454 ]
 [ 1.85118377  0.60053144  1.85015974 ...  1.40542163  1.42735043
   0.96743747]
 ...
 [-0.84148816 -0.13994686 -0.81246006 ... -0.76723017  0.32905983
  -0.0684285 ]
 [ 1.08680947 -0.0336581   1.08019169 ...  0.30809243  0.27777778
  -0.01557338]
 [ 0.4735062   0.38795394  0.52428115 ...  1.1173352   0.25213675
   1.16092496]]


## 5) 베이스 라인 모델
- 정규화하지 않은 원본 학습 데이터로 의사결정나무 모델 학습

In [13]:
base = DecisionTreeClassifier()
base.fit(x_train,y_train)
base_pred = base.predict(x_test)
print(classification_report(y_test, base_pred))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93        36
           1       0.97      0.96      0.97        78

    accuracy                           0.96       114
   macro avg       0.95      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



- 96%의 정확도를 보인다

## 6) 다양한 모델로 학습
- `Decision Tree`
- `Random Forest`
- `SVM`
- `SGD Classifier`
- `Logistic Regression`

In [14]:
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
sgd = SGDClassifier()
logi = LogisticRegression()

dt.fit(X_train,y_train)
dt_pred = dt.predict(X_test)

rf.fit(X_train,y_train)
rf_pred = rf.predict(X_test)

svm.fit(X_train,y_train)
svm_pred = svm.predict(X_test)

sgd.fit(X_train,y_train)
sgd_pred = sgd.predict(X_test)

logi.fit(X_train,y_train)
logi_pred = logi.predict(X_test)

## 7) 모델 평가
- 유방암 여부는 악성 종양의 `재현율`이 더 중요하므로 `재현율`을 주 평가 지표로 하겠다.

In [15]:
model_dict = {'DecisionTree' : dt_pred, 'RandomForest' : rf_pred, 'SVM' : svm_pred, 'SGD': sgd_pred, 'LogiticRegression':logi_pred}
measure=pd.DataFrame(columns=model_dict.keys(),index=['정확도','정밀도','재현율'])
for k, v in model_dict.items():
    accuracy=accuracy_score(y_test,v)
    precisions=precision_score(y_test, v)
    recalls=recall_score(y_test,v)
    measure[k]['정확도']=accuracy
    measure[k]['정밀도']=precisions
    measure[k]['재현율']=recalls
    print(f'==============={k}의 성능===============')
    print('요약')
    print(classification_report(y_test,v))
    print(f'정확도 : {accuracy}')
    print(f'정밀도 : {precisions}')
    print(f'재현율 : {recalls}')

요약
              precision    recall  f1-score   support

           0       0.90      0.97      0.93        36
           1       0.99      0.95      0.97        78

    accuracy                           0.96       114
   macro avg       0.94      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114

정확도 : 0.956140350877193
정밀도 : 0.9866666666666667
재현율 : 0.9487179487179487
요약
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        36
           1       0.97      1.00      0.99        78

    accuracy                           0.98       114
   macro avg       0.99      0.97      0.98       114
weighted avg       0.98      0.98      0.98       114

정확도 : 0.9824561403508771
정밀도 : 0.975
재현율 : 1.0
요약
              precision    recall  f1-score   support

           0       0.94      0.92      0.93        36
           1       0.96      0.97      0.97        78

    accuracy                           0.96       11

### 실제 모델 별 재현율

In [16]:
print('===============총정리================')
print(measure)

    DecisionTree RandomForest       SVM       SGD LogiticRegression
정확도     0.956140     0.982456  0.956140  0.956140          0.964912
정밀도     0.986667     0.975000  0.962025  0.974026          0.962500
재현율     0.948718     1.000000  0.974359  0.961538          0.987179


- 악성종양을 빠짐없이 악성종양이라고 가장 정확하게 분류해낸 모델은 `RandomForeset` 모델이었다. 재현율이 100%였다. 정확도도 98%로 실험한 모델 가운데서는 가장 높은 수치를 보였다.

## 번외편
- 정규화 전과 후 성능 평가

In [17]:
base_dt = DecisionTreeClassifier()
base_rf = RandomForestClassifier()
base_svm = SVC()
base_sgd = SGDClassifier()
base_logi = LogisticRegression()

base_dt.fit(x_train,y_train)
base_dt_pred = base_dt.predict(x_test)

base_rf.fit(x_train,y_train)
base_rf_pred = base_rf.predict(x_test)

base_svm.fit(x_train,y_train)
base_svm_pred = base_svm.predict(x_test)

base_sgd.fit(x_train,y_train)
base_sgd_pred = base_sgd.predict(x_test)

base_logi.fit(x_train,y_train)
base_logi_pred = base_logi.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [18]:
model_dict = {'DecisionTree' : base_dt_pred, 'RandomForest' : base_rf_pred, 'SVM' : base_svm_pred, 'SGD': base_sgd_pred, 'LogiticRegression':base_logi_pred}
base_measure=pd.DataFrame(columns=model_dict.keys(),index=['정확도','정밀도','재현율'])
for k, v in model_dict.items():
    accuracy=accuracy_score(y_test,v)
    precisions=precision_score(y_test, v)
    recalls=recall_score(y_test,v)
    base_measure[k]['정확도']=accuracy
    base_measure[k]['정밀도']=precisions
    base_measure[k]['재현율']=recalls
    print(f'==========={k}의 성능===========')
    print('요약')
    print(classification_report(y_test,v))
    print(f'정확도 : {accuracy}')
    print(f'정밀도 : {precisions}')
    print(f'재현율 : {recalls}')

요약
              precision    recall  f1-score   support

           0       0.89      0.92      0.90        36
           1       0.96      0.95      0.95        78

    accuracy                           0.94       114
   macro avg       0.93      0.93      0.93       114
weighted avg       0.94      0.94      0.94       114

정확도 : 0.9385964912280702
정밀도 : 0.961038961038961
재현율 : 0.9487179487179487
요약
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        36
           1       0.99      1.00      0.99        78

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114

정확도 : 0.9912280701754386
정밀도 : 0.9873417721518988
재현율 : 1.0
요약
              precision    recall  f1-score   support

           0       1.00      0.78      0.88        36
           1       0.91      1.00      0.95        78

    accuracy                           

In [19]:
print('===========전처리 전============')
print(base_measure)
print('===========전처리 후============')
print(measure)

    DecisionTree RandomForest       SVM       SGD LogiticRegression
정확도     0.938596     0.991228  0.929825  0.771930          0.964912
정밀도     0.961039     0.987342  0.906977  1.000000          0.951220
재현율     0.948718     1.000000  1.000000  0.666667          1.000000
    DecisionTree RandomForest       SVM       SGD LogiticRegression
정확도     0.956140     0.982456  0.956140  0.956140          0.964912
정밀도     0.986667     0.975000  0.962025  0.974026          0.962500
재현율     0.948718     1.000000  0.974359  0.961538          0.987179


- 의외로 전처리 전에서 1.0의 높은 재현율을 보이는 모델이 많았다.
- 궁금해서 다른 스케일러도 사용해보았는데 `SVM` 모델이 `MinMaxScaler`를 사용했을 때 재현율이 꽤 올라갔다.

## 추후 학습 방향
- 지금은 데이터의 분포를 자세히 보지는 않았는데 `pyplot` 등을 적극적으로 활용해 다양한 방법으로 시각화해서 모델 선택이나 스케일러 선택에 있어서 타당한 근거를 설명할 수 있도록 더 공부해야겠다.