# 유방암 진단 데이터로 분류 맛보기

In [1]:
import pandas as pd

## 분석할 데이터 검토해 보기

In [2]:
# Load the breast-cancer CSV into a DataFrame and preview the first rows.
data = pd.read_csv('../data/breast-cancer-wisconsin.csv')
data.head()

Unnamed: 0,code,Clump_Thickness,Cell_Size,Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,0
1,1002945,5,4,4,5,7,10,3,2,1,0
2,1015425,3,1,1,1,2,2,3,1,1,0
3,1016277,6,8,8,1,3,4,3,7,1,0
4,1017023,4,1,1,3,2,1,3,1,1,0


In [3]:
data['Class'].value_counts()
# 444 normal cases, 239 breast-cancer cases

0    444
1    239
Name: Class, dtype: int64

In [4]:
data.shape
# 683 cases (rows) and 11 columns

(683, 11)

## 특성과 레이블 나누기

In [5]:
# Four ways to select the same nine feature columns.

# Verbose but exact: name every column. Probably the way to go when the
# columns you need are not next to each other.
X1 = data[['Clump_Thickness', 'Cell_Size', 'Cell_Shape', 'Marginal_Adhesion', 'Single_Epithelial_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin', 'Normal_Nucleoli', 'Mitoses']]

# Simple: slice the column index by position
X2 = data[data.columns[1:10]]

# loc selects by label (column names)
X3 = data.loc[:, 'Clump_Thickness':'Mitoses']

# iloc selects by integer position (here: everything except the first and last column)
X4 = data.iloc[:, 1:-1]

X4

Unnamed: 0,Clump_Thickness,Cell_Size,Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...
678,3,1,1,1,3,2,1,1,1
679,2,1,1,1,2,1,1,1,1
680,5,10,10,3,7,3,8,10,2
681,4,8,6,4,3,4,10,6,1


In [6]:
y = data[['Class']]  # double brackets [[...]] return a DataFrame; single brackets [...] return a Series
y2 = data['Class']

y

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0
...,...
678,0
679,0
680,1
681,1


In [7]:
# Display the Series form of the label (compare with the DataFrame form `y`).
y2

0      0
1      0
2      0
3      0
4      0
      ..
678    0
679    0
680    1
681    1
682    1
Name: Class, Length: 683, dtype: int64

---

### 어떤 모듈, 함수가 있는지 기억이 나지 않을 때 확인하는 법

In [8]:
# When you cannot remember a name, look it up like this.
# In an exam this is likely to be as useful as help().
import sklearn.model_selection as ms
from sklearn.model_selection import train_test_split
print(dir(ms))

['BaseCrossValidator', 'BaseShuffleSplit', 'GridSearchCV', 'GroupKFold', 'GroupShuffleSplit', 'KFold', 'LeaveOneGroupOut', 'LeaveOneOut', 'LeavePGroupsOut', 'LeavePOut', 'ParameterGrid', 'ParameterSampler', 'PredefinedSplit', 'RandomizedSearchCV', 'RepeatedKFold', 'RepeatedStratifiedKFold', 'ShuffleSplit', 'StratifiedGroupKFold', 'StratifiedKFold', 'StratifiedShuffleSplit', 'TimeSeriesSplit', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_search', '_split', '_validation', 'check_cv', 'cross_val_predict', 'cross_val_score', 'cross_validate', 'learning_curve', 'permutation_test_score', 'train_test_split', 'typing', 'validation_curve']


### help 함수를 이용하여 함수의 사용법을 볼 수 있다

In [9]:
# Print the signature and docstring of train_test_split.
help(train_test_split)

Help on function train_test_split in module sklearn.model_selection._split:

train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)
    Split arrays or matrices into random train and test subsets
    
    Quick utility that wraps input validation and
    ``next(ShuffleSplit().split(X, y))`` and application to input data
    into a single call for splitting (and optionally subsampling) data in a
    oneliner.
    
    Read more in the :ref:`User Guide <cross_validation>`.
    
    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    
    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to t

---

## Train-Test 데이터셋 나누기

In [10]:
from sklearn.model_selection import train_test_split

# For a classification problem, always pass the labels through `stratify`.
# That keeps the label ratio of y_train and y_test aligned; without it, one
# of the two splits can end up with a skewed label distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y,
    random_state=42,
    stratify=y,
)

In [11]:
# Mean of the 0/1 label, i.e. the share of Class == 1 in the training split.
y_train.mean()

Class    0.349609
dtype: float64

In [12]:
# Mean of the 0/1 label, i.e. the share of Class == 1 in the test split.
y_test.mean()

Class    0.350877
dtype: float64

## 정규화 해보기
- `MinMaxScaler`는 스케일이 서로 매우 다른 특성들의 범위를 0과 1 사이로 변환
- `StandardScaler`는 각 특성의 평균을 0, 분산을 1로 변경하여 모든 특성이 같은 크기를 가지게 하며, 특성의 최솟값과 최댓값 크기를 제한하지 않음
- `RobustScaler`는 특성들이 같은 스케일을 갖게 되지만 평균과 분산 대신에 중간값과 사분위값을 사용하며, 이상치에 큰 영향을 받지 않음
- `Normalizer`는 유클리드(Euclidean) 길이가 1이 되도록 데이터 포인트를 조정하며, 데이터의 방향(각도)이 중요할 때 사용

![이미지](img/MinMax_Standard_Scaler.png)

### MinMax 해보기
- 데이터의 범위를 0~1 사이로 정규화 시켜줌

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, then apply the
# same fitted scaler to both splits. The test split is transformed, never fitted.
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(X_train)

X_minmax_train = minmax_scaler.transform(X_train)
X_minmax_test = minmax_scaler.transform(X_test)

In [14]:
# Summary statistics of the MinMax-scaled training features.
pd.DataFrame(X_minmax_train).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0
mean,0.37283,0.231988,0.242839,0.205078,0.241319,0.28559,0.269314,0.199002,0.067491
std,0.317836,0.334781,0.332112,0.319561,0.242541,0.40489,0.265289,0.331503,0.190373
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.111111,0.0,0.0,0.0,0.111111,0.0,0.111111,0.0,0.0
50%,0.333333,0.0,0.0,0.0,0.111111,0.0,0.222222,0.0,0.0
75%,0.555556,0.361111,0.444444,0.333333,0.333333,0.583333,0.444444,0.222222,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
# Summary statistics of the test features scaled with the training-fitted MinMax scaler.
pd.DataFrame(X_minmax_test).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0
mean,0.411306,0.259909,0.25601,0.198181,0.269006,0.274204,0.278752,0.233918,0.065627
std,0.298847,0.357544,0.3327,0.315307,0.259557,0.405891,0.292578,0.360958,0.199372
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.222222,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0
50%,0.444444,0.0,0.111111,0.0,0.111111,0.0,0.222222,0.0,0.0
75%,0.555556,0.444444,0.444444,0.222222,0.388889,0.444444,0.444444,0.388889,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Standard 해보기
- 각 특성의 평균이 0, 표준편차가 1이 되도록 표준화 시켜줌

In [16]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean/std from the training split only, then apply the
# same fitted scaler to both splits. The test split is transformed, never fitted.
standard_scaler = StandardScaler()
standard_scaler.fit(X_train)

X_standard_train = standard_scaler.transform(X_train)
X_standard_test = standard_scaler.transform(X_test)

In [17]:
# Summary statistics of the standardized training features.
pd.DataFrame(X_standard_train).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0
mean,6.938894e-18,6.938894e-18,-2.775558e-17,-2.775558e-17,-4.8572260000000006e-17,6.938894e-18,-2.0816680000000002e-17,-2.775558e-17,-1.734723e-18
std,1.000978,1.000978,1.000978,1.000978,1.000978,1.000978,1.000978,1.000978,1.000978
min,-1.174173,-0.6936309,-0.7319088,-0.6423777,-0.9959361,-0.7060427,-1.016165,-0.6008881,-0.3548677
25%,-0.8242452,-0.6936309,-0.7319088,-0.6423777,-0.5373756,-0.7060427,-0.5969255,-0.6008881,-0.3548677
50%,-0.1243886,-0.6936309,-0.7319088,-0.6423777,-0.5373756,-0.7060427,-0.1776856,-0.6008881,-0.3548677
75%,0.575468,0.3860715,0.6076347,0.401741,0.3797454,0.7360871,0.6607941,0.07011454,-0.3548677
max,1.975181,2.296314,2.282064,2.489978,3.131108,1.76618,2.756993,2.418624,4.903108


In [18]:
# Summary statistics of the test features scaled with the training-fitted standard scaler.
pd.DataFrame(X_standard_test).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0
mean,0.121175,0.083483,0.0397,-0.021605,0.114263,-0.028149,0.035612,0.10543,-0.009802
std,0.941174,1.069038,1.002747,0.987654,1.071204,1.003453,1.103943,1.089918,1.048292
min,-1.174173,-0.693631,-0.731909,-0.642378,-0.995936,-0.706043,-1.016165,-0.600888,-0.354868
25%,-0.474317,-0.693631,-0.731909,-0.642378,-0.537376,-0.706043,-1.016165,-0.600888,-0.354868
50%,0.22554,-0.693631,-0.397023,-0.642378,-0.537376,-0.706043,-0.177686,-0.600888,-0.354868
75%,0.575468,0.635234,0.607635,0.053701,0.609026,0.392723,0.660794,0.573367,-0.354868
max,1.975181,2.296314,2.282064,2.489978,3.131108,1.76618,2.756993,2.418624,4.903108


### Robust 해보기

In [19]:
from sklearn.preprocessing import RobustScaler

# Fit the robust scaler on the training split only, then apply the same
# fitted scaler to both splits. The test split is transformed, never fitted.
robust_scaler = RobustScaler()
robust_scaler.fit(X_train)

X_robust_train = robust_scaler.transform(X_train)
X_robust_test = robust_scaler.transform(X_test)

In [20]:
# Summary statistics of the robust-scaled training features.
pd.DataFrame(X_robust_train).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0
mean,0.088867,0.642428,0.546387,0.615234,0.585938,0.489583,0.141276,0.895508,0.607422
std,0.715131,0.927087,0.747253,0.958682,1.091435,0.694097,0.795867,1.491765,1.713359
min,-0.75,0.0,0.0,0.0,-0.5,0.0,-0.666667,0.0,0.0
25%,-0.5,0.0,0.0,0.0,0.0,0.0,-0.333333,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.5,1.0,1.0,1.0,1.0,1.0,0.666667,1.0,0.0
max,1.5,2.769231,2.25,3.0,4.0,1.714286,2.333333,4.5,9.0


In [21]:
# Summary statistics of the test features scaled with the training-fitted robust scaler.
pd.DataFrame(X_robust_test).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0,171.0
mean,0.175439,0.719748,0.576023,0.594542,0.710526,0.470064,0.169591,1.052632,0.590643
std,0.672405,0.990122,0.748574,0.945921,1.168007,0.695813,0.877734,1.624312,1.794346
min,-0.75,0.0,0.0,0.0,-0.5,0.0,-0.666667,0.0,0.0
25%,-0.25,0.0,0.0,0.0,0.0,0.0,-0.666667,0.0,0.0
50%,0.25,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.5,1.230769,1.0,0.666667,1.25,0.761905,0.666667,1.75,0.0
max,1.5,2.769231,2.25,3.0,4.0,1.714286,2.333333,4.5,9.0


---

## 모델 학습

In [22]:
from sklearn.linear_model import LogisticRegression

In [23]:
model = LogisticRegression()
# LogisticRegression expects y with shape (n_samples,). Select the label
# column by name rather than flattening the whole frame, so this cell still
# works when re-run after extra columns have been added to y_train.
model.fit(X_robust_train, y_train['Class'].to_numpy())

LogisticRegression()

In [24]:
y_train.values.ravel()  # alternative spelling: y_train.values.reshape(-1)

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,

### 모델 결과 예측해 보기

In [25]:
pred_train = model.predict(X_robust_train)  # predicting on the training data itself, so accuracy is expected to be high
pred_train

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,

In [26]:
model.score(X_robust_train, y_train)  # accuracy on the training split is about 0.97

0.974609375

In [27]:
model.score(X_robust_test, y_test)  # accuracy on the test split is about 0.96

0.9590643274853801

In [28]:
model.predict_proba(X_robust_train)  # per-label probabilities for each sample

array([[0.99334327, 0.00665673],
       [0.82797792, 0.17202208],
       [0.98446831, 0.01553169],
       ...,
       [0.05255661, 0.94744339],
       [0.99614915, 0.00385085],
       [0.99739729, 0.00260271]])

---

## 혼동행렬 확인하기

In [29]:
from sklearn.metrics import confusion_matrix

In [30]:
# Confusion matrix on the training split.
pred_train = model.predict(X_robust_train)
confusion_train = confusion_matrix(y_train, pred_train)  # argument order: true labels first, then predictions

confusion_train

array([[327,   6],
       [  7, 172]], dtype=int64)

In [31]:
# Confusion matrix on the test split. It is stored under its own name so it
# does not overwrite `confusion_train` computed for the training split.
pred_test = model.predict(X_robust_test)
confusion_test = confusion_matrix(y_test, pred_test)  # argument order: true labels first, then predictions

confusion_test

array([[106,   5],
       [  2,  58]], dtype=int64)

## 평가지표 상세 확인

In [32]:
from sklearn.metrics import classification_report

In [33]:
# Precision / recall / F1 per class on the training split.
cfreport_train = classification_report(y_train, pred_train)  # argument order: true labels first, then predictions
print(cfreport_train)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       333
           1       0.97      0.96      0.96       179

    accuracy                           0.97       512
   macro avg       0.97      0.97      0.97       512
weighted avg       0.97      0.97      0.97       512



In [34]:
# Precision / recall / F1 per class on the test split.
cfreport_test = classification_report(y_test, pred_test)  # argument order: true labels first, then predictions
print(cfreport_test)

              precision    recall  f1-score   support

           0       0.98      0.95      0.97       111
           1       0.92      0.97      0.94        60

    accuracy                           0.96       171
   macro avg       0.95      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



## ROC 확인

In [35]:
from sklearn.metrics import roc_curve, auc
from sklearn import metrics

In [36]:
# Compute the decision scores once and reuse them for both the ROC curve
# and the AUC, instead of calling decision_function twice.
test_scores = model.decision_function(X_robust_test)
FP_rate, TP_rate, thresholds = roc_curve(y_test, test_scores)
roc_auc = metrics.roc_auc_score(y_test, test_scores)
roc_auc

0.991891891891892

## 예측값 병합 및 저장

In [37]:
prob_train = model.predict_proba(X_robust_train)  # per-label probabilities, one column per label
# y_train came out of train_test_split, so assigning columns to it in place
# can raise SettingWithCopyWarning. assign() builds a new frame instead; the
# name is rebound so the result keeps the columns Class, y_pred, y_prob0, y_prob1.
y_train = y_train.assign(
    y_pred=pred_train,
    y_prob0=prob_train[:, 0],
    y_prob1=prob_train[:, 1],
)
y_train

Unnamed: 0,Class,y_pred,y_prob0,y_prob1
131,0,0,0.993343,0.006657
6,0,0,0.827978,0.172022
0,0,0,0.984468,0.015532
269,0,0,0.996735,0.003265
56,1,1,0.130300,0.869700
...,...,...,...,...
515,1,1,0.006942,0.993058
216,1,0,0.932515,0.067485
312,1,1,0.052557,0.947443
11,0,0,0.996149,0.003851


In [38]:
prob_test = model.predict_proba(X_robust_test)  # per-label probabilities, one column per label
# y_test came out of train_test_split, so assigning columns to it in place
# can raise SettingWithCopyWarning. assign() builds a new frame instead; the
# name is rebound so the result keeps the columns Class, y_pred, y_prob0, y_prob1.
y_test = y_test.assign(
    y_pred=pred_test,
    y_prob0=prob_test[:, 0],
    y_prob1=prob_test[:, 1],
)
y_test

Unnamed: 0,Class,y_pred,y_prob0,y_prob1
541,0,0,0.979514,0.020486
549,0,0,0.987691,0.012309
318,0,0,0.971233,0.028767
183,0,0,0.993579,0.006421
478,1,1,0.000133,0.999867
...,...,...,...,...
425,1,1,0.000685,0.999315
314,1,1,0.033549,0.966451
15,1,1,0.379621,0.620379
510,0,0,0.994205,0.005795


In [39]:
# Put the test features next to the label/prediction columns.
Total_test = pd.concat([X_test, y_test], axis=1)  # axis=0 stacks cases (more rows); axis=1 joins columns (more columns)
Total_test

Unnamed: 0,Clump_Thickness,Cell_Size,Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class,y_pred,y_prob0,y_prob1
541,5,2,2,2,1,1,2,1,1,0,0,0.979514,0.020486
549,4,1,1,1,2,1,3,2,1,0,0,0.987691,0.012309
318,5,2,2,2,2,1,2,2,1,0,0,0.971233,0.028767
183,1,2,3,1,2,1,3,1,1,0,0,0.993579,0.006421
478,5,10,10,10,6,10,6,5,2,1,1,0.000133,0.999867
...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,10,4,3,10,4,10,10,1,1,1,1,0.000685,0.999315
314,8,10,3,2,6,4,3,10,1,1,1,0.033549,0.966451
15,7,4,6,4,6,1,4,3,1,1,1,0.379621,0.620379
510,3,1,1,2,2,1,1,1,1,0,0,0.994205,0.005795


In [40]:
# Save the combined test features, labels, predictions and probabilities to CSV.
Total_test.to_csv("../data/classification_test.csv")