In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
breast_cancer = load_breast_cancer()
breast_cancer_data = breast_cancer.data        # 학습지
breast_cancer_label = breast_cancer.target     # 정답지

In [4]:
breast_cancer.target_names    # 악성, 양성

array(['malignant', 'benign'], dtype='<U9')

In [5]:
import pandas as pd
breast_cancer_df = pd.DataFrame(data = breast_cancer_data, columns = breast_cancer.feature_names)
breast_cancer_df.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [6]:
# 학습시키기
X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data, breast_cancer_label, test_size=0.2, random_state=36)

In [8]:
# 여러 가지 모델 평가하기

### 1) Decision Tree
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(random_state=80)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

print("* Decision Tree 모델")
print(classification_report(y_test, y_pred))
print("\n")


### 2) Random Forest
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(random_state=80)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

print("* Random Forest 모델")
print(classification_report(y_test, y_pred))
print("\n")


### 3) SVM
from sklearn import svm
svm_model = svm.SVC(random_state=80)
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

print("* SVM 모델")
print(classification_report(y_test, y_pred))
print("\n")


### 4) SGD Classifier
from sklearn.linear_model import SGDClassifier
sgd_model = SGDClassifier(random_state=80)
sgd_model.fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)

print("* SGD Classifier 모델")
print(classification_report(y_test, y_pred))
print("\n")


### 5) Logistic Regression
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(random_state=80)
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

print("* Logistic Regression 모델")
print(classification_report(y_test, y_pred))

* Decision Tree 모델
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        39
           1       1.00      0.95      0.97        75

    accuracy                           0.96       114
   macro avg       0.95      0.97      0.96       114
weighted avg       0.97      0.96      0.97       114



* Random Forest 모델
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        39
           1       0.97      0.97      0.97        75

    accuracy                           0.96       114
   macro avg       0.96      0.96      0.96       114
weighted avg       0.96      0.96      0.96       114



* SVM 모델
              precision    recall  f1-score   support

           0       1.00      0.79      0.89        39
           1       0.90      1.00      0.95        75

    accuracy                           0.93       114
   macro avg       0.95      0.90      0.92       114
weighted avg       0.94

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


유방암 진단의 경우 악성인 환자를 놓치지 않는 것이 중요하기 때문에 양성을 음성으로 잘못 판단하면 안된다.  
따라서 이 경우에는 재현율이 매우 중요하다.  

첫 번째 모델인 Decision Tree의 경우 인덱스 0, 즉 악성(malignant)에 대한 재현율이 100%를 기록하였다.  
따라서 유방암 진단에 적합한 모델이라고 할 수 있다.  

두 번째 모델인 Random Forest는 첫 번째 모델과 정확도(Accuracy)가 96%로 같으나,  
재현율을 보았을 때 악성에 대한 재현율이 95%로 살짝 떨어져서 적합하지 못하다.

그 외 나머지 모델은 전체적으로 위 모델보다 기록이 낮으므로 선택하지 않는게 좋다.