<a href="https://colab.research.google.com/github/snow-white2024/first-repository/blob/master/%EB%8D%B0%EC%9D%B4%ED%84%B0%EB%B6%84%EC%84%9D%20%EA%B8%B0%EC%B4%88%206%EB%B2%88%EB%85%B8%EB%93%9C%20%ED%94%84%EB%A1%9C%EC%A0%9D%ED%8A%B8%20/%EB%8D%B0%EC%9D%B4%ED%84%B0_%EB%B6%84%EC%84%9D_%EA%B8%B0%EC%B4%88_6%EB%B2%88%EB%85%B8%EB%93%9C_%EC%9C%A0%EB%B0%A9%EC%95%94%EB%8D%B0%EC%9D%B4%ED%84%B0%EC%85%8B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# (1) 필요한 모듈 import하기
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
import pandas as pd

In [None]:
# (2) Prepare the data: load the breast cancer dataset bundled with scikit-learn
data = load_breast_cancer()

In [None]:
# (3) Understand the data
# Pull the feature matrix (X) and the label vector (y) out of the loaded dataset
X, y = data.data, data.target

In [None]:
# (4) Split into train and test sets: 20% held out for testing, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

In [None]:
# Show the class names that the labels in `data.target` refer to
print(f"Target Names: {data.target_names}")

Target Names: ['malignant' 'benign']


In [None]:
# Describe the data
# `df` was never defined in this notebook, so this cell failed on a fresh kernel.
# Build it from the feature matrix, labelling each column with its feature name.
df = pd.DataFrame(data.data, columns=data.feature_names)
print(df.describe())

       mean radius  mean texture  mean perimeter    mean area  \
count   569.000000    569.000000      569.000000   569.000000   
mean     14.127292     19.289649       91.969033   654.889104   
std       3.524049      4.301036       24.298981   351.914129   
min       6.981000      9.710000       43.790000   143.500000   
25%      11.700000     16.170000       75.170000   420.300000   
50%      13.370000     18.840000       86.240000   551.100000   
75%      15.780000     21.800000      104.100000   782.700000   
max      28.110000     39.280000      188.500000  2501.000000   

       mean smoothness  mean compactness  mean concavity  mean concave points  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.096360          0.104341        0.088799             0.048919   
std           0.014064          0.052813        0.079720             0.038803   
min           0.052630          0.019380        0.000000             0.000000   
25%      

In [None]:
# Summarize the target: number of samples per label value, then the label names
target_counts = pd.Series(data.target).value_counts()
print(target_counts)
print(data.target_names)

1    357
0    212
Name: count, dtype: int64
['malignant' 'benign']


In [None]:
# (5) Train several different models and compare them
def _fit_and_report(label, estimator):
    """Fit one classifier and print its test-set metrics.

    Uses the notebook-level `X_train`, `y_train`, `X_test`, `y_test` and `data`.

    Args:
        label: Human-readable model name used as the prefix of the accuracy line.
        estimator: An unfitted scikit-learn classifier; it is fitted in place.

    Returns:
        The predictions of the fitted estimator on `X_test`.
    """
    estimator.fit(X_train, y_train)  # train
    predictions = estimator.predict(X_test)  # predict
    print(f"{label} Accuracy:", accuracy_score(y_test, predictions))  # evaluate
    print(classification_report(y_test, predictions, target_names=data.target_names))
    return predictions


# 1. Decision Tree
decision_tree = DecisionTreeClassifier(random_state=7)
y_pred_tree = _fit_and_report("Decision Tree", decision_tree)

# 2. Random Forest
random_forest = RandomForestClassifier(random_state=7)
y_pred_forest = _fit_and_report("Random Forest", random_forest)

# 3. SVM
svm = SVC(random_state=7)
y_pred_svm = _fit_and_report("SVM", svm)

# 4. SGD Classifier
sgd_classifier = SGDClassifier(random_state=7, max_iter=1000, tol=1e-3)
y_pred_sgd = _fit_and_report("SGD Classifier", sgd_classifier)

# 5. Logistic Regression
# max_iter is 1600 because lbfgs hit the iteration limit at 1000 without converging.
logistic_regression = LogisticRegression(random_state=7, max_iter=1600)
y_pred_logistic = _fit_and_report("Logistic Regression", logistic_regression)

Decision Tree Accuracy: 0.9122807017543859
              precision    recall  f1-score   support

   malignant       0.92      0.82      0.87        40
      benign       0.91      0.96      0.93        74

    accuracy                           0.91       114
   macro avg       0.91      0.89      0.90       114
weighted avg       0.91      0.91      0.91       114

Random Forest Accuracy: 0.9736842105263158
              precision    recall  f1-score   support

   malignant       1.00      0.93      0.96        40
      benign       0.96      1.00      0.98        74

    accuracy                           0.97       114
   macro avg       0.98      0.96      0.97       114
weighted avg       0.97      0.97      0.97       114

SVM Accuracy: 0.9035087719298246
              precision    recall  f1-score   support

   malignant       1.00      0.72      0.84        40
      benign       0.87      1.00      0.93        74

    accuracy                           0.90       114
   macro 

In [None]:
# (6) Evaluate the models
# Map a display name to each classifier fitted in the training cell.
models = {
    "Decision Tree": decision_tree,
    "Random Forest": random_forest,
    "SVM": svm,
    "SGD Classifier": sgd_classifier,
    "Logistic Regression": logistic_regression
}

for model_name, model in models.items():
    # Predict on the held-out test split
    y_pred = model.predict(X_test)

    # Score the predictions by accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy:", accuracy)

# Random Forest shows the highest accuracy (0.9737), so it is most likely the best
# model here; Logistic Regression is second at 0.9474.

Decision Tree Accuracy: 0.9122807017543859
Random Forest Accuracy: 0.9736842105263158
SVM Accuracy: 0.9035087719298246
SGD Classifier Accuracy: 0.7280701754385965
Logistic Regression Accuracy: 0.9473684210526315


In [None]:
# Record of an earlier Logistic Regression attempt that used max_iter=1000.
# NOTE(review): the block below is a bare string literal, not a comment, so the
# notebook echoes it as this cell's output; consider converting it to comments
# or deleting the cell.
'''
# ⑤ Logistic Regression 학습
logistic_regression = LogisticRegression(random_state=7, max_iter=1000)
logistic_regression.fit(X_train, y_train)  # 모델 학습
y_pred_logistic = logistic_regression.predict(X_test)  # 예측
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logistic))  # 평가
print(classification_report(y_test, y_pred_logistic, target_names=data.target_names))
'''
# With max_iter=1000:
# the solver only ran up to 1000 iterations... a better-optimized value is needed
# /usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
# STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

# Increase the number of iterations (max_iter) or scale the data as shown in:
#     https://scikit-learn.org/stable/modules/preprocessing.html
# Please also refer to the documentation for alternative solver options:
#     https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
#   n_iter_i = _check_optimize_result(

'\n# ⑤ Logistic Regression 학습\nlogistic_regression = LogisticRegression(random_state=7, max_iter=1000)\nlogistic_regression.fit(X_train, y_train)  # 모델 학습\ny_pred_logistic = logistic_regression.predict(X_test)  # 예측\nprint("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logistic))  # 평가\nprint(classification_report(y_test, y_pred_logistic, target_names=data.target_names))\n'