## 유방암 여부 로지스틱 회귀 예제

#### 라이브러리 및 패키지 Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


#### 데이터셋 불러오기

In [2]:
# Load the breast-cancer dataset via the loader imported from sklearn.datasets.
# The returned object is used below both through .keys() and through attribute
# access (.data, .target, .feature_names, .DESCR).
breast_cancer = load_breast_cancer()

In [3]:
# Show, one item per line:
#   1. the keys available on the loaded dataset object
#   2. the (rows, columns) shape of the feature matrix
#   3. the names of the feature columns
print(
    breast_cancer.keys(),
    breast_cancer.data.shape,
    breast_cancer.feature_names,
    sep="\n",
)

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
(569, 30)
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [4]:
# Print the description text that ships with the dataset (its DESCR entry).
print(breast_cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:
    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry
    - fractal dimension ("coastline approximation" - 1)

    The mean, standard error, and "worst" or largest (mean of the three
    worst/largest values) of these features were computed for each image,
    resulting in 30 features.  For instance, field 0 is Mean Radius, field
    10 is Radius SE, field 20 is Worst Radius.

    - 

#### 데이터 전처리

In [5]:
# Build a DataFrame from the raw feature matrix and keep only the first ten
# columns (the "mean ..." measurements in the feature-name listing above).
df = pd.DataFrame(
    breast_cancer.data,
    columns=breast_cancer.feature_names,
).iloc[:, :10]

In [6]:
# Attach the class label as an extra (last) column.
df["label"] = breast_cancer.target

# Make the column names identifier-friendly: "mean radius" -> "mean_radius".
# regex=False keeps the space a literal character, not a pattern.
df.columns = df.columns.str.replace(" ", "_", regex=False)

# Preview the first rows.
df.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0


In [7]:
# Step 1) hold out 15% of the rows as the final test set
remainder, test = train_test_split(df, test_size=0.15, random_state=1)

# Step 2) split what is left again, taking 18% of it as the validation set
train, val = train_test_split(remainder, test_size=0.18, random_state=1)

# Sanity check: (rows, columns) of each split
train.shape, val.shape, test.shape

((396, 11), (87, 11), (86, 11))

In [8]:
# Feature / target setup.
# Feature columns are selected by name (every column except the label) rather
# than by position, so the selection does not silently depend on "label"
# happening to be the last column of the frame.
target = "label"
feature = train.columns.drop(target)

# Slice every split into a feature matrix X and a target vector y.
X_train, y_train = train[feature], train[target]
X_val, y_val = val[feature], val[target]
X_test, y_test = test[feature], test[target]

# Sanity check: the three X shapes should share a column count, and each y
# should have as many entries as its X has rows.
print("feature Matrix: ", X_train.shape, X_val.shape, X_test.shape)
print("target vector: ", y_train.shape, y_val.shape, y_test.shape)

feature Matrix:  (396, 10) (87, 10) (86, 10)
target vector:  (396,) (87,) (86,)


In [9]:
# Create the scaler
scaler = StandardScaler()

# Fit the scaling statistics on the training split only, then apply that same
# fitted transformation to the validation and test splits.
# NOTE: `X_train_sclaed` is a misspelling of "scaled"; the name is kept as-is
# because the model-training cell further down refers to it.
X_train_sclaed = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Spot-check the first feature column: its mean should be ~0 and its std ~1.
# Both statistics are taken on the same column so the check is consistent.
X_train_sclaed[:, 0].mean(), X_train_sclaed[:, 0].std()

(-2.444733528972567e-16, 1.0)

#### 로지스틱 회귀 모델 학습

In [10]:
# Create the model and train it.
# LogisticRegression() is constructed with no arguments, and is fitted on the
# scaled training features together with the training labels.
# NOTE: `X_train_sclaed` (sic) is the name assigned in the scaling cell above.
logistic = LogisticRegression()
logistic.fit(X_train_sclaed, y_train)

In [11]:
# Check the result on the validation set.
# score() is evaluated on the scaled validation features; the printed heading
# labels the displayed value as the validation accuracy.
print("validation 데이터셋 정확도")
logistic.score(X_val_scaled, y_val)

validation 데이터셋 정확도


0.9310344827586207

In [12]:
# Inspect the fitted coefficient of each feature (influence analysis).
# The model was trained on standardized features, so the coefficients are on a
# comparable scale across features.
logistic.coef_

array([[-0.94054808, -1.31667477, -0.81743154, -1.03033924, -0.90780749,
         0.08604782, -1.17240721, -1.42903504, -0.40733798,  0.4668457 ]])

#### Test set 결과 확인

In [13]:
# Final evaluation on the held-out test split.
# X_test_scaled was already produced in the scaling cell with the same fitted
# scaler, so it is reused here instead of transforming X_test a second time.
print("test 데이터셋 정확도")
logistic.score(X_test_scaled, y_test)

test 데이터셋 정확도


0.9534883720930233