In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [2]:
# Prepare the data: load the breast-cancer dataset and split it into train/test sets
from sklearn.datasets import load_breast_cancer

bc_dataset = load_breast_cancer()
X, y = bc_dataset.data, bc_dataset.target

# Stratify on the target so both splits follow the class proportions of y;
# the fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [3]:
# Tabular view of the features with the target appended as a 'label' column
breast_cancer_df = pd.DataFrame(X, columns=bc_dataset.feature_names).assign(label=y)
breast_cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [5]:
# Pairwise correlation matrix of every column (pandas default method),
# then show only each column's correlation with the target.
breast_cancer_corr = breast_cancer_df.corr()
breast_cancer_corr.loc[:, 'label']

mean radius               -0.730029
mean texture              -0.415185
mean perimeter            -0.742636
mean area                 -0.708984
mean smoothness           -0.358560
mean compactness          -0.596534
mean concavity            -0.696360
mean concave points       -0.776614
mean symmetry             -0.330499
mean fractal dimension     0.012838
radius error              -0.567134
texture error              0.008303
perimeter error           -0.556141
area error                -0.548236
smoothness error           0.067016
compactness error         -0.292999
concavity error           -0.253730
concave points error      -0.408042
symmetry error             0.006522
fractal dimension error   -0.077972
worst radius              -0.776454
worst texture             -0.456903
worst perimeter           -0.782914
worst area                -0.733825
worst smoothness          -0.421465
worst compactness         -0.590998
worst concavity           -0.659610
worst concave points      -0

In [6]:
# Feature selection: keep the top 50% of features as scored by f_classif
from sklearn.feature_selection import SelectPercentile, f_classif

feature_selector = SelectPercentile(score_func=f_classif, percentile=50)

# Fit on the training split only, then apply the same column mask to the test split
X_selected_train = feature_selector.fit_transform(X_train, y_train)
X_selected_test = feature_selector.transform(X_test)

In [9]:
# Compare the feature count before/after selection, then list the retained feature names
print(X_train.shape, X_selected_train.shape)
feature_selector.get_feature_names_out(input_features=bc_dataset.feature_names)

(426, 30) (426, 15)


array(['mean radius', 'mean perimeter', 'mean area', 'mean compactness',
       'mean concavity', 'mean concave points', 'radius error',
       'perimeter error', 'area error', 'worst radius', 'worst perimeter',
       'worst area', 'worst compactness', 'worst concavity',
       'worst concave points'], dtype=object)

In [10]:
# Train the model on the selected features and report (train, test) accuracy
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be chained
logreg = LogisticRegression(max_iter=10000).fit(X_selected_train, y_train)

(logreg.score(X_selected_train, y_train),
 logreg.score(X_selected_test, y_test))

(0.9460093896713615, 0.958041958041958)

In [13]:
# Building a pipeline, variant 1: Pipeline with explicitly named steps
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline

# Steps run in order: select features -> scale to [0, 1] -> classify
pipeline_steps = [
    ("feature_selector", SelectPercentile(score_func=f_classif, percentile=50)),
    ("minmax_scaler", MinMaxScaler()),
    ("logreg_classifier", LogisticRegression(max_iter=10000)),
]
pipeline = Pipeline(steps=pipeline_steps)
pipeline.fit(X_train, y_train)

# (train accuracy, test accuracy)
(pipeline.score(X_train, y_train),
 pipeline.score(X_test, y_test))

(0.9436619718309859, 0.9300699300699301)

In [16]:
# Exploring the pipeline: print its steps, then ask the fitted selector which features it kept
print(pipeline.steps)
pipeline.named_steps["feature_selector"].get_feature_names_out(bc_dataset.feature_names)

[('feature_selector', SelectPercentile(percentile=50)), ('minmax_scaler', MinMaxScaler()), ('logreg_classifier', LogisticRegression(max_iter=10000))]


array(['mean radius', 'mean perimeter', 'mean area', 'mean compactness',
       'mean concavity', 'mean concave points', 'radius error',
       'perimeter error', 'area error', 'worst radius', 'worst perimeter',
       'worst area', 'worst compactness', 'worst concavity',
       'worst concave points'], dtype=object)

In [17]:
# Building a pipeline, variant 2: make_pipeline generates the step names automatically
from sklearn.ensemble import RandomForestClassifier

pipeline2 = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=50),
    MinMaxScaler(),
    # Seed the forest (same seed as the train/test split) so the fitted model and
    # the scores below are reproducible on Restart & Run All.
    RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=42),
)

pipeline2.fit(X_train, y_train)

# (train accuracy, test accuracy)
pipeline2.score(X_train, y_train), pipeline2.score(X_test, y_test)

(0.9929577464788732, 0.958041958041958)

In [18]:
# Show the (name, estimator) pairs of the pipeline built with make_pipeline
pipeline2.steps

[('selectpercentile', SelectPercentile(percentile=50)),
 ('minmaxscaler', MinMaxScaler()),
 ('randomforestclassifier',
  RandomForestClassifier(max_depth=5, n_estimators=1000))]

In [None]:
# NOTE(review): bare `_` is IPython's "previous output" variable; this unexecuted cell
# looks like a leftover scratch cell — confirm and consider removing it.
_