In [1]:
from sklearn.datasets import load_breast_cancer

In [2]:
raw_data = load_breast_cancer()

In [3]:
print(raw_data['DESCR'])

.. _breast_cancer_dataset:

Breast cancer Wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:
    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry
    - fractal dimension ("coastline approximation" - 1)

    The mean, standard error, and "worst" or largest (mean of the three
    worst/largest values) of these features were computed for each image,
    resulting in 30 features.  For instance, field 0 is Mean Radius, field
    10 is Radius SE, field 20 is Worst Radius.

    - 

In [4]:
raw_data['feature_names']

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [5]:
raw_data['data'].shape

(569, 30)

In [6]:
import pandas as pd 
df_X = pd.DataFrame(columns=raw_data['feature_names'], data=raw_data['data'])

In [7]:
target = raw_data['target']

In [8]:
raw_data['target_names']

array(['malignant', 'benign'], dtype='<U9')

In [9]:
df_X.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [10]:
df_X.shape

(569, 30)

In [11]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

1. 데이터셋 2개 생성 (훈련, 테스트)
- MinMax 적용한 데이터 셋 
- StandardScaler 적용한 데이터 셋 
- 훈련 : 80%, 테스트 : 20% 

In [12]:
help(train_test_split)

Help on function train_test_split in module sklearn.model_selection._split:

train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)
    Split arrays or matrices into random train and test subsets.

    Quick utility that wraps input validation,
    ``next(ShuffleSplit().split(X, y))``, and application to input data
    into a single call for splitting (and optionally subsampling) data into a
    one-liner.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        com

In [13]:
train_X, test_X, train_y, test_y = train_test_split(df_X, target, test_size=0.2)

In [14]:
ss = StandardScaler()
ss.fit(train_X)
train_X_ss = ss.transform(train_X)
test_X_ss = ss.transform(test_X)

In [15]:
ss2 = StandardScaler()
ss2.fit(train_X)
train_X_ss2 = ss2.transform(train_X)
ss3 = StandardScaler()
ss3.fit(test_X)
test_X_ss3 = ss3.transform(test_X)


In [16]:
pd.DataFrame(test_X_ss3).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
count,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,...,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0
mean,-8.161113e-16,-1.309868e-15,-1.024522e-15,-6.466562e-16,3.459221e-15,-8.017466e-16,3.364755e-16,-8.99378e-16,-2.460994e-15,1.414074e-15,...,-7.693651000000001e-17,7.265144e-16,-9.884880000000001e-17,-5.745891e-17,3.845608e-16,8.448408000000001e-17,-1.721333e-16,-3.078678e-16,1.743245e-16,-7.45992e-16
std,1.004415,1.004415,1.004415,1.004415,1.004415,1.004415,1.004415,1.004415,1.004415,1.004415,...,1.004415,1.004415,1.004415,1.004415,1.004415,1.004415,1.004415,1.004415,1.004415,1.004415
min,-1.585739,-1.930241,-1.576493,-1.232048,-2.086385,-1.410613,-1.130733,-1.310363,-1.923328,-1.517194,...,-1.43892,-1.570561,-1.460655,-1.08586,-2.17847,-1.381153,-1.33005,-1.86908,-1.842361,-1.359004
25%,-0.7102138,-0.7644745,-0.7183022,-0.6804531,-0.7741592,-0.733296,-0.7615226,-0.6931166,-0.7259447,-0.7362793,...,-0.6766419,-0.8035574,-0.7139936,-0.6567856,-0.6888333,-0.691455,-0.8152353,-0.8258471,-0.6780241,-0.6784838
50%,-0.261788,-0.1601659,-0.2618484,-0.3366588,-0.08759011,-0.1614544,-0.3146263,-0.3633749,-0.07940021,-0.2494242,...,-0.2995913,-0.1670234,-0.3229478,-0.3767248,-0.0694416,-0.2054437,-0.2748479,-0.2145329,-0.1060276,-0.3249853
75%,0.4521624,0.648509,0.4531733,0.3279262,0.7003545,0.5205484,0.4351511,0.5344137,0.5692711,0.5617608,...,0.319236,0.820988,0.4789732,0.2000012,0.6506013,0.3531261,0.5176582,0.6223002,0.524162,0.4313042
max,3.668823,2.517471,3.679616,4.432831,2.164426,3.595593,3.995908,3.670574,3.406411,4.353897,...,3.37804,2.761033,3.276219,3.949614,2.5191,3.492099,4.109857,2.756005,3.590896,3.308303


In [18]:
from sklearn.linear_model import LogisticRegression

In [21]:
# 첫번째 훈련 데이터 standard scaler 사용하여 테스트 데이터도 scaler 경우 
lrg = LogisticRegression()
lrg.fit(train_X_ss, train_y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [25]:
from sklearn.metrics import accuracy_score

In [24]:
test_y == lrg.predict(test_X_ss)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

In [22]:
lrg.predict(test_X_ss)

array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1])

In [26]:
accuracy_score(lrg.predict(test_X_ss), test_y)

1.0

In [27]:
# 첫번째 훈련 데이터 standard scaler 사용하여 테스트 데이터도 scaler 경우 
lrg2 = LogisticRegression()
lrg2.fit(train_X_ss2, train_y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [28]:
accuracy_score(lrg.predict(test_X_ss3), test_y)

0.9912280701754386

In [29]:
ss_min = MinMaxScaler()
train_X_ss_minmax = ss_min.fit_transform(train_X)
test_X_ss_minmax = ss_min.transform(test_X)

In [34]:
pd.DataFrame(train_X_ss_minmax).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
count,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0,...,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0
mean,0.336983,0.325031,0.331739,0.215843,0.397575,0.261733,0.206998,0.242531,0.381814,0.270602,...,0.295596,0.36631,0.281655,0.169907,0.405252,0.219431,0.214626,0.391503,0.265146,0.188893
std,0.166572,0.143235,0.16788,0.148648,0.132011,0.163957,0.186527,0.1942,0.142896,0.149344,...,0.170838,0.162524,0.166444,0.138792,0.15597,0.153682,0.165197,0.228313,0.125961,0.118961
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.223106,0.225397,0.216986,0.117306,0.304776,0.137262,0.069189,0.097291,0.282071,0.165122,...,0.180719,0.249867,0.168435,0.081351,0.297365,0.112845,0.092212,0.216701,0.184703,0.106454
50%,0.3038,0.310788,0.293345,0.17315,0.396678,0.223299,0.138918,0.165308,0.370202,0.249579,...,0.250445,0.358475,0.23537,0.123722,0.398402,0.178818,0.18107,0.338763,0.248571,0.164305
75%,0.415732,0.401928,0.417456,0.272344,0.476393,0.340991,0.307873,0.368265,0.457576,0.339301,...,0.386695,0.463753,0.372728,0.221392,0.496797,0.306789,0.302476,0.57079,0.318648,0.242883
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [35]:
lrg3 = LogisticRegression()
lrg3.fit(train_X_ss_minmax, train_y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [36]:
accuracy_score(lrg.predict(test_X_ss_minmax), test_y)

0.3333333333333333