# 層化（Stratified）して抽出するほうがよい

In [1]:
import pandas as pd

In [2]:
# Load the WDBC data file. It has no header row (header=None), so the column
# names are supplied here: an id, the diagnosis label, then 30 feature columns
# (10 measurements, each with a _mean, _se and _worst variant).
df = pd.read_csv('wdbc.data', header=None,
                 names=['id','diagnosis',
                        'radius_mean','texture_mean','perimeter_mean','area_mean',
                        'smoothness_mean','compactness_mean','concavity_mean',
                        'concave points_mean','symmetry_mean','fractal_dimension_mean',
                        'radius_se','texture_se','perimeter_se','area_se','smoothness_se',
                        'compactness_se','concavity_se','concave points_se','symmetry_se',
                        'fractal_dimension_se','radius_worst','texture_worst','perimeter_worst',
                        'area_worst','smoothness_worst','compactness_worst','concavity_worst',
                        'concave points_worst','symmetry_worst','fractal_dimension_worst'])

In [3]:
# Feature matrix: every column except the two leading non-feature columns
# ('id' and 'diagnosis').
X = df.drop(columns=['id', 'diagnosis'])

In [4]:
# Integer target from the diagnosis label: 'M' -> 0, 'B' -> 1.
# Series.map builds the integer column directly. Series.replace only produced
# integers through an implicit object-to-int downcast, which recent pandas
# versions deprecate; without it the target stays object dtype.
# Note: any label other than 'M' or 'B' would become NaN here.
y = df['diagnosis'].map({'M': 0, 'B': 1})

In [5]:
from sklearn import linear_model

In [39]:
# One logistic-regression estimator shared by all experiments below; it is
# re-fitted from scratch on every split.
# max_iter is raised to 10000 (presumably so the solver converges on the
# unscaled features; TODO confirm).
clf = linear_model.LogisticRegression(
    max_iter=10000,
)

## トレーニングデータとテストデータを均等（50%:50%）に抽出

In [40]:
from sklearn.model_selection import ShuffleSplit

In [41]:
# Ten random train/test partitions, half of the rows on each side.
# The fixed random_state makes the partitions repeatable.
ss = ShuffleSplit(
    n_splits=10,
    test_size=0.5,
    train_size=0.5,
    random_state=3,
)

In [42]:
# Fit on the training half of each split and print the accuracy on the
# held-out half. X_train / y_train / y_test keep the values of the LAST split
# after the loop, which the following cells inspect.
for train_index, test_index in ss.split(X):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]
    # fit() returns the estimator itself, so scoring can be chained onto it.
    print(clf.fit(X_train, y_train).score(X_test, y_test))

0.9543859649122807
0.9578947368421052
0.9473684210526315
0.9438596491228071
0.9719298245614035
0.9508771929824561
0.9578947368421052
0.9368421052631579
0.968421052631579
0.9508771929824561


### 最後の分割では、クラスの構成比はまあまあ揃っている。

In [43]:
# Class proportions over the full label vector.
y.value_counts(normalize=True)

1    0.627417
0    0.372583
Name: diagnosis, dtype: float64

In [44]:
# Class proportions in the training labels of the last split produced by the loop.
y_train.value_counts(normalize=True)

1    0.602113
0    0.397887
Name: diagnosis, dtype: float64

In [45]:
# Class proportions in the test labels of the last split produced by the loop.
y_test.value_counts(normalize=True)

1    0.652632
0    0.347368
Name: diagnosis, dtype: float64

## トレーニングデータとテストデータを不均等（95%:5%）に抽出

In [46]:
import numpy as np #平均を求めるため

In [47]:
# Ten random partitions with 95% of the rows for training and 5% for testing.
# The fixed random_state makes the partitions repeatable.
ss = ShuffleSplit(
    n_splits=10,
    test_size=0.05,
    train_size=0.95,
    random_state=3,
)

In [48]:
# Fit on each 95% training portion, print the accuracy on the 5% test
# portion, then print the mean accuracy over all splits.
# X_train / y_train / y_test keep the values of the LAST split afterwards.
scores = []
for train_index, test_index in ss.split(X):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]
    # fit() returns the estimator itself, so scoring can be chained onto it.
    score = clf.fit(X_train, y_train).score(X_test, y_test)
    scores.append(score)
    print(score)
print('平均', np.mean(scores))

0.9655172413793104
0.896551724137931
0.9655172413793104
0.9310344827586207
0.9655172413793104
1.0
1.0
0.9655172413793104
0.9310344827586207
1.0
平均 0.9620689655172414


### 最後の分割では、クラスの構成比はあまり揃っていない。

In [49]:
# Class proportions over the full label vector (reference for the comparison below).
y.value_counts(normalize=True)

1    0.627417
0    0.372583
Name: diagnosis, dtype: float64

In [50]:
# Class proportions in the training labels of the last 95/5 split.
y_train.value_counts(normalize=True)

1    0.622222
0    0.377778
Name: diagnosis, dtype: float64

In [51]:
# Class proportions in the test labels of the last 95/5 split.
y_test.value_counts(normalize=True)

1    0.724138
0    0.275862
Name: diagnosis, dtype: float64

## トレーニングデータとテストデータを層化（Stratified）抽出

In [52]:
from sklearn.model_selection import StratifiedShuffleSplit

In [53]:
# Ten random 95% / 5% partitions, drawn by StratifiedShuffleSplit using the
# labels passed to split(). The fixed random_state makes them repeatable.
ss = StratifiedShuffleSplit(
    n_splits=10,
    test_size=0.05,
    train_size=0.95,
    random_state=3,
)

In [54]:
# Same evaluation as the unstratified 95/5 run, except that split() also
# receives y so the splitter can stratify on the labels.
# X_train / y_train / y_test keep the values of the LAST split afterwards.
scores = []
for train_index, test_index in ss.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]
    # fit() returns the estimator itself, so scoring can be chained onto it.
    score = clf.fit(X_train, y_train).score(X_test, y_test)
    scores.append(score)
    print(score)
print('平均', np.mean(scores))

0.9310344827586207
0.896551724137931
0.9655172413793104
0.9655172413793104
1.0
1.0
0.9310344827586207
0.9655172413793104
1.0
1.0
平均 0.9655172413793103


### 最後の分割では、クラスの構成比がけっこう揃っている。

In [55]:
# Class proportions over the full label vector (reference for the comparison below).
y.value_counts(normalize=True)

1    0.627417
0    0.372583
Name: diagnosis, dtype: float64

In [56]:
# Class proportions in the training labels of the last stratified split.
y_train.value_counts(normalize=True)

1    0.627778
0    0.372222
Name: diagnosis, dtype: float64

In [57]:
# Class proportions in the test labels of the last stratified split.
y_test.value_counts(normalize=True)

1    0.62069
0    0.37931
Name: diagnosis, dtype: float64