# 単純に上下半々はダメなことも
前回、breast-cancer-wisconsinのデータでは上半分を訓練用データに、下半分をテスト用データに使いました。
しかし、データがクラスごとに順番に並んでいる場合など、このような単純な分け方ではうまくいかないこともあります。

# データを入手する
UCI Machine Learning Repository から iris.data をダウンロードし、作業フォルダーに保存してください。
https://archive.ics.uci.edu/ml/machine-learning-databases/iris/

In [1]:
import pandas as pd

In [2]:
# Load the iris measurements. header=None tells pandas the file has no header
# row, so the five column names are supplied explicitly.
df = pd.read_csv('iris.data', header=None, names=['sepal_length', 'sepal_width',
                                                  'petal_length', 'petal_width', 'class'])

In [3]:
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Feature matrix: every column except the last one (the 'class' label).
X = df.iloc[:,:-1]

In [5]:
# Preview the features; the 'class' column should no longer be present.
X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
# Target vector: encode the three species names as integer labels 0, 1, 2.
# Series.map is used instead of Series.replace because replacing strings with
# ints relies on implicit object -> int64 downcasting, which newer pandas
# releases deprecate (FutureWarning). map builds the integer Series directly.
y = df.iloc[:, -1].map({
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2,
})

In [7]:
# Preview the encoded labels; they should be integers rather than species names.
y.head()

0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int64

In [8]:
# Cut the dataset at its midpoint: the first half is for training and the
# remainder (one extra row when the row count is odd) is for testing.
n_samples = X.shape[0]
n_train, n_test = n_samples // 2, n_samples - n_samples // 2

In [9]:
# Row positions in file order: the first n_train rows train the model and
# every row after them is held out for testing.
train_index = range(n_train)
test_index = range(train_index.stop, n_samples)

In [10]:
# Slice features and labels by position so that each X row stays paired
# with its y label.
X_train = X.iloc[train_index]
X_test = X.iloc[test_index]
y_train = y.iloc[train_index]
y_test = y.iloc[test_index]

In [11]:
from sklearn import linear_model

In [12]:
# C is scikit-learn's inverse regularization strength, so this very large
# value leaves the L2 penalty almost inactive.
clf = linear_model.LogisticRegression(penalty='l2', C=1e8)
# Fit on the first half of the file only.
clf.fit(X_train, y_train)

LogisticRegression(C=100000000.0)

In [13]:
# Mean accuracy of the fitted classifier on the held-out second half.
print(clf.score(X_test, y_test))

0.3333333333333333


In [14]:
# Print each (predicted, actual) pair, flag the mismatches, and count them.
wrong = 0
for predicted, actual in zip(clf.predict(X_test), y_test):
    if predicted != actual:
        print(predicted, actual, ' wrong!')
        wrong += 1
        continue
    print(predicted, actual)

1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!
1 2  wrong!


In [15]:
# Summarize as "<misclassified> / <test rows> = <accuracy>"; the accuracy is
# recomputed by hand from the mismatch count.
print('{0} / {1} = {2}'.format(wrong, n_test, 1 - wrong / n_test))

50 / 75 = 0.33333333333333337


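## なぜこうなった？
iris.data はクラス順（0 → 1 → 2）に並んでいるため、上半分にはクラス 0 と 1 しか含まれず、モデルはクラス 2 を一度も学習できません。訓練用とテスト用のラベルを確認してみましょう。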

In [16]:
# Inspect which class labels ended up in the training half.
y_train

0     0
1     0
2     0
3     0
4     0
     ..
70    1
71    1
72    1
73    1
74    1
Name: class, Length: 75, dtype: int64

In [17]:
# Inspect which class labels ended up in the test half.
y_test

75     1
76     1
77     1
78     1
79     1
      ..
145    2
146    2
147    2
148    2
149    2
Name: class, Length: 75, dtype: int64

## 対策

In [18]:
from sklearn.model_selection import ShuffleSplit

In [19]:
# One random split, half of the rows for training and half for testing.
# random_state=0 fixes the shuffle so the split is reproducible.
ss = ShuffleSplit(n_splits=1, train_size=0.5, test_size=0.5, random_state=0)

In [20]:
# split() is a generator of (train, test) arrays of row positions; next()
# takes the single split configured by n_splits=1.
train_index, test_index = next(ss.split(X))

In [21]:
# Display both index arrays to confirm the row positions are now shuffled.
train_index, test_index

(array([  3, 149,  98,   6,  68, 109,  96,  12, 102, 120, 104, 128,  46,
         11, 110, 124,  41, 148,   1, 113, 139,  42,   4, 129,  17,  38,
          5,  53, 143, 105,   0,  34,  28,  55,  75,  35,  23,  74,  31,
        118,  57, 131,  65,  32, 138,  14, 122,  19,  29, 130,  49, 136,
         99,  82,  79, 115, 145,  72,  77,  25,  81, 140, 142,  39,  58,
         88,  70,  87,  36,  21,   9, 103,  67, 117,  47]),
 array([114,  62,  33, 107,   7, 100,  40,  86,  76,  71, 134,  51,  73,
         54,  63,  37,  78,  90,  45,  16, 121,  66,  24,   8, 126,  22,
         44,  97,  93,  26, 137,  84,  27, 127, 132,  59,  18,  83,  61,
         92, 112,   2, 141,  43,  10,  60, 116, 144, 119, 108,  69, 135,
         56,  80, 123, 133, 106, 146,  50, 147,  85,  30, 101,  94,  64,
         89,  91, 125,  48,  13, 111,  95,  20,  15,  52]))

In [22]:
# Rebuild the training and test sets from the shuffled row positions,
# keeping each X row paired with its y label.
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

In [23]:
# Check that the training labels are no longer grouped by class.
y_train

3      0
149    2
98     1
6      0
68     1
      ..
9      0
103    2
67     1
117    2
47     0
Name: class, Length: 75, dtype: int64

In [24]:
# Check that the test labels are no longer grouped by class.
y_test

114    2
62     1
33     0
107    2
7      0
      ..
111    2
95     1
20     0
15     0
52     1
Name: class, Length: 75, dtype: int64

In [25]:
# Refit the existing classifier on the shuffled training half.
clf.fit(X_train, y_train)

LogisticRegression(C=100000000.0)

In [26]:
# Mean accuracy on the shuffled held-out half.
print(clf.score(X_test, y_test))

0.9733333333333334


In [27]:
# Walk through predictions and true labels in lockstep, printing every pair
# and counting the misclassified ones.
wrong = 0
for guess, truth in zip(clf.predict(X_test), y_test):
    if guess != truth:
        print(guess, truth, ' wrong!')
        wrong += 1
    else:
        print(guess, truth)

2 2
1 1
0 0
2 2
0 0
2 2
0 0
1 1
1 1
1 1
2 2
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
0 0
2 2
1 1
0 0
0 0
1 2  wrong!
0 0
0 0
1 1
1 1
0 0
2 2
1 1
0 0
2 2
2 2
1 1
0 0
2 1  wrong!
1 1
1 1
2 2
0 0
2 2
0 0
0 0
1 1
2 2
2 2
2 2
2 2
1 1
2 2
1 1
1 1
2 2
2 2
2 2
2 2
1 1
2 2
1 1
0 0
2 2
1 1
1 1
1 1
1 1
2 2
0 0
0 0
2 2
1 1
0 0
0 0
1 1


In [28]:
# Summarize as "<misclassified> / <test rows> = <accuracy>".
# The denominator comes from the current y_test instead of the n_test
# computed for the earlier sequential split. That value only matches here
# because both splits hold out half of the data.
print('{0} / {1} = {2}'.format(wrong, len(y_test), 1 - wrong / len(y_test)))

2 / 75 = 0.9733333333333334


## 偶然よいスコアがでたんじゃない？

In [29]:
# Ten random 50/50 splits (reproducible via random_state=0), to check that
# the single good score was not just a lucky shuffle.
ss = ShuffleSplit(n_splits=10, train_size=0.5, test_size=0.5, random_state=0)

In [30]:
# Refit and evaluate the classifier once per random split, collecting the
# test accuracy of each round.
scores = []
for train_index, test_index in ss.split(X):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]
    # fit() returns the estimator itself, so the score call can be chained.
    score = clf.fit(X_train, y_train).score(X_test, y_test)
    scores.append(score)


In [31]:
# Show the accuracy obtained on each of the ten splits.
scores

[0.9733333333333334,
 0.9466666666666667,
 1.0,
 0.92,
 0.9466666666666667,
 0.8533333333333334,
 0.9733333333333334,
 1.0,
 0.96,
 0.9866666666666667]