# Sklearn

## sklearn.model_selection

Документация: https://scikit-learn.org/stable/modules/cross_validation.html

In [1]:
from sklearn import model_selection, datasets

import numpy as np

### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [2]:
# Load the iris dataset through scikit-learn's datasets module; the split cell
# below reads its .data (features) and .target (class labels) attributes.
iris = datasets.load_iris()

In [3]:
# Hold out 30% of the samples for testing. No random_state is passed, so the
# exact composition of the split changes on every run.
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
)

In [4]:
# Sanity check: the test split should really make up 0.3 of all the data.
# The explicit float() keeps this a true division on Python 2 as well.
len(test_labels) / float(len(iris.data))

0.3

In [5]:
# Report how many objects ended up in each part of the split.
# print is called with a single parenthesised string, so this cell produces the
# same output on Python 2 and Python 3 (the bare print statement is a
# SyntaxError on Python 3).
print('Размер обучающей выборки: {} объектов \nРазмер тестовой выборки: {} объектов'.format(
    len(train_data), len(test_data)))

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [6]:
# Show the first five feature rows of each part of the split.
# Each message is built as ONE string before printing: this runs on both
# Python 2 and Python 3 and avoids the stray space that print(a, b) would put
# after the header's trailing newline on Python 3.
print('Обучающая выборка:\n' + str(train_data[:5]))
print('\n')
print('Тестовая выборка:\n' + str(test_data[:5]))

Обучающая выборка:
[[7.6 3.  6.6 2.1]
 [6.7 3.1 5.6 2.4]
 [4.8 3.4 1.6 0.2]
 [4.9 3.1 1.5 0.1]
 [6.7 3.1 4.7 1.5]]


Тестовая выборка:
[[6.5 3.  5.2 2. ]
 [5.1 3.3 1.7 0.5]
 [4.9 2.5 4.5 1.7]
 [6.6 2.9 4.6 1.3]
 [5.5 2.5 4.  1.3]]


In [7]:
# Show the class labels that landed in each part of the split.
# Each message is built as ONE string before printing: this runs on both
# Python 2 and Python 3 and avoids the stray space that print(a, b) would put
# after the header's trailing newline on Python 3.
print('Метки классов на обучающей выборке:\n' + str(train_labels))
print('\n')
print('Метки классов на тестовой выборке:\n' + str(test_labels))

Метки классов на обучающей выборке:
[2 2 0 0 1 0 1 0 1 2 2 1 1 0 1 2 2 0 1 0 2 1 2 1 2 0 2 0 2 1 1 1 2 2 2 0 1
 1 0 0 0 1 1 2 2 2 2 1 2 1 0 2 2 1 1 1 0 2 2 1 1 2 0 1 0 0 2 0 2 1 2 1 0 0
 2 2 0 1 2 2 0 1 2 0 0 0 1 0 1 2 1 0 2 1 2 2 0 1 1 2 0 1 2 0 0]


Метки классов на тестовой выборке:
[2 0 2 1 1 2 2 1 2 1 1 2 0 2 2 0 1 0 1 1 1 2 1 1 0 0 0 0 1 0 0 1 1 0 0 2 0
 0 2 0 0 1 2 0 0]


### Стратегии проведения кросс-валидации

In [8]:
# Build a tiny stand-in dataset in which every element equals its own index,
# so the indices printed by the splitters below are easy to read.
X = range(10)

#### KFold

In [9]:
# KFold with 5 splits and no shuffling.
kf = model_selection.KFold(n_splits=5)
for train_indices, test_indices in kf.split(X):
    # A single formatted string prints identically on Python 2 and Python 3
    # (print(a, b) would print a tuple on Python 2; the bare print statement
    # is a SyntaxError on Python 3).
    print('{} {}'.format(train_indices, test_indices))

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [10]:
# KFold with 2 splits and shuffling enabled. No random_state is given, so the
# partition differs from run to run.
kf = model_selection.KFold(n_splits=2, shuffle=True)
for train_indices, test_indices in kf.split(X):
    # A single formatted string prints identically on Python 2 and Python 3.
    print('{} {}'.format(train_indices, test_indices))

[0 2 3 5 7] [1 4 6 8 9]
[1 4 6 8 9] [0 2 3 5 7]


In [11]:
# KFold with 2 splits, shuffling enabled and a fixed random_state of 1.
kf = model_selection.KFold(n_splits=2, shuffle=True, random_state=1)
for train_indices, test_indices in kf.split(X):
    # A single formatted string prints identically on Python 2 and Python 3.
    print('{} {}'.format(train_indices, test_indices))

[1 3 5 7 8] [0 2 4 6 9]
[0 2 4 6 9] [1 3 5 7 8]


#### StratifiedKFold

In [12]:
# Labels arranged in two contiguous blocks: five 0s followed by five 1s.
y = np.array([0] * 5 + [1] * 5)
print(y)

# StratifiedKFold receives the labels alongside the data when splitting.
skf = model_selection.StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
for train_indices, test_indices in skf.split(X, y):
    # A single formatted string prints identically on Python 2 and Python 3
    # (the bare print statement is a SyntaxError on Python 3).
    print('{} {}'.format(train_indices, test_indices))

[0 0 0 0 0 1 1 1 1 1]
[3 4 8 9] [0 1 2 5 6 7]
[0 1 2 5 6 7] [3 4 8 9]


In [13]:
# Labels that alternate 0, 1, 0, 1, ... across the ten objects.
target = np.array([0, 1] * 5)
print(target)

# Shuffled StratifiedKFold with no random_state: the folds change between runs.
skf = model_selection.StratifiedKFold(n_splits=2, shuffle=True)
for train_indices, test_indices in skf.split(X, target):
    # A single formatted string prints identically on Python 2 and Python 3
    # (the bare print statement is a SyntaxError on Python 3).
    print('{} {}'.format(train_indices, test_indices))

[0 1 0 1 0 1 0 1 0 1]
[2 5 6 7] [0 1 3 4 8 9]
[0 1 3 4 8 9] [2 5 6 7]


#### ShuffleSplit

In [14]:
# ShuffleSplit configured for 10 splits with test_size = 0.2. No random_state
# is given, so the printed indices vary between runs.
ss = model_selection.ShuffleSplit(n_splits=10, test_size=0.2)

for train_indices, test_indices in ss.split(X):
    # A single formatted string prints identically on Python 2 and Python 3
    # (the bare print statement is a SyntaxError on Python 3).
    print('{} {}'.format(train_indices, test_indices))

[8 6 4 3 1 0 2 7] [5 9]
[0 3 7 8 4 2 5 1] [6 9]
[6 8 2 0 1 5 7 4] [3 9]
[7 5 0 2 6 3 9 8] [4 1]
[6 0 2 8 5 1 7 4] [3 9]
[4 0 3 8 2 5 1 7] [9 6]
[3 2 9 1 7 4 0 6] [8 5]
[1 9 8 3 6 5 4 2] [7 0]
[9 2 1 7 5 0 6 4] [3 8]
[9 7 2 3 0 8 5 6] [4 1]


#### StratifiedShuffleSplit

In [15]:
# Labels arranged in two contiguous blocks: five 0s followed by five 1s.
target = np.array([0] * 5 + [1] * 5)
print(target)

# StratifiedShuffleSplit configured for 4 splits with test_size = 0.2; the
# labels are passed to split() alongside the data.
sss = model_selection.StratifiedShuffleSplit(n_splits=4, test_size=0.2)
for train_indices, test_indices in sss.split(X, target):
    # A single formatted string prints identically on Python 2 and Python 3
    # (the bare print statement is a SyntaxError on Python 3).
    print('{} {}'.format(train_indices, test_indices))

[0 0 0 0 0 1 1 1 1 1]
[1 6 8 4 5 7 2 3] [9 0]
[8 4 1 9 6 0 7 3] [2 5]
[7 4 2 8 6 9 3 1] [0 5]
[5 6 9 3 2 1 4 8] [0 7]


#### Leave-One-Out

In [16]:
# Leave-One-Out splitter, created with no arguments.
loo = model_selection.LeaveOneOut()

for train_indices, test_index in loo.split(X):
    # A single formatted string prints identically on Python 2 and Python 3
    # (the bare print statement is a SyntaxError on Python 3).
    print('{} {}'.format(train_indices, test_index))

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


Больше стратегий проведения кросс-валидации доступно здесь: https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators