# Sklearn

## sklearn.model_selection

документация: http://scikit-learn.org/stable/modules/cross_validation.html

In [9]:
from sklearn import model_selection, datasets

import numpy as np

### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [10]:
iris = datasets.load_iris()

In [11]:
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(iris.data, iris.target, 
                                                                                     test_size = 0.3)

In [12]:
len(test_labels) / len(iris.data)

0.3

In [28]:
print(f"Traing sample size: {len(train_data)}")
print(f"Testing sample size: {len(test_data)}")

Traing sample size: 105
Testing sample size: 45


In [30]:
print(f"Traing sample size {train_data[:5]}")
print(f"Testing sample size {test_data[:5]}")

Traing sample size [[5.5 2.4 3.7 1. ]
 [6.9 3.1 5.4 2.1]
 [7.4 2.8 6.1 1.9]
 [6.8 2.8 4.8 1.4]
 [5.  3.2 1.2 0.2]]
Testing sample size [[6.  2.7 5.1 1.6]
 [5.5 2.6 4.4 1.2]
 [4.6 3.4 1.4 0.3]
 [6.3 3.3 4.7 1.6]
 [6.9 3.2 5.7 2.3]]


In [31]:
print(f"Training labels {train_labels}")
print(f"Testing labels {test_labels}")

Training labels [1 2 2 1 0 1 1 1 2 0 0 1 1 2 2 1 0 0 1 2 0 0 2 1 1 0 2 0 0 2 0 2 0 0 1 1 2
 2 0 2 0 0 0 1 0 2 2 0 2 2 1 1 2 1 2 2 1 0 1 2 0 2 2 0 0 2 2 2 0 1 1 1 0 2
 2 1 2 0 0 1 2 2 2 2 0 1 1 0 2 1 1 2 1 1 0 2 0 1 2 1 2 1 1 0 0]
Testing labels [1 1 0 1 2 0 2 2 0 0 2 0 2 1 0 0 2 1 1 0 1 2 0 1 0 0 1 2 1 0 1 1 2 2 0 1 1
 0 0 2 1 2 0 0 1]


### Стратегии проведения кросс-валидации

In [16]:
#сгенерируем короткое подобие датасета, где элементы совпадают с порядковым номером
X = range(10)

#### KFold

In [17]:
kf = model_selection.KFold(n_splits = 5)
for train_indices, test_indices in kf.split(X):
    print(train_indices, test_indices)

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [18]:
kf = model_selection.KFold(n_splits = 2, shuffle = True)
for train_indices, test_indices in kf.split(X):
    print(train_indices, test_indices)

[0 3 4 5 9] [1 2 6 7 8]
[1 2 6 7 8] [0 3 4 5 9]


In [19]:
kf = model_selection.KFold(n_splits = 2, shuffle = True, random_state = 1)
for train_indices, test_indices in kf.split(X):
    print(train_indices, test_indices)

[1 3 5 7 8] [0 2 4 6 9]
[0 2 4 6 9] [1 3 5 7 8]


#### StratifiedKFold

In [20]:
y = np.array([0] * 5 + [1] * 5)
print(y)

skf = model_selection.StratifiedKFold(n_splits = 2, shuffle = True, random_state = 0)
for train_indices, test_indices in skf.split(X, y):
    print(train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]
[3 4 6 8 9] [0 1 2 5 7]
[0 1 2 5 7] [3 4 6 8 9]


In [21]:
target = np.array([0, 1] * 5)
print(target)

skf = model_selection.StratifiedKFold(n_splits = 2,shuffle = True)
for train_indices, test_indices in skf.split(X, target):
    print(train_indices, test_indices)

[0 1 0 1 0 1 0 1 0 1]
[0 1 6 7 9] [2 3 4 5 8]
[2 3 4 5 8] [0 1 6 7 9]


#### ShuffleSplit

In [22]:
ss = model_selection.ShuffleSplit(n_splits = 10, test_size = 0.2)

for train_indices, test_indices in ss.split(X):
    print(train_indices, test_indices)

[4 0 6 7 3 8 1 5] [9 2]
[6 5 2 3 9 1 4 8] [7 0]
[1 2 4 8 0 3 9 7] [5 6]
[3 9 8 1 2 0 5 4] [7 6]
[7 8 0 3 5 9 2 6] [1 4]
[4 6 2 8 3 7 0 5] [1 9]
[8 2 4 1 3 0 5 7] [9 6]
[0 4 8 1 9 7 5 2] [6 3]
[1 8 3 6 4 0 2 5] [9 7]
[6 9 3 0 4 2 8 1] [5 7]


#### StratifiedShuffleSplit

In [23]:
target = np.array([0] * 5 + [1] * 5)
print(target)

sss = model_selection.StratifiedShuffleSplit(n_splits = 4, test_size = 0.2)
for train_indices, test_indices in sss.split(X, target):
    print(train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]
[6 3 4 5 8 1 9 2] [0 7]
[3 1 6 7 9 0 2 8] [4 5]
[2 5 4 8 6 0 3 9] [1 7]
[5 6 7 4 0 1 2 8] [9 3]


#### Leave-One-Out

In [24]:
loo = model_selection.LeaveOneOut()

for train_indices, test_index in loo.split(X):
    print(train_indices, test_index)

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


Больше стратегий проведения кросс-валидации доступно здесь: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators