In [1]:
from msdlib import msd
import numpy as np
import pandas as pd

### Creating a data set to split into Train, Validation and Test

In [2]:
# fixed seed passed to the random generators below so the data and labels are reproducible
seed = 1216

In [3]:
# 100 samples, each a 2 x 5 block of integers drawn from [0, 100)
X = np.random.RandomState(seed).randint(0, 100, size = (100, 2, 5))

In [4]:
# inspect the generated samples
X

array([[[93, 32, 12, 97, 28],
        [88, 99, 36, 28, 52]],

       [[58, 63, 56, 85,  5],
        [92, 87, 18, 70, 46]],

       [[30,  8, 36, 85, 54],
        [64, 26, 81, 83,  7]],

       [[ 5, 13, 53, 65, 72],
        [50,  3, 71, 70, 19]],

       [[12, 67, 90, 75, 46],
        [51, 40, 98, 39,  6]],

       [[99, 39, 67, 41, 81],
        [85,  6, 22,  4, 12]],

       [[99, 33, 38, 36, 51],
        [96, 51, 88, 67,  4]],

       [[83, 42, 45, 43, 34],
        [73, 96, 89,  7, 53]],

       [[42, 63, 80, 43,  8],
        [34, 41, 97, 74, 50]],

       [[17, 98, 39, 23, 41],
        [55, 78, 48, 77, 75]],

       [[76, 50, 75,  1,  2],
        [45, 68, 52, 33, 53]],

       [[81, 54, 81, 43, 69],
        [45,  8, 88, 32, 51]],

       [[55, 44, 37,  8, 95],
        [64, 23, 87,  6, 53]],

       [[27, 63, 74, 76, 76],
        [16, 43, 73, 21,  5]],

       [[70, 55, 90, 30, 89],
        [32, 51, 90, 14, 73]],

       [[68, 83, 44, 31, 67],
        [51, 50, 32,  2, 43]],

       [

In [5]:
# the first axis is the sample axis; the remaining axes hold each sample's values
X.shape

(100, 2, 5)

### Creating labels for the data set

In [6]:
# one binary label (0 or 1) per sample, generated from the same fixed seed
y = np.random.RandomState(seed).randint(0, 2, size = len(X))
y

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1])

In [7]:
# class balance of the generated labels
print('number of 0 : %d,     number of 1 : %d'%((y == 0).sum(), (y == 1).sum()))

number of 0 : 49,     number of 1 : 51


### Splitting data set

Here the data set will be split into 3 sections, commonly known as train, validation and test. The test section is optional.

There are three available types of splitting:
1. random_split
2. cross_validation_split
3. sequence_split (used for sequential data)

#### Initialization parameters for SplitDataset class

data: pandas dataframe or series or numpy array of dimension #samples X other dimensions with rank 2 or higher

label: pandas dataframe or series or numpy array with dimension #samples X #classes with rank 2

index : numpy array, we can explicitly insert indices of the data set and labels

test_ratio : float, ratio of test data

np_seed : int, numpy seed used for random shuffle, default is 1216

same_ratio : bool, should only be True if the labels are classification labels and we want to keep the ratio of the different classes' data the same in the test and validation data sets for all classes (it doesn't have any use in sequence split)

make_onehot : bool, if True, the labels will be converted into one hot encoded format

In [47]:
# defining object as splitter
# same_ratio = True asks for the class ratio to be kept the same in the validation and test sets;
# test_ratio = .1 is the ratio of the data set aside as test data
splitter = msd.SplitDataset(X, y, same_ratio = True, test_ratio = .1)

## random_split

#### Parameters for random_split function
val_ratio : float, validation data set ratio, default is .15

In [24]:
# applying random split with a validation ratio of .2
data = splitter.random_split(val_ratio = .2)

In [25]:
# list every original sample with its label and position, for comparison with the splits
for i, (sample, label) in enumerate(zip(X, y)):
    print(sample, label, i)

[[93 32 12 97 28]
 [88 99 36 28 52]] 1 0
[[58 63 56 85  5]
 [92 87 18 70 46]] 0 1
[[30  8 36 85 54]
 [64 26 81 83  7]] 0 2
[[ 5 13 53 65 72]
 [50  3 71 70 19]] 1 3
[[12 67 90 75 46]
 [51 40 98 39  6]] 0 4
[[99 39 67 41 81]
 [85  6 22  4 12]] 0 5
[[99 33 38 36 51]
 [96 51 88 67  4]] 1 6
[[83 42 45 43 34]
 [73 96 89  7 53]] 1 7
[[42 63 80 43  8]
 [34 41 97 74 50]] 0 8
[[17 98 39 23 41]
 [55 78 48 77 75]] 0 9
[[76 50 75  1  2]
 [45 68 52 33 53]] 0 10
[[81 54 81 43 69]
 [45  8 88 32 51]] 1 11
[[55 44 37  8 95]
 [64 23 87  6 53]] 0 12
[[27 63 74 76 76]
 [16 43 73 21  5]] 0 13
[[70 55 90 30 89]
 [32 51 90 14 73]] 1 14
[[68 83 44 31 67]
 [51 50 32  2 43]] 0 15
[[53 90 86  7 69]
 [ 0 34 75 41 78]] 1 16
[[72 79 49 35 57]
 [32 65  5 23 69]] 1 17
[[96 60 69 36 60]
 [ 6 37  7 56 18]] 0 18
[[72 29 93 87 32]
 [36 63 71 92 61]] 1 19
[[ 2 29 85 67 14]
 [97 39 15  4 49]] 0 20
[[98 45 96 28 37]
 [78 63 27 18 51]] 0 21
[[34 33 12 98  6]
 [26 23 31 59 35]] 0 22
[[18 90 67  3  9]
 [75 46 76 49 88]] 0 23
[[

#### Checking output of split function

In [26]:
# top-level entries of the split result, one per split
data.keys()

dict_keys(['train', 'validation', 'test'])

In [27]:
# entries stored for a single split
data['train'].keys()

dict_keys(['data', 'label', 'index'])

In [28]:
# positions in the original data set of the samples placed in the validation split
data['validation']['index']

array([25, 65,  4, 49, 15, 23, 76, 24, 45, 78, 14, 59, 46, 63,  0, 52, 34,
       95, 66])

In [29]:
# print each split's samples next to their label and original index
for s, part in data.items():
    print('\n############## %s #############\n'%s)
    samples, labels, indices = part['data'], part['label'], part['index']
    for i in range(samples.shape[0]):
        print(samples[i], labels[i], indices[i])


############## train #############

[[87 89 88 87 29]
 [57 96 52 47 24]] [0] 89
[[55 55 72 33 43]
 [95 66 20  4 81]] [0] 91
[[96 60 69 36 60]
 [ 6 37  7 56 18]] [0] 18
[[ 2 55 87 21 84]
 [51 24 92 62 94]] [0] 82
[[99 39 67 41 81]
 [85  6 22  4 12]] [0] 5
[[19 16  8 57 30]
 [22 58  6 25  0]] [0] 84
[[67 70 99 55 50]
 [59 76 10  5 24]] [0] 56
[[30  8 36 85 54]
 [64 26 81 83  7]] [0] 2
[[34 33 12 98  6]
 [26 23 31 59 35]] [0] 22
[[12 83 37 50 81]
 [92 30 48 84 11]] [0] 69
[[55 44 37  8 95]
 [64 23 87  6 53]] [0] 12
[[10 72 21 12 72]
 [ 6 22 15 56 43]] [0] 29
[[86 45 58 50 44]
 [13 90 61 38 98]] [0] 27
[[64 91  7  3 68]
 [30 22 77 25 25]] [0] 47
[[58 63 56 85  5]
 [92 87 18 70 46]] [0] 1
[[18 55 93 70 88]
 [57 58 80 65 70]] [0] 39
[[27 76 38 19 37]
 [13 60 93 88 79]] [0] 71
[[42 63 80 43  8]
 [34 41 97 74 50]] [0] 8
[[27 63 74 76 76]
 [16 43 73 21  5]] [0] 13
[[50 20 89 30 91]
 [68 47 57 70 91]] [0] 55
[[76 50 75  1  2]
 [45 68 52 33 53]] [0] 10
[[15 94 60 81 75]
 [83 13 25 86  8]] [0] 28

## cross_validation_split

#### Parameters for cross_validation_split function
fold : int, number of folds being applied to the data, default is 5

In [48]:
# applying cross validation split with 4 folds
data = splitter.cross_validation_split(fold = 4)

In [49]:
# list every original sample with its label and position, for comparison with the folds
for i, (sample, label) in enumerate(zip(X, y)):
    print(sample, label, i)

[[93 32 12 97 28]
 [88 99 36 28 52]] 1 0
[[58 63 56 85  5]
 [92 87 18 70 46]] 0 1
[[30  8 36 85 54]
 [64 26 81 83  7]] 0 2
[[ 5 13 53 65 72]
 [50  3 71 70 19]] 1 3
[[12 67 90 75 46]
 [51 40 98 39  6]] 0 4
[[99 39 67 41 81]
 [85  6 22  4 12]] 0 5
[[99 33 38 36 51]
 [96 51 88 67  4]] 1 6
[[83 42 45 43 34]
 [73 96 89  7 53]] 1 7
[[42 63 80 43  8]
 [34 41 97 74 50]] 0 8
[[17 98 39 23 41]
 [55 78 48 77 75]] 0 9
[[76 50 75  1  2]
 [45 68 52 33 53]] 0 10
[[81 54 81 43 69]
 [45  8 88 32 51]] 1 11
[[55 44 37  8 95]
 [64 23 87  6 53]] 0 12
[[27 63 74 76 76]
 [16 43 73 21  5]] 0 13
[[70 55 90 30 89]
 [32 51 90 14 73]] 1 14
[[68 83 44 31 67]
 [51 50 32  2 43]] 0 15
[[53 90 86  7 69]
 [ 0 34 75 41 78]] 1 16
[[72 79 49 35 57]
 [32 65  5 23 69]] 1 17
[[96 60 69 36 60]
 [ 6 37  7 56 18]] 0 18
[[72 29 93 87 32]
 [36 63 71 92 61]] 1 19
[[ 2 29 85 67 14]
 [97 39 15  4 49]] 0 20
[[98 45 96 28 37]
 [78 63 27 18 51]] 0 21
[[34 33 12 98  6]
 [26 23 31 59 35]] 0 22
[[18 90 67  3  9]
 [75 46 76 49 88]] 0 23
[[

#### Checking output of split function

In [50]:
# top-level entries of the cross validation result, one per split
data.keys()

dict_keys(['train', 'validation', 'test'])

In [51]:
# entries stored for the train split
data['train'].keys()

dict_keys(['data', 'label', 'index'])

In [52]:
# with cross validation the train entries are dictionaries keyed by fold name
data['train']['index'].keys()

dict_keys(['fold_1', 'fold_2', 'fold_3', 'fold_4'])

In [53]:
# the test split is not divided into folds: a single array of original sample positions
data['test']['index']

array([96, 98, 37, 20, 85, 87, 93, 88, 97])

In [54]:
# original sample positions used for training in the third fold
data['train']['index']['fold_3']

array([25, 65,  4, 49, 15, 23, 76, 24, 45, 89, 91, 18, 82,  5, 84, 56,  2,
       22, 69, 12, 29, 27,  9, 38, 86, 68, 79, 67, 43, 51, 61, 21, 58, 53,
       78, 14, 59, 46, 63,  0, 52, 34, 95, 66, 64, 81, 40, 41, 62,  3, 60,
       90, 33,  6, 19, 31,  7, 54, 70, 77, 80, 72, 74, 32, 92, 11, 50, 16,
       48])

In [55]:
# print each split's samples with label and original index;
# every split except 'test' is stored per fold, so those are walked fold by fold
for s, part in data.items():
    print('\n############## %s #############\n'%s)
    samples, labels, indices = part['data'], part['label'], part['index']
    if s != 'test':
        for k in samples.keys():
            print('****************  %s  ***************'%k)
            for i in range(samples[k].shape[0]):
                print(samples[k][i], labels[k][i], indices[k][i])
    else:
        for i in range(samples.shape[0]):
            print(samples[i], labels[i], indices[i])


############## train #############

****************  fold_1  ***************
[[96 60 69 36 60]
 [ 6 37  7 56 18]] [0] 18
[[ 2 55 87 21 84]
 [51 24 92 62 94]] [0] 82
[[99 39 67 41 81]
 [85  6 22  4 12]] [0] 5
[[19 16  8 57 30]
 [22 58  6 25  0]] [0] 84
[[67 70 99 55 50]
 [59 76 10  5 24]] [0] 56
[[30  8 36 85 54]
 [64 26 81 83  7]] [0] 2
[[34 33 12 98  6]
 [26 23 31 59 35]] [0] 22
[[12 83 37 50 81]
 [92 30 48 84 11]] [0] 69
[[55 44 37  8 95]
 [64 23 87  6 53]] [0] 12
[[10 72 21 12 72]
 [ 6 22 15 56 43]] [0] 29
[[86 45 58 50 44]
 [13 90 61 38 98]] [0] 27
[[64 91  7  3 68]
 [30 22 77 25 25]] [0] 47
[[58 63 56 85  5]
 [92 87 18 70 46]] [0] 1
[[18 55 93 70 88]
 [57 58 80 65 70]] [0] 39
[[27 76 38 19 37]
 [13 60 93 88 79]] [0] 71
[[42 63 80 43  8]
 [34 41 97 74 50]] [0] 8
[[27 63 74 76 76]
 [16 43 73 21  5]] [0] 13
[[50 20 89 30 91]
 [68 47 57 70 91]] [0] 55
[[76 50 75  1  2]
 [45 68 52 33 53]] [0] 10
[[15 94 60 81 75]
 [83 13 25 86  8]] [0] 28
[[37 37 34 76 65]
 [44 55 51 72 67]] [0] 42
[