In [1]:
from msdlib import msd
import numpy as np
import pandas as pd

In [2]:
# Fix NumPy's global random seed so the random data and labels generated below are reproducible
np.random.seed(1216)

### Creating a data set to be split into train, validation and test sets

In [13]:
# 100 rows of random integers in [0, 100) for two features, indexed by
# timestamps spaced two minutes apart starting at 2020-03-12 01:00:00
X = pd.DataFrame(
    np.random.randint(100, size=(100, 2)),
    index=[
        pd.Timestamp('20200312010000') + pd.Timedelta(minutes=2 * step)
        for step in range(100)
    ],
    columns=['x', 'y'],
)

In [14]:
# Confirm the dimensions of the generated frame as (rows, columns)
X.shape

(100, 2)

### Creating labels for the data set

In [15]:
# One random binary label (0 or 1) per row of X, sharing X's timestamp index
y = pd.Series(
    data=np.random.randint(2, size=len(X.index)),
    index=X.index,
)
y

2020-03-12 01:00:00    0
2020-03-12 01:02:00    1
2020-03-12 01:04:00    1
2020-03-12 01:06:00    1
2020-03-12 01:08:00    0
2020-03-12 01:10:00    0
2020-03-12 01:12:00    1
2020-03-12 01:14:00    0
2020-03-12 01:16:00    0
2020-03-12 01:18:00    1
2020-03-12 01:20:00    0
2020-03-12 01:22:00    1
2020-03-12 01:24:00    0
2020-03-12 01:26:00    1
2020-03-12 01:28:00    1
2020-03-12 01:30:00    1
2020-03-12 01:32:00    0
2020-03-12 01:34:00    0
2020-03-12 01:36:00    1
2020-03-12 01:38:00    0
2020-03-12 01:40:00    1
2020-03-12 01:42:00    1
2020-03-12 01:44:00    1
2020-03-12 01:46:00    0
2020-03-12 01:48:00    0
2020-03-12 01:50:00    1
2020-03-12 01:52:00    0
2020-03-12 01:54:00    1
2020-03-12 01:56:00    1
2020-03-12 01:58:00    1
                      ..
2020-03-12 03:20:00    1
2020-03-12 03:22:00    0
2020-03-12 03:24:00    1
2020-03-12 03:26:00    1
2020-03-12 03:28:00    0
2020-03-12 03:30:00    0
2020-03-12 03:32:00    0
2020-03-12 03:34:00    0
2020-03-12 03:36:00    1


### Splitting data set

Here the data set will be split into 3 sections, commonly known as train, validation and test. The test section is optional.

The idea is that the sections follow temporal order: the train data comes first, followed by an optional gap whose size is set by the parameter 'dist', then the validation data, then another gap of the same size, and finally the test data.

#### Initialization parameters for SplitDataset class

data: pandas dataframe or series or numpy array of dimension #samples X other dimensions with rank 2 or higher

label: pandas dataframe or series or numpy array with dimension #samples X #classes with rank 2

index : numpy array, we can explicitly insert indices of the data set and labels

test_ratio : float, ratio of test data

np_seed : int, numpy seed used for random shuffle

same_ratio : bool, should only be True if the labels are classification labels and we want to keep the test and validation ratio the same for all classes (it doesn't have any use in sequence split)

make_onehot : bool, if True, the labels will be converted into one hot encoded format

In [16]:
# Build the splitter from the features X and the labels y, with a test ratio of 0.1
splitter = msd.SplitDataset(
    X,
    y,
    test_ratio=0.1,
)

#### Parameters for sequence_split function
seq_len : int, length of each sequence

val_ratio : float, validation data set ratio, default is .15

data_stride : int, the number of samples to shift between two adjacent examples when preparing the data set, default is 1

label_shift : temporal difference between the label and the corresponding data's last time step, default is 1

split_method : {'train_val', 'multiple_train_val'}, default is 'multiple_train_val'

sec : int or pandas dataframe with columns named 'start' and 'stop', number of sections for the multiple_train_val splitting method (train_val splitting is basically the multiple_train_val method with sec = 1), default is 1

dist : int, number of data points to leave out as a gap between two adjacent sections, default is 0

In [7]:
# Run the sequence split: windows of 10 steps, taken every 2 rows, with the
# parameters described above; the result is kept in `data`
data = splitter.sequence_split(
    seq_len=10,
    val_ratio=0.2,
    data_stride=2,
    label_shift=3,
    split_method='train_val',
    sec=1,
    dist=1,
)

### Checking output of split function

In [17]:
# Top-level keys of the split result: one entry per split
data.keys()

dict_keys(['train', 'validation', 'test'])

In [24]:
# Each split is itself a dict; list the fields stored for the train split
data['train'].keys()

dict_keys(['data', 'label', 'index'])

In [25]:
# Index values attached to the validation examples
data['validation']['index']

array([Timestamp('2020-03-12 03:18:00'), Timestamp('2020-03-12 03:22:00'),
       Timestamp('2020-03-12 03:26:00'), Timestamp('2020-03-12 03:30:00'),
       Timestamp('2020-03-12 03:34:00'), Timestamp('2020-03-12 03:38:00'),
       Timestamp('2020-03-12 03:42:00')], dtype=object)

In [20]:
# Put the features and the label side by side so the split output is easy to cross-check.
# NOTE(review): this adds a 'label' column to X in place, after X was already passed to
# SplitDataset above; confirm the splitter does not keep a live reference to X.
X['label'] = y
# Show every row as [timestamp, x, y, label]
X.reset_index().values

array([[Timestamp('2020-03-12 01:00:00'), 81, 54, 0],
       [Timestamp('2020-03-12 01:02:00'), 81, 43, 1],
       [Timestamp('2020-03-12 01:04:00'), 69, 45, 1],
       [Timestamp('2020-03-12 01:06:00'), 8, 88, 1],
       [Timestamp('2020-03-12 01:08:00'), 32, 51, 0],
       [Timestamp('2020-03-12 01:10:00'), 55, 44, 0],
       [Timestamp('2020-03-12 01:12:00'), 37, 8, 1],
       [Timestamp('2020-03-12 01:14:00'), 95, 64, 0],
       [Timestamp('2020-03-12 01:16:00'), 23, 87, 0],
       [Timestamp('2020-03-12 01:18:00'), 6, 53, 1],
       [Timestamp('2020-03-12 01:20:00'), 27, 63, 0],
       [Timestamp('2020-03-12 01:22:00'), 74, 76, 1],
       [Timestamp('2020-03-12 01:24:00'), 76, 16, 0],
       [Timestamp('2020-03-12 01:26:00'), 43, 73, 1],
       [Timestamp('2020-03-12 01:28:00'), 21, 5, 1],
       [Timestamp('2020-03-12 01:30:00'), 70, 55, 1],
       [Timestamp('2020-03-12 01:32:00'), 90, 30, 0],
       [Timestamp('2020-03-12 01:34:00'), 89, 32, 0],
       [Timestamp('2020-03-12 01

In [23]:
# Print every example of each split as: data window, label, index value.
# `sees` (the split names to show) was not defined anywhere in the notebook, so
# this cell failed with NameError on a fresh kernel; derive it from the split
# result instead.
sees = list(data.keys())
for see in sees:
    print('\n############## %s #############\n'%see)
    section = data[see]  # look the split up once instead of on every access
    for i in range(section['data'].shape[0]):
        print(section['data'][i], section['label'][i], section['index'][i])


############## train #############

[[12 97]
 [28 88]
 [99 36]
 [28 52]
 [58 63]
 [56 85]
 [ 5 92]
 [87 18]
 [70 46]
 [30  8]] [1] 2020-03-12 01:20:00
[[99 36]
 [28 52]
 [58 63]
 [56 85]
 [ 5 92]
 [87 18]
 [70 46]
 [30  8]
 [36 85]
 [54 64]] [0] 2020-03-12 01:24:00
[[58 63]
 [56 85]
 [ 5 92]
 [87 18]
 [70 46]
 [30  8]
 [36 85]
 [54 64]
 [26 81]
 [83  7]] [0] 2020-03-12 01:28:00
[[ 5 92]
 [87 18]
 [70 46]
 [30  8]
 [36 85]
 [54 64]
 [26 81]
 [83  7]
 [ 5 13]
 [53 65]] [0] 2020-03-12 01:32:00
[[70 46]
 [30  8]
 [36 85]
 [54 64]
 [26 81]
 [83  7]
 [ 5 13]
 [53 65]
 [72 50]
 [ 3 71]] [0] 2020-03-12 01:36:00
[[36 85]
 [54 64]
 [26 81]
 [83  7]
 [ 5 13]
 [53 65]
 [72 50]
 [ 3 71]
 [70 19]
 [12 67]] [1] 2020-03-12 01:40:00
[[26 81]
 [83  7]
 [ 5 13]
 [53 65]
 [72 50]
 [ 3 71]
 [70 19]
 [12 67]
 [90 75]
 [46 51]] [1] 2020-03-12 01:44:00
[[ 5 13]
 [53 65]
 [72 50]
 [ 3 71]
 [70 19]
 [12 67]
 [90 75]
 [46 51]
 [40 98]
 [39  6]] [1] 2020-03-12 01:48:00
[[72 50]
 [ 3 71]
 [70 19]
 [12 67]
 [90 75