# Splitting Datasets
In supervised training, you will want to split your training set into training and evaluation samples, and only use the test set for the final model test.
To achieve that, we zip the dataset with an index range, filter on an index threshold, and then map the result back to drop the index again.

In [None]:
import tensorflow as tf
import numpy as np

In [None]:
# When this cell is re-run, a session from the previous run may still be
# open. Probe for the name first; on the very first run it does not exist.
try:
    sess
except NameError:
    print("Don't worry. Need to ignore this error once")
else:
    sess.close()
sess = tf.InteractiveSession()

### Scenario
We have 20 samples with their labels and want to hold out 20% of them for evaluation, keeping the remaining 80% for training.

In [None]:
# Toy data set: N samples, of which the fraction `ratio` is split off.
N = 20
ratio = 0.2

# Running sample index 0 .. N-1.
idx = np.arange(N)

# Each "image" is the pair [i, i mod 10], i.e. [0,0], [1,1], ..., [19,9].
tr_img = np.column_stack((idx, idx % 10))

# One label per sample.
tr_lbl = np.array([
    0, 3, 2, 0, 2, 1, 2, 3, 0, 3,
    0, 2, 1, 3, 1, 0, 3, 2, 3, 3,
])

### Creating the indexed datasets from the numpy arrays

In [None]:
# Wrap the numpy arrays as constants and slice them into per-sample datasets.
tr_img_tensor = tf.constant(tr_img)
tr_lbl_tensor = tf.constant(tr_lbl)
idx_tensor = tf.constant(idx)

tr_img_ds = tf.data.Dataset.from_tensor_slices(tr_img_tensor)
tr_lbl_ds = tf.data.Dataset.from_tensor_slices(tr_lbl_tensor)
idx_ds = tf.data.Dataset.from_tensor_slices(idx_tensor)

# Shuffle once, with a fixed seed and reshuffle_each_iteration=False.
# The evaluation and training subsets are later read through separate
# iterators (and the training subset is repeated). If every pass drew a fresh
# permutation, the index threshold would pick different samples each time and
# the two subsets would overlap instead of forming a partition.
tr_ds = tf.data.Dataset.zip((tr_img_ds, tr_lbl_ds)).shuffle(
    buffer_size=N, seed=42, reshuffle_each_iteration=False)

# Attach the running index to the (shuffled) samples: ((image, label), index).
tr_ds_i = tf.data.Dataset.zip((tr_ds, idx_ds))

### Now we filter by the index - that's equivalent to splitting the `Dataset`

In [None]:
# Elements are (sample, index) pairs. Indices below int(N * ratio) form the
# evaluation part, all remaining indices form the training part.
ds_ev = tr_ds_i.filter(
    lambda sample, index: tf.less(index, int(N * ratio)))
ds_tr = tr_ds_i.filter(
    lambda sample, index: tf.greater_equal(index, int(N * ratio)))

In [None]:
# Evaluation: drop the index again and read in batches of 2.
it_ev = (
    ds_ev
    .map(lambda sample, _index: sample)
    .batch(2)
    .make_one_shot_iterator()
)

# Training: drop the index, repeat the data 3 times, read in batches of 8.
it_tr = (
    ds_tr
    .map(lambda sample, _index: sample)
    .repeat(3)
    .batch(8)
    .make_one_shot_iterator()
)

### 20% of 20 samples means 4. With batch size 2 that's *two* batches, before we run out of data 

In [None]:
# Fetch the next batch from the evaluation iterator and show it as the cell
# output; re-run the cell to step through the batches.
sess.run(it_ev.get_next())

### The other 80% are repeated over three epochs with batch size 8. That's six batches to expect.

In [None]:
# Fetch the next batch from the training iterator and show it as the cell
# output; re-run the cell to step through the batches.
sess.run(it_tr.get_next())

### Let's implement that in a single utility method

In [None]:
def split(ds, N, ratio):
    """Split ``ds`` into two datasets at position ``int(N * ratio)``.

    Every element of ``ds`` is paired with a running index ``0 .. N-1``.
    Elements whose index lies below the cut go into the first dataset, all
    others into the second. The index is removed again before returning.

    Args:
        ds: The ``tf.data.Dataset`` to split.
        N: Length of the index range zipped with ``ds`` (presumably the
            number of elements in ``ds`` -- confirm against the caller).
        ratio: Fraction of ``N`` that determines the cut position.

    Returns:
        A tuple ``(ds1, ds2)``: the part before the cut and the part from the
        cut onwards.
    """
    cut = int(N * ratio)

    positions = tf.data.Dataset.from_tensor_slices(
        tf.constant(np.array(range(N))))
    indexed = tf.data.Dataset.zip((ds, positions))

    def drop_index(element, _position):
        # Keep the original element, discard the helper index.
        return element

    head = indexed.filter(
        lambda _element, position: position < cut).map(drop_index)
    tail = indexed.filter(
        lambda _element, position: position >= cut).map(drop_index)
    return (head, tail)

In [None]:
# Split the image dataset 40% / 60%. Each part is read as one batch of up to
# 20 elements, and both batches are evaluated in a single session call.
ds = tf.data.Dataset.from_tensor_slices(tr_img_tensor)
ds1, ds2 = sess.run([
    part.batch(20).make_one_shot_iterator().get_next()
    for part in split(ds, 20, 0.4)
])

### The dataset is split into 40% / 60% parts!

In [None]:
# Show both parts. After the sess.run call in the previous cell these names
# hold the evaluated batches rather than datasets.
ds1, ds2