# tf.data

<b> 기존의 placeholder & feed_dict 방법보다 높은 성능 (속도 측면)  
<b> 간단한 batch, epoch, shuffle, map 구현  
<b> tf.estimator 와 같이 사용하기에 적합  


### 긍정/부정 Make data (긍정: 1, 부정: 0)

In [2]:
samples = ['너 오늘 이뻐 보인다',
          '나는 오늘 기분이 더러워',
          '끝내주는데, 좋은 일이 있나봐',
          '나 오늘 좋은 일이 생겼어',
          '아 진짜 짜증나',
          '오, 이거 진짜 좋은 것 같은데']

targets = [[1], [0], [1], [1], [0], [1]]

### load need modules

In [3]:
import os
import tensorflow as tf
import numpy as np

from tensorflow.keras import preprocessing

### Data Preprocessing

In [4]:
tokenizer = preprocessing.text.Tokenizer() 
tokenizer.fit_on_texts(samples) 
sequences = tokenizer.texts_to_sequences(samples)
sequences = preprocessing.sequence.pad_sequences(sequences, maxlen = 6, padding='post')

targets = np.array(targets)

print("index text data : \n", sequences)
print("shape of sequences:", sequences.shape)

index text data : 
 [[ 5  1  6  7  0  0]
 [ 8  1  9 10  0  0]
 [11  2  3 12  0  0]
 [13  1  2  3 14  0]
 [15  4 16  0  0  0]
 [17 18  4  2 19 20]]
shape of sequences: (6, 6)


In [5]:
word_index = tokenizer.word_index

print("index of each word : \n", word_index)

index of each word : 
 {'오늘': 1, '좋은': 2, '일이': 3, '진짜': 4, '너': 5, '이뻐': 6, '보인다': 7, '나는': 8, '기분이': 9, '더러워': 10, '끝내주는데': 11, '있나봐': 12, '나': 13, '생겼어': 14, '아': 15, '짜증나': 16, '오': 17, '이거': 18, '것': 19, '같은데': 20}


In [6]:
print("targets: \n", targets)
print("shape of targets:", targets.shape) 

targets: 
 [[1]
 [0]
 [1]
 [1]
 [0]
 [1]]
shape of targets: (6, 1)


###  tf.data data processing


In [16]:
dataset = tf.data.Dataset.from_tensor_slices((sequences, targets))
iterator = dataset.make_one_shot_iterator()
next_data = iterator.get_next()

### Session run

In [28]:
with tf.Session() as sess:
    while True:
        try:
            seq, lab = next_data
            print(sess.run([seq, lab]))
        except:
            break

[array([5, 1, 6, 7, 0, 0], dtype=int32), array([1])]
[array([ 8,  1,  9, 10,  0,  0], dtype=int32), array([0])]
[array([11,  2,  3, 12,  0,  0], dtype=int32), array([1])]
[array([13,  1,  2,  3, 14,  0], dtype=int32), array([1])]
[array([15,  4, 16,  0,  0,  0], dtype=int32), array([0])]
[array([17, 18,  4,  2, 19, 20], dtype=int32), array([1])]


### shuffle data

In [32]:
dataset = tf.data.Dataset.from_tensor_slices((sequences, targets))
dataset = dataset.shuffle(len(sequences))
iterator = dataset.make_one_shot_iterator()
next_data = iterator.get_next()

In [33]:
with tf.Session() as sess:
    while True:
        try:
            seq, lab = next_data
            print(sess.run([seq, lab]))
        except:
            break

[array([5, 1, 6, 7, 0, 0], dtype=int32), array([1])]
[array([13,  1,  2,  3, 14,  0], dtype=int32), array([1])]
[array([17, 18,  4,  2, 19, 20], dtype=int32), array([1])]
[array([11,  2,  3, 12,  0,  0], dtype=int32), array([1])]
[array([15,  4, 16,  0,  0,  0], dtype=int32), array([0])]
[array([ 8,  1,  9, 10,  0,  0], dtype=int32), array([0])]


### make batch

In [36]:
BATCH_SIZE = 2

dataset = tf.data.Dataset.from_tensor_slices((sequences, targets))
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.shuffle(len(sequences))
iterator = dataset.make_one_shot_iterator()
next_data = iterator.get_next()

In [37]:
with tf.Session() as sess:
    while True:
        try:
            print(sess.run(next_data))
        except tf.errors.OutOfRangeError:
            break

(array([[15,  4, 16,  0,  0,  0],
       [17, 18,  4,  2, 19, 20]], dtype=int32), array([[0],
       [1]]))
(array([[11,  2,  3, 12,  0,  0],
       [13,  1,  2,  3, 14,  0]], dtype=int32), array([[1],
       [1]]))
(array([[ 5,  1,  6,  7,  0,  0],
       [ 8,  1,  9, 10,  0,  0]], dtype=int32), array([[1],
       [0]]))


### 2 epoch

In [38]:
BATCH_SIZE = 2
EPOCH_SIZE = 2

dataset = tf.data.Dataset.from_tensor_slices((sequences, targets))
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.shuffle(len(sequences))
iterator = dataset.make_one_shot_iterator()
next_data = iterator.get_next()

In [41]:
with tf.Session() as sess:
    while True:
        try:
            print(sess.run(next_data))
        except:
            break

(array([[ 5,  1,  6,  7,  0,  0],
       [ 8,  1,  9, 10,  0,  0]], dtype=int32), array([[1],
       [0]]))
(array([[15,  4, 16,  0,  0,  0],
       [17, 18,  4,  2, 19, 20]], dtype=int32), array([[0],
       [1]]))
(array([[11,  2,  3, 12,  0,  0],
       [13,  1,  2,  3, 14,  0]], dtype=int32), array([[1],
       [1]]))


### use map 

In [0]:
def map_fn(X, Y=None):
    inputs = {'x': X}
    label = Y
    return inputs, label

dataset = tf.data.Dataset.from_tensor_slices((sequences, targets))
dataset = dataset.map(map_fn)
iterator = dataset.make_one_shot_iterator()
next_data = iterator.get_next()

In [43]:
with tf.Session() as sess:
    while True:
        try:
            print(sess.run(next_data))
        except:
            break

(array([[15,  4, 16,  0,  0,  0],
       [17, 18,  4,  2, 19, 20]], dtype=int32), array([[0],
       [1]]))
(array([[ 5,  1,  6,  7,  0,  0],
       [ 8,  1,  9, 10,  0,  0]], dtype=int32), array([[1],
       [0]]))
(array([[11,  2,  3, 12,  0,  0],
       [13,  1,  2,  3, 14,  0]], dtype=int32), array([[1],
       [1]]))


### use map  two variable

In [47]:
def map_fn(X1, X2, Y=None):
    inputs = {'x1': X1, 'x2': X2}
    label = Y
    return inputs, label


dataset = tf.data.Dataset.from_tensor_slices((sequences, sequences, targets))
dataset = dataset.map(map_fn)
iterator = dataset.make_one_shot_iterator()
next_data = iterator.get_next()

In [48]:
with tf.Session() as sess:
    while True:
        try:
            print(sess.run(next_data))
        except:
            break

({'x1': array([5, 1, 6, 7, 0, 0], dtype=int32), 'x2': array([5, 1, 6, 7, 0, 0], dtype=int32)}, array([1]))
({'x1': array([ 8,  1,  9, 10,  0,  0], dtype=int32), 'x2': array([ 8,  1,  9, 10,  0,  0], dtype=int32)}, array([0]))
({'x1': array([11,  2,  3, 12,  0,  0], dtype=int32), 'x2': array([11,  2,  3, 12,  0,  0], dtype=int32)}, array([1]))
({'x1': array([13,  1,  2,  3, 14,  0], dtype=int32), 'x2': array([13,  1,  2,  3, 14,  0], dtype=int32)}, array([1]))
({'x1': array([15,  4, 16,  0,  0,  0], dtype=int32), 'x2': array([15,  4, 16,  0,  0,  0], dtype=int32)}, array([0]))
({'x1': array([17, 18,  4,  2, 19, 20], dtype=int32), 'x2': array([17, 18,  4,  2, 19, 20], dtype=int32)}, array([1]))


### Create All Feature (batch, shuffle, repeat, map)

In [49]:
BATCH_SIZE = 2
EPOCH_SIZE = 2

#############################################################
# 지금까지 배운 API들을 사용해 본다. 
# 1. 주어진 데이터를 tf.data에 적용시킨다.
# 2. map 사용한다.
# 3. 배치 크기 만큼 가져온다.
# 4. 순서를 섞어준다.
# 5. 전체 데이터를 EPOCH SIZE 만큼 사용한다.
# 6. 데이터를 하나씩 사용한다.
# 7. iterator의 get_next를 통해 하나씩 가져오는 구조를 만든다.
# 8. 세션을 구성해서 실행한다.
#############################################################

In [52]:
# ANSWER
BATCH_SIZE = 2
EPOCH_SIZE = 2

def map_fn(X, Y=None):
    inputs = {'x': X}
    label = Y
    return inputs, label

dataset = tf.data.Dataset.from_tensor_slices((sequences, targets))
dataset = dataset.map(map_fn)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.shuffle(len(sequences))
dataset = dataset.repeat(EPOCH_SIZE)
iterator = dataset.make_one_shot_iterator()
next_data = iterator.get_next()

In [53]:
# ANSWER
with tf.Session() as sess:
    while True:
        try:
            print(sess.run(next_data))
        except:
            break

({'x': array([[ 5,  1,  6,  7,  0,  0],
       [ 8,  1,  9, 10,  0,  0]], dtype=int32)}, array([[1],
       [0]]))
({'x': array([[15,  4, 16,  0,  0,  0],
       [17, 18,  4,  2, 19, 20]], dtype=int32)}, array([[0],
       [1]]))
({'x': array([[11,  2,  3, 12,  0,  0],
       [13,  1,  2,  3, 14,  0]], dtype=int32)}, array([[1],
       [1]]))
({'x': array([[ 5,  1,  6,  7,  0,  0],
       [ 8,  1,  9, 10,  0,  0]], dtype=int32)}, array([[1],
       [0]]))
({'x': array([[11,  2,  3, 12,  0,  0],
       [13,  1,  2,  3, 14,  0]], dtype=int32)}, array([[1],
       [1]]))
({'x': array([[15,  4, 16,  0,  0,  0],
       [17, 18,  4,  2, 19, 20]], dtype=int32)}, array([[0],
       [1]]))
