## 15. tf data api 정리2

- https://teddylee777.github.io/tensorflow/dataset-batch-window

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import datetime
import warnings; warnings.filterwarnings('ignore')
plt.style.use('ggplot')
%matplotlib inline

# Options for pandas
pd.options.display.max_columns = 150

In [2]:
import tensorflow as tf
import tensorflow.keras as keras # 케라스 쓸때 반드시 이것 추가... 주의https://stackoverflow.com/questions/72409779/modulenotfounderror-no-module-named-tensorflow-keras-i-tried-almost-everyth

- tf.data는 데이터 입력 파이프 라인 빌드를 위한 텐서플로우의 서브패키지, 혹은 다른 말로 API이다. 로컬 파일이나 메모리에 올려져 있는 데이터를 모델에 집어넣기 적합한 텐서로 변환하는 작업을 한다.    
- 하위 tf.data.Dataset은 tf.data의 추상 클래스로서 데이터의 병렬 처리가 용이한 형태, 즉 GPU가 연산이 끝나면 다음 데이터를 바로바로 가져다가(Pre-Fetch) 빠르게 처리할 수 있도록 고안되었다.  
- Numpy나 Pandas, 혹은 Tensorflow 2.0부터는 아예 통합된 Keras 등 모델에 집어넣기 위한 데이터 전처리(Pre-Processing) 용도로 자주 쓰이고 그만큼 유명한 여러 모듈들이 있다. 하지만 텐서플로우에서 제공하는 tf.data.Dataset 클래스가 성능적으로는 가장 최적화되어 있다고 볼 수 있다. 원래는 텐서플로우 코어에서 제공하는 클래스가 아니라 Contributor가 제공한 것(tf.contrib.data)이었지만, 1.4부터는 코어 API(tf.data)로 공식 제공되고 있다.

### Dimension 1만큼 늘리기

1) tf.expand_dims

In [3]:
x = np.arange(20)

In [4]:
x

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [5]:
tf.expand_dims(x, 1)

<tf.Tensor: shape=(20, 1), dtype=int32, numpy=
array([[ 0],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [11],
       [12],
       [13],
       [14],
       [15],
       [16],
       [17],
       [18],
       [19]])>

In [6]:
tf.expand_dims(x, 1).shape

TensorShape([20, 1])

2) np.expand_dims 

In [7]:
np.expand_dims(x, 1).shape

(20, 1)

In [8]:
x.reshape(-1, 1).shape

(20, 1)

3) from_tensor_slices로 차원 늘리기: list, np.array를 tensor데이터셋으로 변환하고 차원을 늘려줌(쪼개줌)

In [9]:
ds = tf.data.Dataset.from_tensor_slices(x)

In [10]:
for i in ds:
    print(i)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(11, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(13, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(15, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(17, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(19, shape=(), dtype=int32)


### 2. batch

모델을 학습시킬 때 batch size만큼씩 데이터를 읽어 들여 학습할 때 유용,
큰 데이터는 메모리에 한번에 올라가지 못하기 때문에 배치로 나누어 학습하기도 함

In [11]:
ds = tf.data.Dataset.range(10)

In [12]:
for i in ds:
    print(i)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)


In [13]:
ds

<RangeDataset shapes: (), types: tf.int64>

In [14]:
ds = ds.batch(3, drop_remainder=True)

In [15]:
ds

<BatchDataset shapes: (3,), types: tf.int64>

In [16]:
for i in ds:
    print(i)

tf.Tensor([0 1 2], shape=(3,), dtype=int64)
tf.Tensor([3 4 5], shape=(3,), dtype=int64)
tf.Tensor([6 7 8], shape=(3,), dtype=int64)


In [17]:
ds.as_numpy_iterator()

<tensorflow.python.data.ops.dataset_ops._NumpyIterator at 0x125820d1040>

In [18]:
list(ds.as_numpy_iterator())

[array([0, 1, 2], dtype=int64),
 array([3, 4, 5], dtype=int64),
 array([6, 7, 8], dtype=int64)]

### window : 시계열 데이터에 유용

- window : 그룹화 할 윈도우 크기  
- shift : iteration 당 몇개씩 이동할 지

In [19]:
ds = tf.data.Dataset.range(10)
ds = ds.window(5, shift=1, drop_remainder=False)

In [20]:
ds

<WindowDataset shapes: DatasetSpec(TensorSpec(shape=(), dtype=tf.int64, name=None), TensorShape([])), types: DatasetSpec(TensorSpec(shape=(), dtype=tf.int64, name=None), TensorShape([]))>

In [21]:
for i in ds:
    print(list(i.as_numpy_iterator()))

[0, 1, 2, 3, 4]
[1, 2, 3, 4, 5]
[2, 3, 4, 5, 6]
[3, 4, 5, 6, 7]
[4, 5, 6, 7, 8]
[5, 6, 7, 8, 9]
[6, 7, 8, 9]
[7, 8, 9]
[8, 9]
[9]


In [22]:
ds = tf.data.Dataset.range(10) 
ds = ds.window(5, shift=1, drop_remainder=True)
for d in ds:
    print(list(d.as_numpy_iterator()))

[0, 1, 2, 3, 4]
[1, 2, 3, 4, 5]
[2, 3, 4, 5, 6]
[3, 4, 5, 6, 7]
[4, 5, 6, 7, 8]
[5, 6, 7, 8, 9]


In [23]:
ds = tf.data.Dataset.range(10) 
ds = ds.window(5, shift=2, drop_remainder=True)
for d in ds:
    print(list(d.as_numpy_iterator()))

[0, 1, 2, 3, 4]
[2, 3, 4, 5, 6]
[4, 5, 6, 7, 8]


### flat_map

dataset에 함수를 apply 해주고 결과를 flatten하게 펼쳐준다.   
이 말은 쉽게 설명하면 원래 같은 경우 5 -> 4 -> 3 -> 처럼 iter 형식으로 받을 수 있었는데, flat_map을 사용하면 [5, 4, 3, 2, 1]로 바로 받을 수 있습니다.

In [24]:
ds = tf.data.Dataset.range(10)
ds = ds.window(5, shift=1, drop_remainder = True)
ds = ds.flat_map(lambda w: w.batch(3))

In [25]:
for d in ds:
    print(d)

tf.Tensor([0 1 2], shape=(3,), dtype=int64)
tf.Tensor([3 4], shape=(2,), dtype=int64)
tf.Tensor([1 2 3], shape=(3,), dtype=int64)
tf.Tensor([4 5], shape=(2,), dtype=int64)
tf.Tensor([2 3 4], shape=(3,), dtype=int64)
tf.Tensor([5 6], shape=(2,), dtype=int64)
tf.Tensor([3 4 5], shape=(3,), dtype=int64)
tf.Tensor([6 7], shape=(2,), dtype=int64)
tf.Tensor([4 5 6], shape=(3,), dtype=int64)
tf.Tensor([7 8], shape=(2,), dtype=int64)
tf.Tensor([5 6 7], shape=(3,), dtype=int64)
tf.Tensor([8 9], shape=(2,), dtype=int64)


In [26]:
ds = tf.data.Dataset.range(10) 
ds = ds.window(5, shift=1, drop_remainder=True)
ds = ds.flat_map(lambda w: w.batch(5))
for d in ds:
    print(d)

tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int64)
tf.Tensor([1 2 3 4 5], shape=(5,), dtype=int64)
tf.Tensor([2 3 4 5 6], shape=(5,), dtype=int64)
tf.Tensor([3 4 5 6 7], shape=(5,), dtype=int64)
tf.Tensor([4 5 6 7 8], shape=(5,), dtype=int64)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int64)


### shuffle

In [27]:
# Without shuffling: elements are yielded in their original order.
ds = tf.data.Dataset.from_tensor_slices(np.arange(10))  # no .shuffle() applied here

In [28]:
ds

<TensorSliceDataset shapes: (), types: tf.int32>

In [29]:
for d in ds:
    print(d)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [30]:
# With shuffling: elements are sampled at random from a buffer of 5 elements.
ds = tf.data.Dataset.from_tensor_slices(np.arange(10)).shuffle(buffer_size=5)
for d in ds:
    print(d)

tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)


buffer_size는 반드시 지정해야 함   
- 데이터셋은 buffer_size 개의 요소로 버퍼를 채운 다음, 이 버퍼에서 요소를 무작위로 샘플링하고, 선택된 요소의 자리를 새 요소로 바꾼다.   
- 완벽한 셔플링을 위해서는 데이터셋 전체 크기보다 크거나 같은 버퍼 크기가 필요  
- 데이터 집합에 10,000 개의 요소가 있지만 buffer_size가 1,000으로 설정된 경우 셔플은 처음에 버퍼의 처음 1,000 개 요소 중 임의의 요소 만 선택합니다.  
- 요소가 선택되면 버퍼의 공간이 다음 요소 (즉, 1,001-st)로 대체되어 1,000 요소 버퍼를 유지합니다.

### map

Dataset 전체에 함수를 맵핑

In [31]:
window_size = 5

ds = tf.data.Dataset.range(10)
ds = ds.window(window_size, shift=1, drop_remainder=True)
ds = ds.flat_map(lambda w : w.batch(window_size))
ds = ds.shuffle(10)

# Split each window into its first 4 values (inputs) and its last value (label).
# Indexing with x[-1] drops the axis, so the label is a scalar.
ds = ds.map(lambda x : (x[:-1], x[-1]))

for x, y in ds:
    print('train set: {}'.format(x))
    print('label set: {}'.format(y))

train set: [0 1 2 3]
label set: 4
train set: [3 4 5 6]
label set: 7
train set: [2 3 4 5]
label set: 6
train set: [1 2 3 4]
label set: 5
train set: [5 6 7 8]
label set: 9
train set: [4 5 6 7]
label set: 8


In [32]:
window_size = 5

ds = tf.data.Dataset.range(10)
ds = ds.window(window_size, shift=1, drop_remainder=True)
ds = ds.flat_map(lambda w : w.batch(window_size))
ds = ds.shuffle(10)

# Split each window into its first 4 values (inputs) and its last value (label).
# Slicing with x[-1:] keeps the axis, so the label is a length-1 tensor.
ds = ds.map(lambda x : (x[:-1], x[-1:]))

for x, y in ds:
    print('train set: {}'.format(x))
    print('label set: {}'.format(y))

train set: [1 2 3 4]
label set: [5]
train set: [2 3 4 5]
label set: [6]
train set: [3 4 5 6]
label set: [7]
train set: [4 5 6 7]
label set: [8]
train set: [5 6 7 8]
label set: [9]
train set: [0 1 2 3]
label set: [4]


- 차이 확인요: `x[-1]`로 자르면 레이블이 스칼라(`label set: 4`)로 나오고, `x[-1:]`로 자르면 길이 1짜리 벡터(`label set: [5]`)로 나온다.

### 실습 예제: Sunspots 데이터셋을 활용하여 window_dataset 만들기

In [33]:
import urllib
import csv

In [34]:
# `import urllib` alone does not guarantee that the `urllib.request` submodule
# is loaded, so import it explicitly before calling urlretrieve.
import urllib.request

# Download the Sunspots CSV into the working directory as 'sunspots.csv'.
url = 'https://storage.googleapis.com/download.tensorflow.org/data/Sunspots.csv'
urllib.request.urlretrieve(url, 'sunspots.csv')

('sunspots.csv', <http.client.HTTPMessage at 0x12582116940>)

In [35]:
# Preview the first six data rows of the CSV; the first row is skipped.
from itertools import islice

# newline='' is what the csv module documentation asks for when opening files.
with open('sunspots.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # skip the first (header) row
    for row in islice(reader, 6):
        print(row)

['0', '1749-01-31', '96.7']
['1', '1749-02-28', '104.3']
['2', '1749-03-31', '116.7']
['3', '1749-04-30', '92.8']
['4', '1749-05-31', '141.7']
['5', '1749-06-30', '139.2']


2번 인덱스를 timeseries 데이터로 만든다.

In [36]:
# Read column index 2 of every data row into a list of floats.
# newline='' is what the csv module documentation asks for when opening files.
with open('sunspots.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # skip the first (header) row
    # `if row` skips blank lines, which csv.reader yields as empty lists
    # and which would otherwise raise IndexError on row[2].
    train_data = [float(row[2]) for row in reader if row]

In [37]:
train_data

[96.7,
 104.3,
 116.7,
 92.8,
 141.7,
 139.2,
 158.0,
 110.5,
 126.5,
 125.8,
 264.3,
 142.0,
 122.2,
 126.5,
 148.7,
 147.2,
 150.0,
 166.7,
 142.3,
 171.7,
 152.0,
 109.5,
 105.5,
 125.7,
 116.7,
 72.5,
 75.5,
 94.0,
 101.2,
 84.5,
 110.5,
 99.7,
 39.2,
 38.7,
 47.5,
 73.3,
 58.3,
 83.3,
 118.3,
 98.8,
 99.5,
 66.0,
 130.7,
 48.8,
 45.2,
 77.7,
 62.7,
 66.7,
 73.3,
 53.3,
 76.2,
 63.3,
 60.0,
 52.8,
 36.7,
 65.0,
 46.7,
 41.7,
 33.3,
 11.2,
 0.0,
 5.0,
 2.8,
 22.8,
 34.5,
 44.5,
 31.3,
 20.5,
 13.7,
 40.2,
 22.0,
 7.0,
 17.0,
 18.7,
 11.3,
 10.8,
 0.0,
 0.0,
 14.3,
 5.3,
 29.7,
 39.5,
 11.3,
 33.3,
 20.8,
 11.8,
 9.0,
 15.7,
 20.8,
 21.5,
 6.0,
 10.7,
 19.7,
 23.8,
 28.3,
 15.7,
 23.5,
 35.3,
 43.7,
 50.0,
 63.5,
 21.3,
 41.7,
 85.5,
 66.2,
 54.2,
 107.8,
 55.8,
 62.7,
 86.7,
 81.7,
 120.5,
 77.3,
 75.0,
 73.3,
 64.5,
 104.2,
 62.8,
 71.7,
 71.7,
 80.5,
 73.3,
 78.0,
 78.3,
 81.7,
 83.3,
 85.0,
 118.8,
 128.7,
 99.5,
 77.2,
 95.0,
 112.2,
 99.2,
 124.5,
 97.2,
 120.0,
 80.5,
 110.0,


데이터를 list에 담았다.   
Dataset 모듈을 통해 window_dataset을 만든다.

In [38]:
# Convert the Python list into a NumPy array.
train_data = np.asarray(train_data)
train_data.shape

# Add a trailing axis of size 1; the shape shown below the cell is (3235, 1).
train_data = np.expand_dims(train_data, 1)
train_data.shape

(3235, 1)

In [39]:
train_data

array([[ 96.7],
       [104.3],
       [116.7],
       ...,
       [ 13.2],
       [ 15.9],
       [  1.6]])

In [40]:
dataset = tf.data.Dataset.from_tensor_slices(train_data)

In [41]:
i = 0
for data in dataset:
    print(data)
    i+=1
    if i > 5:
        break

tf.Tensor([96.7], shape=(1,), dtype=float64)
tf.Tensor([104.3], shape=(1,), dtype=float64)
tf.Tensor([116.7], shape=(1,), dtype=float64)
tf.Tensor([92.8], shape=(1,), dtype=float64)
tf.Tensor([141.7], shape=(1,), dtype=float64)
tf.Tensor([139.2], shape=(1,), dtype=float64)


과거 20개 시점(이 데이터는 월별 데이터)의 데이터를 보고 21번째 데이터를 예측한다면 윈도우 사이즈를 21로 설정함

In [42]:
window_size = 20 + 1
dataset = dataset.window(window_size, shift=1, drop_remainder=True)

In [43]:
for ds in dataset:
    print(ds)

<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>


<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>
<_VariantDataset shapes: (1,), types: tf.float64>


In [44]:
# window_size already includes the label step (20 + 1), so every window holds
# exactly window_size elements; batching by that size emits one tensor per window.
dataset = dataset.flat_map(lambda w : w.batch(window_size))

In [45]:
for data in dataset.take(2):
    print(data)

tf.Tensor(
[[ 96.7]
 [104.3]
 [116.7]
 [ 92.8]
 [141.7]
 [139.2]
 [158. ]
 [110.5]
 [126.5]
 [125.8]
 [264.3]
 [142. ]
 [122.2]
 [126.5]
 [148.7]
 [147.2]
 [150. ]
 [166.7]
 [142.3]
 [171.7]
 [152. ]], shape=(21, 1), dtype=float64)
tf.Tensor(
[[104.3]
 [116.7]
 [ 92.8]
 [141.7]
 [139.2]
 [158. ]
 [110.5]
 [126.5]
 [125.8]
 [264.3]
 [142. ]
 [122.2]
 [126.5]
 [148.7]
 [147.2]
 [150. ]
 [166.7]
 [142.3]
 [171.7]
 [152. ]
 [109.5]], shape=(21, 1), dtype=float64)


In [46]:
dataset = dataset.shuffle(500)

In [47]:
dataset = dataset.map(lambda x : (x[: -1], x[-1:]))

In [48]:
for train, label in dataset.take(2):
    print('train: {}'.format(train))
    print('label: {}'.format(label))

train: [[126.5]
 [125.8]
 [264.3]
 [142. ]
 [122.2]
 [126.5]
 [148.7]
 [147.2]
 [150. ]
 [166.7]
 [142.3]
 [171.7]
 [152. ]
 [109.5]
 [105.5]
 [125.7]
 [116.7]
 [ 72.5]
 [ 75.5]
 [ 94. ]]
label: [[101.2]]
train: [[85.5]
 [47.5]
 [29.2]
 [11. ]
 [13.2]
 [23.3]
 [29.5]
 [20.3]
 [ 7.3]
 [ 0. ]
 [19.3]
 [18.7]
 [ 6.5]
 [20.5]
 [ 1.7]
 [13.2]
 [ 5.3]
 [ 9.3]
 [25.2]
 [13.2]]
label: [[36.2]]


In [49]:
for d in dataset.batch(10).take(2):
    print(d)

(<tf.Tensor: shape=(10, 20, 1), dtype=float64, numpy=
array([[[ 53.7],
        [ 33.3],
        [ 30. ],
        [ 13.3],
        [ 25. ],
        [ 17.5],
        [ 21.7],
        [ 13.3],
        [ 18.3],
        [ 16.7],
        [ 10. ],
        [ 15. ],
        [ 10. ],
        [ 16.7],
        [ 16.7],
        [ 13.3],
        [ 28.3],
        [ 23.3],
        [ 10.8],
        [ 13.3]],

       [[153.7],
        [ 63.3],
        [ 95. ],
        [128.8],
        [ 93.7],
        [ 84.2],
        [131. ],
        [102.2],
        [106.7],
        [ 91. ],
        [ 48.3],
        [ 85.3],
        [ 54.8],
        [ 68.5],
        [ 47.3],
        [ 46.2],
        [ 21.2],
        [ 48.8],
        [ 43.8],
        [ 68.2]],

       [[ 71.2],
        [129.5],
        [129. ],
        [ 87.7],
        [111.3],
        [124.7],
        [129.7],
        [151. ],
        [186.3],
        [123.2],
        [107. ],
        [107.2],
        [161.2],
        [122.7],
        [157.3],
       

종합하여 함수형으로 만들기

In [50]:
# Read column index 2 of every data row into a list of floats.
# newline='' is what the csv module documentation asks for when opening files.
with open('sunspots.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # skip the first (header) row
    # `if row` skips blank lines, which csv.reader yields as empty lists
    # and which would otherwise raise IndexError on row[2].
    train_data = [float(row[2]) for row in reader if row]

In [51]:
train_data

[96.7,
 104.3,
 116.7,
 92.8,
 141.7,
 139.2,
 158.0,
 110.5,
 126.5,
 125.8,
 264.3,
 142.0,
 122.2,
 126.5,
 148.7,
 147.2,
 150.0,
 166.7,
 142.3,
 171.7,
 152.0,
 109.5,
 105.5,
 125.7,
 116.7,
 72.5,
 75.5,
 94.0,
 101.2,
 84.5,
 110.5,
 99.7,
 39.2,
 38.7,
 47.5,
 73.3,
 58.3,
 83.3,
 118.3,
 98.8,
 99.5,
 66.0,
 130.7,
 48.8,
 45.2,
 77.7,
 62.7,
 66.7,
 73.3,
 53.3,
 76.2,
 63.3,
 60.0,
 52.8,
 36.7,
 65.0,
 46.7,
 41.7,
 33.3,
 11.2,
 0.0,
 5.0,
 2.8,
 22.8,
 34.5,
 44.5,
 31.3,
 20.5,
 13.7,
 40.2,
 22.0,
 7.0,
 17.0,
 18.7,
 11.3,
 10.8,
 0.0,
 0.0,
 14.3,
 5.3,
 29.7,
 39.5,
 11.3,
 33.3,
 20.8,
 11.8,
 9.0,
 15.7,
 20.8,
 21.5,
 6.0,
 10.7,
 19.7,
 23.8,
 28.3,
 15.7,
 23.5,
 35.3,
 43.7,
 50.0,
 63.5,
 21.3,
 41.7,
 85.5,
 66.2,
 54.2,
 107.8,
 55.8,
 62.7,
 86.7,
 81.7,
 120.5,
 77.3,
 75.0,
 73.3,
 64.5,
 104.2,
 62.8,
 71.7,
 71.7,
 80.5,
 73.3,
 78.0,
 78.3,
 81.7,
 83.3,
 85.0,
 118.8,
 128.7,
 99.5,
 77.2,
 95.0,
 112.2,
 99.2,
 124.5,
 97.2,
 120.0,
 80.5,
 110.0,


In [52]:
def windowed_dataset(data, window_size, batch_size, shuffle_buffer):
    """Build a shuffled, batched dataset of (inputs, label) sliding windows.

    Args:
        data: Sequence of values; a trailing axis of size 1 is added before
            slicing (assumes a 1-D series, as in the notebook's usage).
        window_size: Number of consecutive steps used as the model input.
            Each window spans ``window_size + 1`` steps so that the last
            step can serve as the label.
        batch_size: Number of (inputs, label) pairs per emitted batch.
        shuffle_buffer: Buffer size passed to ``Dataset.shuffle``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, label)`` batches, where
        ``inputs`` is every step of a window but the last and ``label`` is
        the last step, kept as a length-1 slice.
    """
    sequence_length = window_size + 1
    series = np.expand_dims(data, axis=1)

    # Slide a window over the series one step at a time and materialise each
    # window (itself a dataset) as a single tensor.
    windows = (
        tf.data.Dataset.from_tensor_slices(series)
        .window(sequence_length, shift=1, drop_remainder=True)
        .flat_map(lambda window: window.batch(sequence_length))
    )

    # Shuffle whole windows first, then split each into inputs and label.
    pairs = windows.shuffle(shuffle_buffer).map(
        lambda seq: (seq[:-1], seq[-1:])
    )
    return pairs.batch(batch_size)

In [53]:
window_ds = windowed_dataset(train_data, window_size=20, batch_size=10, shuffle_buffer=500)

In [54]:
for ds in window_ds.take(1):
    print(ds)

(<tf.Tensor: shape=(10, 20, 1), dtype=float64, numpy=
array([[[ 57.3],
        [ 73.8],
        [ 50. ],
        [ 50. ],
        [ 50. ],
        [ 47. ],
        [ 46.7],
        [ 43.3],
        [ 42.8],
        [ 40. ],
        [ 43.3],
        [ 41.7],
        [ 36.7],
        [ 33.7],
        [ 33.3],
        [ 45. ],
        [ 49.5],
        [ 26.7],
        [ 23.3],
        [ 23.3]],

       [[ 11.3],
        [ 10.8],
        [  0. ],
        [  0. ],
        [ 14.3],
        [  5.3],
        [ 29.7],
        [ 39.5],
        [ 11.3],
        [ 33.3],
        [ 20.8],
        [ 11.8],
        [  9. ],
        [ 15.7],
        [ 20.8],
        [ 21.5],
        [  6. ],
        [ 10.7],
        [ 19.7],
        [ 23.8]],

       [[133.8],
        [134.5],
        [158.3],
        [186.7],
        [193.7],
        [177.5],
        [243.3],
        [262.2],
        [295.5],
        [182.2],
        [223.3],
        [241.7],
        [398.2],
        [286. ],
        [255. ],
       

- 10은 batch_size
- 20은 window_size (train data)
- 1은 label

## 또다른 예제

- https://blog.devgenius.io/starting-with-tensorflow-datasets-part-1-an-intro-to-tf-datasets-9a26e2db4995

In [55]:
list_data = tf.data.Dataset.from_tensor_slices([1, 2, 3],)

In [56]:
list_data

<TensorSliceDataset shapes: (), types: tf.int32>

In [57]:
for ds in list_data:
    print(ds)

tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)


In [60]:
#Creating a tensorflow dataset using a numpy array
np_data = tf.data.Dataset.from_tensor_slices(np.array([[1, 2, 3],
                                                       [4, 5, 6,],
                                                       [7, 8, 9],
                                                       ]
                                               ))

In [61]:
for ds in np_data:
    print(ds)

tf.Tensor([1 2 3], shape=(3,), dtype=int32)
tf.Tensor([4 5 6], shape=(3,), dtype=int32)
tf.Tensor([7 8 9], shape=(3,), dtype=int32)


In [62]:
# Inspect the structure (shape and dtype) of the dataset's elements via element_spec.
list_data.element_spec

TensorSpec(shape=(), dtype=tf.int32, name=None)

In [63]:
np_data.element_spec

TensorSpec(shape=(3,), dtype=tf.int32, name=None)

이제 실제적인 데이터를 다뤄보자.

In [64]:
train, test = tf.keras.datasets.mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [68]:
images, lables = train

In [69]:
images.shape

(60000, 28, 28)

In [70]:
images = images/255.0

In [71]:
# Create your dataset
mnist_dataset = tf.data.Dataset.from_tensor_slices((images, lables))
# Element inspection
mnist_dataset.element_spec

(TensorSpec(shape=(28, 28), dtype=tf.float64, name=None),
 TensorSpec(shape=(), dtype=tf.uint8, name=None))

In [73]:
mnist_dataset

<TensorSliceDataset shapes: ((28, 28), ()), types: (tf.float64, tf.uint8)>

as_numpy_iterator() : 데이터의 텐서를 numpy로 변환해서 보여준다.

In [74]:
for X, y in mnist_dataset.as_numpy_iterator():
    print("Shape of X, y")
    print(X.shape, y.shape)
    print(type(X), ", This number is -->", y)
    break # to avoid iterating over all the training examples

Shape of X, y
(28, 28) ()
<class 'numpy.ndarray'> , This number is --> 5


#### 자주사용하는 메서드들

1) range 

In [75]:
range_dataset = tf.data.Dataset.range(5)

In [76]:
range_dataset

<RangeDataset shapes: (), types: tf.int64>

In [77]:
print(range_dataset.as_numpy_iterator())

<tensorflow.python.data.ops.dataset_ops._NumpyIterator object at 0x000001282B64D520>


In [78]:
print(list(range_dataset.as_numpy_iterator()))

[0, 1, 2, 3, 4]


2) map : map(map_func, )

In [81]:
def div_by_4(x):
    """Return ``x`` divided by 4 using true division."""
    quotient = x / 4
    return quotient

In [82]:
range_divided = range_dataset.map(div_by_4)

In [83]:
# Print the mapped dataset (range_divided) so the effect of div_by_4 is visible;
# printing range_dataset would only show the untouched source values.
print(list(range_divided.as_numpy_iterator()))

[0.0, 0.25, 0.5, 0.75, 1.0]


3) take

In [85]:
first_six_images_taken = mnist_dataset.take(6)

In [86]:
first_six_images_taken.element_spec

(TensorSpec(shape=(28, 28), dtype=tf.float64, name=None),
 TensorSpec(shape=(), dtype=tf.uint8, name=None))

In [87]:
print(list(first_six_images_taken.as_numpy_iterator()))

[(array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.  

In [88]:
for X, y in first_six_images_taken.as_numpy_iterator():
    print(X.shape, y)

(28, 28) 5
(28, 28) 0
(28, 28) 4
(28, 28) 1
(28, 28) 9
(28, 28) 2


4) skip

In [89]:
for X,y in mnist_dataset.skip(3).take(3).as_numpy_iterator():
   print(X.shape, y)

(28, 28) 1
(28, 28) 9
(28, 28) 2


5) repeat: repeat( count=None, name=None)

In [90]:
dataset2 = tf.data.Dataset.range(3)
repeated_dataset = dataset2.repeat(3)
print(repeated_dataset.element_spec)
print(list(repeated_dataset.as_numpy_iterator()))

TensorSpec(shape=(), dtype=tf.int64, name=None)
[0, 1, 2, 0, 1, 2, 0, 1, 2]


6) shuffle:shuffle( buffer_size, seed=None, reshuffle_each_iteration=None, name=None)

In [91]:
dataset3 = tf.data.Dataset.range(3)
print(list(dataset3.as_numpy_iterator()))
print(list(dataset3.shuffle(3).as_numpy_iterator()))

[0, 1, 2]
[0, 2, 1]


7) zip : zip(datasets,)

In [93]:
np_array = np.array([[1, 2, 3],
                    [4, 5, 6],
                    [7, 8, 9]])
labels = np.array([0, 0, 1])
d_ds = tf.data.Dataset.from_tensor_slices(np_array)
lab_tf = tf.data.Dataset.from_tensor_slices(labels)
# Zipping 2 tf datasets
zipped_dataset = tf.data.Dataset.zip((d_ds, lab_tf))
print('Array   , Label')
for x,y in zipped_dataset.as_numpy_iterator():
    print(x, ',' ,y)

Array   , Label
[1 2 3] , 0
[4 5 6] , 0
[7 8 9] , 1


8) batch:batch( batch_size, drop_remainder=False, num_parallel_calls=None, deterministic=None,name=None)

In [94]:
# drop_remainder = False
batched_ds = zipped_dataset.batch(2)
print('batched dataset, lables')
for i in batched_ds:
  print(i[0].shape, '        ,  ',i[1].shape)
print()
print('Elements of the batched dataset')
for X,y in batched_ds.as_numpy_iterator():
  print('X= ', X)
  print('y= ',y)

batched dataset, lables
(2, 3)         ,   (2,)
(1, 3)         ,   (1,)

Elements of the batched dataset
X=  [[1 2 3]
 [4 5 6]]
y=  [0 0]
X=  [[7 8 9]]
y=  [1]


In [95]:
# drop_remainder=True
batched_ds = zipped_dataset.batch(2, drop_remainder=True)
print('batched dataset, lables')
for i in batched_ds:
  print(i[0].shape, i[1].shape)
print()
print('Elements of the batched dataset')
for X,y in batched_ds.as_numpy_iterator():
  print('X= ', X)
  print('y= ',y)

batched dataset, lables
(2, 3) (2,)

Elements of the batched dataset
X=  [[1 2 3]
 [4 5 6]]
y=  [0 0]


9) window : window( size, shift=None, stride=1, drop_remainder=False, name=None)

데이터셋 윈도우를 리턴한다.  
각 윈도우는 입력 데이터의 서브셋으로 구성된 데이터셋이다.

시계열을 다룰 때 유용하다.

In [96]:
data = tf.data.Dataset.range(10)
print('Orginal data', list(data.as_numpy_iterator()))
#  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

Orginal data [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [97]:
windowed_data = data.window(5, shift=1, drop_remainder=True)

In [98]:
windowed_data

<WindowDataset shapes: DatasetSpec(TensorSpec(shape=(), dtype=tf.int64, name=None), TensorShape([])), types: DatasetSpec(TensorSpec(shape=(), dtype=tf.int64, name=None), TensorShape([]))>

In [103]:
# Use a dedicated loop name so the source dataset bound to `data` is not overwritten.
for window in windowed_data:
    print(window)

<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>


In [104]:
for window_data in windowed_data:
    for val in window_data:
        print(val.numpy(), end=" ")
    print()

0 1 2 3 4 
1 2 3 4 5 
2 3 4 5 6 
3 4 5 6 7 
4 5 6 7 8 
5 6 7 8 9 


In [105]:
shift = 2

In [106]:
for window_data in windowed_data:
    for val in window_data:
        print(val.numpy(), end=" ")
    print()

0 1 2 3 4 
1 2 3 4 5 
2 3 4 5 6 
3 4 5 6 7 
4 5 6 7 8 
5 6 7 8 9 


실습

In [107]:
# Source: the integers 0..9.
ndataset = tf.data.Dataset.range(10)
# Sliding windows of 5 elements, advancing by 1; incomplete trailing windows are dropped.
ndataset = ndataset.window(5, shift=1, drop_remainder=True)
# Turn each window (itself a dataset) into a single 5-element tensor.
ndataset = ndataset.flat_map(lambda window: window.batch(5))
# Split into inputs (first 4 values) and label (last value, kept as a length-1 tensor).
ndataset = ndataset.map(lambda window: (window[:-1], window[-1:]))
ndataset = ndataset.shuffle(buffer_size=10)
# Group 3 (inputs, label) pairs per batch and prefetch 1 batch ahead.
ndataset = ndataset.batch(3).prefetch(1)

In [108]:
for x, y in ndataset:
    print("x = ", list(x.numpy()))
    print("y = ", list(y.numpy()))

x =  [array([5, 6, 7, 8], dtype=int64), array([2, 3, 4, 5], dtype=int64), array([4, 5, 6, 7], dtype=int64)]
y =  [array([9], dtype=int64), array([6], dtype=int64), array([8], dtype=int64)]
x =  [array([0, 1, 2, 3], dtype=int64), array([1, 2, 3, 4], dtype=int64), array([3, 4, 5, 6], dtype=int64)]
y =  [array([4], dtype=int64), array([5], dtype=int64), array([7], dtype=int64)]
