In [40]:
# Render the TensorFlow logo as a full-width banner; HTML(...) is the cell's last
# expression, so the notebook displays it as rich output.
from IPython.display import display, clear_output, HTML 
HTML('<img align="center" src="https://www.tensorflow.org/images/tf_logo_social.png" width="100%">')

### Importing the libraries and dependencies

In [1]:
from __future__ import print_function, absolute_import, division

import os
os.environ["PYTHONHASHSEED"] = str(101)
import gc
gc.enable()

import tensorflow as tf
import pathlib 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline
np.set_printoptions(precision=4)
try:
  %tensorflow_version 2.x
except:
  pass

TensorFlow is already loaded. Please restart the runtime to change versions.


In [2]:
print(tf.__version__)

2.0.0


In [9]:
!pip install --upgrade tensorflow

Collecting tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/46/0f/7bd55361168bb32796b360ad15a25de6966c9c1beb58a8e30c01c8279862/tensorflow-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (86.3MB)
[K     |████████████████████████████████| 86.3MB 57.9MB/s 
Collecting tensorflow-estimator<2.1.0,>=2.0.0 (from tensorflow)
[?25l  Downloading https://files.pythonhosted.org/packages/95/00/5e6cdf86190a70d7382d320b2b04e4ff0f8191a37d90a422a2f8ff0705bb/tensorflow_estimator-2.0.0-py2.py3-none-any.whl (449kB)
[K     |████████████████████████████████| 450kB 39.2MB/s 
Collecting tensorboard<2.1.0,>=2.0.0 (from tensorflow)
[?25l  Downloading https://files.pythonhosted.org/packages/9b/a6/e8ffa4e2ddb216449d34cfcb825ebb38206bee5c4553d69e7bc8bc2c5d64/tensorboard-2.0.0-py3-none-any.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 30.0MB/s 
Collecting gast==0.2.2 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/4e/35/11749bf99b2d4e3cceb4d55ca22590b0d7c2c6

### Basic Mechanics

In [3]:
# Build a dataset with one scalar element per entry of the Python list.
dataset = tf.data.Dataset.from_tensor_slices([8,3,0,8,2,1])
# Bare last expression: echoes the dataset repr (element shapes and types).
dataset

<TensorSliceDataset shapes: (), types: tf.int32>

In [4]:
# A dataset is a Python iterable here; print every element as a NumPy value.
for element in dataset:
  print(element.numpy())

8
3
0
8
2
1


In [10]:
# Create an explicit iterator and pull only the first element from it.
it = iter(dataset)
print(next(it).numpy())

8


In [11]:
# Fold the whole dataset into one value: start from 0 and add each element (a sum).
print(dataset.reduce(0, lambda state, value: state + value).numpy())

22


### Dataset Structure

In [12]:
# Slicing a [4, 10] tensor along its first axis yields elements of shape (10,).
dataset1 = tf.data.Dataset.from_tensor_slices(tf.random.uniform([4,10]))
# element_spec describes the dtype and shape of a single element.
dataset1.element_spec

TensorSpec(shape=(10,), dtype=tf.float32, name=None)

In [14]:
# Slicing a tuple of tensors gives tuple-structured elements: a (10,) float
# vector paired with a (100,) int32 vector.
dataset2 = tf.data.Dataset.from_tensor_slices((
    tf.random.uniform([4,10]),
    tf.random.uniform([4,100], maxval=100, dtype=tf.int32),
))
dataset2.element_spec

(TensorSpec(shape=(10,), dtype=tf.float32, name=None),
 TensorSpec(shape=(100,), dtype=tf.int32, name=None))

In [15]:
# Zip pairs the elements of both datasets; the element structure nests
# accordingly: (dataset1 element, (dataset2 element tuple)).
dataset3 = tf.data.Dataset.zip((dataset1, dataset2))
dataset3.element_spec

(TensorSpec(shape=(10,), dtype=tf.float32, name=None),
 (TensorSpec(shape=(10,), dtype=tf.float32, name=None),
  TensorSpec(shape=(100,), dtype=tf.int32, name=None)))

In [18]:
# Dataset elements may also be sparse tensors (two non-zero values in a 3x4 shape).
dataset4 = tf.data.Dataset.from_tensors(tf.SparseTensor([[0,0],[1,2]],values=[1,2],dense_shape=[3,4]))
# value_type reports the Python class used to represent an element.
dataset4.element_spec.value_type

tensorflow.python.framework.sparse_tensor.SparseTensor

In [22]:
# Rebind `dataset` to four rows of ten random floats drawn with minval=1, maxval=100.
dataset = tf.data.Dataset.from_tensor_slices(
    tf.random.uniform(shape=[4,10], minval=1, maxval=100, dtype=tf.float32)
)
print(dataset.element_spec)
# Each element is one row of the [4, 10] tensor.
for row in dataset:
  print(row.numpy())

TensorSpec(shape=(10,), dtype=tf.float32, name=None)
[58.2229 95.8636 21.5689 43.651   8.8075 47.8708 15.9966 54.3221 96.4929
  1.2176]
[97.3124 78.929  14.7597 66.1848 56.8828 15.0722 97.1652 84.7762 48.597
 37.013 ]
[27.1404 85.7748 52.0497  7.8635 42.045  66.2701 31.8707 51.1625 35.7703
 14.3304]
[98.0913  2.7425 24.2658 23.3437 78.0316 21.912  50.1817 70.9067 78.0438
 27.2492]


In [24]:
# Redefine dataset2: the first component is now a scalar per element (sliced from
# a length-4 vector), the second is still a (100,) int32 vector.
dataset2 = tf.data.Dataset.from_tensor_slices(
    (tf.random.uniform([4]),
    tf.random.uniform([4,100], maxval=100, dtype=tf.int32)
))
dataset2

<TensorSliceDataset shapes: ((), (100,)), types: (tf.float32, tf.int32)>

In [25]:
dataset2.element_spec

(TensorSpec(shape=(), dtype=tf.float32, name=None),
 TensorSpec(shape=(100,), dtype=tf.int32, name=None))

In [26]:
# Rebuild the zipped dataset so it picks up the redefined dataset2
# (scalar float + (100,) int32 as the second component).
dataset3 = tf.data.Dataset.zip((dataset1,dataset2))
dataset3.element_spec

(TensorSpec(shape=(10,), dtype=tf.float32, name=None),
 (TensorSpec(shape=(), dtype=tf.float32, name=None),
  TensorSpec(shape=(100,), dtype=tf.int32, name=None)))

In [30]:
dataset3

<ZipDataset shapes: ((10,), ((), (100,))), types: (tf.float32, (tf.float32, tf.int32))>

In [35]:
# Print each nested (a, (b, c)) element with its full tensor repr.
for element in dataset3:
  print(element)

(<tf.Tensor: id=205, shape=(10,), dtype=float32, numpy=
array([0.439 , 0.226 , 0.0898, 0.8753, 0.1049, 0.3095, 0.4951, 0.8769,
       0.9157, 0.2549], dtype=float32)>, (<tf.Tensor: id=206, shape=(), dtype=float32, numpy=0.36811376>, <tf.Tensor: id=207, shape=(100,), dtype=int32, numpy=
array([55, 46, 47, 29, 68,  5, 45, 85,  7, 69, 48, 74, 10, 48,  9, 75, 52,
       59, 67, 98, 82, 49, 88,  4,  8, 70, 49, 55, 24, 95, 27, 78, 42, 85,
       74, 53, 17, 24, 89, 10, 84, 70, 91,  0, 99, 78, 57, 26, 43, 59,  7,
       39, 84, 23, 56, 17,  7, 23, 59, 11, 71, 38, 75, 86, 24, 56, 85, 33,
       40, 41, 55, 65, 61, 37, 88, 47, 19, 98, 32, 38, 92,  6, 79, 41, 75,
       28, 40, 65, 19, 93, 96,  1, 13, 28, 55,  2, 16, 85, 25, 13],
      dtype=int32)>))
(<tf.Tensor: id=208, shape=(10,), dtype=float32, numpy=
array([0.4383, 0.9911, 0.6324, 0.563 , 0.903 , 0.7714, 0.7813, 0.1077,
       0.1381, 0.6339], dtype=float32)>, (<tf.Tensor: id=209, shape=(), dtype=float32, numpy=0.7525537>, <tf.Tensor: id=2

In [36]:
# The loop target mirrors the element structure, so nested components can be
# unpacked directly; only the shapes are printed to keep the output short.
for a, (b,c) in dataset3:
  print(f"{a.shape}, {b.shape}, {c.shape}")

(10,), (), (100,)
(10,), (), (100,)
(10,), (), (100,)
(10,), (), (100,)


### Reading Input Data

#### Consuming NumPy arrays

In [0]:
# Load Fashion-MNIST via Keras; the result unpacks into a train and a test split.
train, test = tf.keras.datasets.fashion_mnist.load_data()

In [0]:
# The train split unpacks into images and their labels.
images, labels = train
# Scale pixel values by 1/255 (presumably from 0..255 to 0..1 — confirm the input range).
images = images / 255.0 
# One dataset element per (image, label) pair.
train_dataset = tf.data.Dataset.from_tensor_slices((images, labels))

In [0]:
# Tunable input-pipeline constants.
BATCH_SIZE = 64
SHUFFLE_BUFFER_SIZE = 100

# NOTE(review): this rebinds train_dataset from itself, so re-running the cell
# shuffles/batches the already-batched dataset. Re-run the cell that creates
# train_dataset first, or bind the result to a new name.
train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)

In [8]:
# Small dense classifier: flatten 28x28 inputs, one hidden layer, 10-way softmax.
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax'),
])

model.compile(
    optimizer=tf.keras.optimizers.RMSprop(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

# fit() consumes the batched dataset directly; being the last expression, its
# History return value is echoed below the training log.
model.fit(train_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fbf066870b8>

In [34]:
# Styled note about the memory cost of building a dataset from in-memory arrays.
HTML('<p style="color:#95B9C7;"><strong>Note:</strong>  The above code snippet will embed the features and labels arrays in your TensorFlow graph as tf.constant() operations. This works well for a small dataset, but wastes memory---because the contents of the array will be copied multiple times---and can run into the 2GB limit for the tf.GraphDef protocol buffer.</p>')

#### Consuming Python Generators

In [29]:
# Styled caution shown before the generator examples (spelling of the
# user-facing text corrected: "convenient", "scalability", "Python").
from IPython.display import display, clear_output, HTML
HTML('<p style="color:orangered;"><strong>Caution:</strong> While this is a convenient approach it has limited portability and scalability. It must run in the same Python process that created the generator, and is still subject to the <a href="https://en.wikipedia.org/wiki/Global_interpreter_lock">Python GIL.</a></p>')

  *  The Dataset.from_generator constructor converts a Python generator to a fully functional tf.data.Dataset.

  *  The constructor takes a callable as input, not an iterator. This allows it to restart the generator when it reaches the end. It takes an optional args argument, which is passed as the callable's arguments.

  *  The output_types argument is required because tf.data builds a tf.Graph internally, and graph edges require a tf.dtype.

In [0]:
def count(stop):
  """Yield 0, 1, 2, ... for as long as the value stays below `stop`.

  `stop` is only ever compared with `<`, so any value comparable to an int works;
  nothing is yielded when 0 is not below `stop`.
  """
  current = 0
  while True:
    if not current < stop:
      return
    yield current
    current += 1

In [10]:
# Sanity-check the plain Python generator before wrapping it in a dataset.
for value in count(10):
  print(value)

0
1
2
3
4
5
6
7
8
9


In [0]:
# Pass the generator FUNCTION (not a generator object); `args` supplies its
# `stop` argument, and each yielded value is declared as a scalar int32.
ds_counter = tf.data.Dataset.from_generator(count, args=[50], output_types=tf.int32, output_shapes=())

In [17]:
ds_counter.element_spec

TensorSpec(shape=(), dtype=tf.int32, name=None)

In [18]:
# repeat() makes the stream endless; take(5) keeps the first five batches of ten.
for count_batch in ds_counter.repeat().batch(10).take(5):
  print(count_batch.numpy())

[0 1 2 3 4 5 6 7 8 9]
[10 11 12 13 14 15 16 17 18 19]
[20 21 22 23 24 25 26 27 28 29]
[30 31 32 33 34 35 36 37 38 39]
[40 41 42 43 44 45 46 47 48 49]


In [19]:
# Same pipeline with a batch size of five: the first five batches cover 0..24.
for count_batch in ds_counter.repeat().batch(5).take(5):
  print(count_batch.numpy())

[0 1 2 3 4]
[5 6 7 8 9]
[10 11 12 13 14]
[15 16 17 18 19]
[20 21 22 23 24]


In [20]:
# Twenty batches of five need 100 values, so the 50-value generator is restarted
# once by repeat() — the printed sequence wraps back to 0 after 49.
for count_batch in ds_counter.repeat().batch(5).take(20):
  print(count_batch.numpy())

[0 1 2 3 4]
[5 6 7 8 9]
[10 11 12 13 14]
[15 16 17 18 19]
[20 21 22 23 24]
[25 26 27 28 29]
[30 31 32 33 34]
[35 36 37 38 39]
[40 41 42 43 44]
[45 46 47 48 49]
[0 1 2 3 4]
[5 6 7 8 9]
[10 11 12 13 14]
[15 16 17 18 19]
[20 21 22 23 24]
[25 26 27 28 29]
[30 31 32 33 34]
[35 36 37 38 39]
[40 41 42 43 44]
[45 46 47 48 49]


In [0]:
def gen_series():
  """Endlessly yield (index, samples) pairs.

  `index` counts up from 0. `samples` is a 1-D array of normally distributed
  values whose length is drawn from np.random.randint(0, 10), so it can be empty.
  Both draws use NumPy's global random state.
  """
  index = 0
  while True:
    # Draw the length first, then the values, one pair of draws per yielded item.
    length = np.random.randint(0, 10)
    samples = np.random.normal(size=(length,))
    yield index, samples
    index += 1

In [42]:
# The generator never terminates on its own; stop after index 6 has been printed.
for idx, values in gen_series():
  print(idx, ":", str(values))
  if idx > 5:
    break

0 : [0.8917]
1 : []
2 : [-1.685  -1.0627 -1.8413  1.4036  0.252  -1.795  -0.9282  1.3021  0.5005]
3 : [-0.2699 -2.2725 -0.038   1.7539 -2.6609 -0.4136  0.4193  2.2177]
4 : [-0.1253]
5 : []
6 : [ 0.9381  0.9124 -0.402   1.0858 -2.1528  0.4984]


In [43]:
# Wrap gen_series: the first component is a scalar int32, the second a float32
# vector whose length is unknown in advance (None).
ds_series = tf.data.Dataset.from_generator(
    gen_series,
    output_types = (tf.int32, tf.float32),
    output_shapes = ((), (None,))
)

ds_series

<DatasetV1Adapter shapes: ((), (None,)), types: (tf.int32, tf.float32)>

In [44]:
# Variable-length vectors cannot be stacked as-is; padded_batch pads each one to
# the longest vector in its batch (the printed batch shows trailing zeros).
ds_series_batch = ds_series.shuffle(20).padded_batch(10, padded_shapes=([], [None]))

# Inspect a single batch: the ids and the padded sequences.
ids, sequence_batch = next(iter(ds_series_batch))
print(ids.numpy())
print()
print(sequence_batch.numpy())

[16  1 12  4  2 24  7  8 15 22]

[[ 0.      0.      0.      0.      0.      0.      0.      0.      0.    ]
 [-0.8091  1.3989 -0.4963 -1.0462 -0.1312 -1.0703 -0.2454 -0.1684  0.    ]
 [-0.0797 -1.0968  0.6877  0.2361 -1.4262 -0.7754 -0.8297  1.1157  0.7561]
 [ 1.3459  0.      0.      0.      0.      0.      0.      0.      0.    ]
 [ 0.3995  0.7997  0.      0.      0.      0.      0.      0.      0.    ]
 [ 2.528  -0.3075  1.3746 -2.2849 -1.3409  0.1332  0.      0.      0.    ]
 [ 0.      0.      0.      0.      0.      0.      0.      0.      0.    ]
 [ 0.604   0.5408 -0.4383  0.7487  0.1049 -0.6179  0.      0.      0.    ]
 [-1.2918  1.1521 -0.7576 -0.2808  0.      0.      0.      0.      0.    ]
 [-0.9229  0.      0.      0.      0.      0.      0.      0.      0.    ]]


##### Wrapping preprocessing.image.ImageDataGenerator as tf.data.Dataset

In [0]:
# Download the flower photos archive and extract it (untar=True); `flowers` holds
# the local directory path that get_file returns.
flowers = tf.keras.utils.get_file(
    'flower_photos',
    'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
    untar = True
)

In [99]:
flowers

'/root/.keras/datasets/flower_photos'

In [0]:
# Keras image generator: multiplies pixel values by 1/255 and applies random
# rotations (rotation_range=20).
img_gen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1/255.,
    rotation_range = 20
)

In [80]:
# Pull a single (images, labels) batch from the directory iterator.
# Note: this rebinds the `images`/`labels` names used by the Fashion-MNIST cells.
images, labels = next(img_gen.flow_from_directory(flowers))

Found 3670 images belonging to 5 classes.


In [81]:
# Check dtype and shape of the batch; these values are what from_generator needs
# as output_types / output_shapes.
print(images.dtype, images.shape)
print(labels.dtype, labels.shape)

float32 (32, 256, 256, 3)
float32 (32, 5)


In [104]:
# Wrap the Keras image generator as a tf.data.Dataset.
# from_generator calls its first argument to obtain the generator, and
# flow_from_directory requires the image directory. Passing the bare method would
# fail with a missing `directory` argument once the dataset is iterated, so bind
# `flowers` in a zero-argument lambda.
ds = tf.data.Dataset.from_generator(
    lambda: img_gen.flow_from_directory(flowers),
    output_types=(tf.float32, tf.float32),
    output_shapes=([32,256,256,3],[32,5])
)
# NOTE(review): the shapes hard-code the batch of 32 observed above. 3670 images
# is not a multiple of 32, so a full pass presumably ends with a smaller batch —
# confirm, or relax the batch dimension to None before iterating a whole epoch.

print(ds)

<DatasetV1Adapter shapes: ((32, 256, 256, 3), (32, 5)), types: (tf.float32, tf.float32)>


In [0]:
# for value in ds.take(2):
#   print(value)

#### Consuming TFRecord Data

    The tf.data API supports a variety of file formats so that you can process large datasets that do not fit in memory. For example, the TFRecord file format is a simple record-oriented binary format that many TensorFlow applications use for training data. The tf.data.TFRecordDataset class enables you to stream over the contents of one or more TFRecord files as part of an input pipeline.

In [106]:
# Download the sample TFRecord file; fsns_test_file is what get_file returns and
# is later passed to TFRecordDataset as a filename.
fsns_test_file = tf.keras.utils.get_file('fsns.tfrec', "https://storage.googleapis.com/download.tensorflow.org/data/fsns-20160927/testdata/fsns-00000-of-00001")

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/fsns-20160927/testdata/fsns-00000-of-00001


    The filenames argument to the TFRecordDataset initializer can either be a string, a list of strings, or a tf.Tensor of strings. Therefore if you have two sets of files for training and validation purposes, you can create a factory method that produces the dataset, taking filenames as an input argument:

In [0]:
# Stream the records of the TFRecord file; each element is one scalar string.
dataset = tf.data.TFRecordDataset(filenames=[fsns_test_file])

In [109]:
dataset.element_spec

TensorSpec(shape=(), dtype=tf.string, name=None)

In [110]:
dataset

<TFRecordDatasetV2 shapes: (), types: tf.string>

    Many TensorFlow projects use serialized tf.train.Example records in their TFRecord files. These need to be decoded before they can be inspected:

In [114]:
# Take the first record and show it three ways: as a tensor, as raw bytes, and
# its dtype. The payload is still serialized at this point.
raw_example = next(iter(dataset))
print(raw_example)
print(raw_example.numpy())
print(raw_example.dtype)

tf.Tensor(b'\n\xeb\xf0\x07\n\x1a\n\x10image/orig_width\x12\x06\x1a\x04\n\x02\xd8\x04\n\x1e\n\nimage/text\x12\x10\n\x0e\n\x0cRue Perreyon\n(\n\x14image/unpadded_class\x12\x10\x1a\x0e\n\x0c1\x0b\x05\x00/\x05\x15\x15\x05!\x0c\x07\n\xe6\xee\x07\n\rimage/encoded\x12\xd3\xee\x07\n\xcf\xee\x07\n\xcb\xee\x07\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x02X\x00\x00\x00\x96\x08\x02\x00\x00\x00\x8dyaf\x00\x00\x00\tpHYs\x00\x00\x00\x00\x00\x00\x00\x00\x00\x9db&2\x00\x00 \x00IDATx\x9c\xa4\xbd\xd9\x96$\xc9\x91%&\x8b\xaa\x9a\xf9\x16\x11\x99UYUh4\x9a\xe7\xb0y\x0e\x7fv^\xf9\xc0\x7f\xe0\x1b\x1f\xf8;=\xbd\xcc\x00\xa8\xca%2|1U\x95e\x1eD\xcd\xc2#\xb3\xd0 \x87V\xc8@\xf8\x12\xe6f\xaa\xb2\\\xb9\xb28\xfe\x9f\xff\xc7\x7fqw\x00\x88\x9f\xaajf\x88\x08\x00\xf7?\x19\x10\xd6#\xde\xb9\xbd\xb4=\x1c\xe7\x01wrB\xc2\xbb\xc3\xdd[k\xf1\xfe\xfb\x83\x1c\xd0\xbfy\xee\xf5\x84\xdf<\xe3w\x7f\xbd\x9dJU\xbf\xfd{\x00w\x04\x00fffw73UqG"\xfe\xe6\xbe\x00\xa0\xb5v\xf7\xa4\xc7\xef\x84\x88\xeb-\xbb\xeb\xb8\x06\x04@$"f\x8e\x8f\x16\x91X.3sw"

In [0]:
# Decode the raw bytes into a tf.train.Example message so features can be read by key.
parsed = tf.train.Example.FromString(raw_example.numpy())

In [115]:
parsed

features {
  feature {
    key: "image/class"
    value {
      int64_list {
        value: 49
        value: 11
        value: 5
        value: 0
        value: 47
        value: 5
        value: 21
        value: 21
        value: 5
        value: 33
        value: 12
        value: 7
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
        value: 133
      }
    }
  }
  feature {
    key: "image/encoded"
    value {
      bytes_list {
        value: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\002X\000\000\000\226\010\002\000\000\000\215yaf\000\000\000\tpHYs\000\000\000\000\000\000\0

In [116]:
parsed.features.feature['image/text']

bytes_list {
  value: "Rue Perreyon"
}

In [117]:
parsed.features.feature["image/width"]

int64_list {
  value: 600
}

In [118]:
parsed.features.feature["image/class"]

int64_list {
  value: 49
  value: 11
  value: 5
  value: 0
  value: 47
  value: 5
  value: 21
  value: 21
  value: 5
  value: 33
  value: 12
  value: 7
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
  value: 133
}

#### Consuming text data

    Many datasets are distributed as one or more text files. The tf.data.TextLineDataset provides an easy way to extract lines from one or more text files. Given one or more filenames, a TextLineDataset will produce one string-valued element per line of those files.

In [120]:
# Three translations of the same text, hosted under one base URL.
directory_url = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
file_names = ['cowper.txt', 'derby.txt', 'butler.txt']

# Download each file; get_file returns the local path, collected in file-name order.
file_path = [
    tf.keras.utils.get_file(file_name, directory_url+file_name)
    for file_name in file_names
]

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/cowper.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/derby.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/butler.txt


In [122]:
file_path

['/root/.keras/datasets/cowper.txt',
 '/root/.keras/datasets/derby.txt',
 '/root/.keras/datasets/butler.txt']

In [126]:
# One string element per line, reading the listed files one after another.
dataset = tf.data.TextLineDataset(file_path)
print(dataset.element_spec)
print()
# Preview the first ten lines (they come from the first file).
for line in dataset.take(10):
  print(line.numpy())

TensorSpec(shape=(), dtype=tf.string, name=None)

b"\xef\xbb\xbfAchilles sing, O Goddess! Peleus' son;"
b'His wrath pernicious, who ten thousand woes'
b"Caused to Achaia's host, sent many a soul"
b'Illustrious into Ades premature,'
b'And Heroes gave (so stood the will of Jove)'
b'To dogs and to all ravening fowls a prey,'
b'When fierce dispute had separated once'
b'The noble Chief Achilles from the son'
b'Of Atreus, Agamemnon, King of men.'
b"Who them to strife impell'd? What power divine?"


In [129]:
files_ds = tf.data.Dataset.from_tensor_slices(file_path)
print(files_ds)

<TensorSliceDataset shapes: (), types: tf.string>


In [130]:
# Turn each filename into its own TextLineDataset and alternate between the
# three of them, so consecutive lines come from different files.
lines_ds = files_ds.interleave(tf.data.TextLineDataset, cycle_length=3)
for i, line in enumerate(lines_ds.take(9)):
  # Blank line before every group of three (one line per file).
  if i%3==0:
    print()
  print(line.numpy())


b"\xef\xbb\xbfAchilles sing, O Goddess! Peleus' son;"
b"\xef\xbb\xbfOf Peleus' son, Achilles, sing, O Muse,"
b'\xef\xbb\xbfSing, O goddess, the anger of Achilles son of Peleus, that brought'

b'His wrath pernicious, who ten thousand woes'
b'The vengeance, deep and deadly; whence to Greece'
b'countless ills upon the Achaeans. Many a brave soul did it send'

b"Caused to Achaia's host, sent many a soul"
b'Unnumbered ills arose; which many a soul'
b'hurrying down to Hades, and many a hero did it yield a prey to dogs and'


In [131]:
# Download the Titanic CSV and read it as raw text lines (no CSV parsing yet).
titanic_file = tf.keras.utils.get_file('train.csv', "https://storage.googleapis.com/tf-datasets/titanic/train.csv")
titanic_lines = tf.data.TextLineDataset(titanic_file)

Downloading data from https://storage.googleapis.com/tf-datasets/titanic/train.csv


In [134]:
# Show the header row followed by the first nine data rows.
for csv_line in titanic_lines.take(10):
  print(csv_line.numpy())

b'survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone'
b'0,male,22.0,1,0,7.25,Third,unknown,Southampton,n'
b'1,female,38.0,1,0,71.2833,First,C,Cherbourg,n'
b'1,female,26.0,0,0,7.925,Third,unknown,Southampton,y'
b'1,female,35.0,1,0,53.1,First,C,Southampton,n'
b'0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y'
b'0,male,2.0,3,1,21.075,Third,unknown,Southampton,n'
b'1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n'
b'1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n'
b'1,female,4.0,1,1,16.7,Third,G,Southampton,n'


In [0]:
def survived(line):
  """Return a boolean tensor that is True when the line's first character is not "0"."""
  first_char = tf.strings.substr(line, 0, 1)
  return tf.not_equal(first_char, "0")

# Skip the header row, then keep only the lines for which `survived` is True.
survivors = titanic_lines.skip(1).filter(survived)

In [139]:
# Show the first six surviving passengers. take() bounds the stream, replacing
# the hand-maintained counter and break with the idiom used throughout the notebook.
for survivor_line in survivors.take(6):
  print(survivor_line.numpy())

b'1,female,38.0,1,0,71.2833,First,C,Cherbourg,n'
b'1,female,26.0,0,0,7.925,Third,unknown,Southampton,y'
b'1,female,35.0,1,0,53.1,First,C,Southampton,n'
b'1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n'
b'1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n'
b'1,female,4.0,1,1,16.7,Third,G,Southampton,n'


#### Consuming CSV files

In [0]:
# Same file and URL as in the text-data section; repeated so the CSV section can
# be run on its own.
titanic_file = tf.keras.utils.get_file("train.csv", "https://storage.googleapis.com/tf-datasets/titanic/train.csv")

In [142]:
# DataFrame.from_csv is deprecated (and removed in later pandas releases);
# read_csv is its documented replacement and loads the same columns.
df = pd.read_csv(titanic_file, index_col=None)
df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


In [143]:
# dict(df) maps column name -> column, so every dataset element is a dict with
# one value per column (a single passenger row).
titanic_slices = tf.data.Dataset.from_tensor_slices(dict(df))
for feature_batch in titanic_slices.take(1):
  for key, value in feature_batch.items():
    # {!r:20s} pads the quoted key to 20 characters so the values line up.
    print("   {!r:20s}: {}".format(key, value))

   'survived'          : 0
   'sex'               : b'male'
   'age'               : 22.0
   'n_siblings_spouses': 1
   'parch'             : 0
   'fare'              : 7.25
   'class'             : b'Third'
   'deck'              : b'unknown'
   'embark_town'       : b'Southampton'
   'alone'             : b'n'


In [144]:
# Read the CSV straight from disk in batches of four; the "survived" column is
# split off as the label, so each element is a (features dict, label) pair.
titanic_batches = tf.data.experimental.make_csv_dataset(
    titanic_file, batch_size=4,
    label_name="survived"
)

Instructions for updating:
Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.experimental.AUTOTUNE)` instead. If sloppy execution is desired, use `tf.data.Options.experimental_determinstic`.
Instructions for updating:
Use `tf.data.Dataset.shuffle(buffer_size, seed)` followed by `tf.data.Dataset.repeat(count)`. Static tf.data optimizations will take care of using the fused implementation.


In [146]:
titanic_batches.element_spec

(OrderedDict([('sex', TensorSpec(shape=(4,), dtype=tf.string, name=None)),
              ('age', TensorSpec(shape=(4,), dtype=tf.float32, name=None)),
              ('n_siblings_spouses',
               TensorSpec(shape=(4,), dtype=tf.int32, name=None)),
              ('parch', TensorSpec(shape=(4,), dtype=tf.int32, name=None)),
              ('fare', TensorSpec(shape=(4,), dtype=tf.float32, name=None)),
              ('class', TensorSpec(shape=(4,), dtype=tf.string, name=None)),
              ('deck', TensorSpec(shape=(4,), dtype=tf.string, name=None)),
              ('embark_town',
               TensorSpec(shape=(4,), dtype=tf.string, name=None)),
              ('alone', TensorSpec(shape=(4,), dtype=tf.string, name=None))]),
 TensorSpec(shape=(4,), dtype=tf.int32, name=None))

In [147]:
# Inspect one batch: the label vector first, then every feature column.
for feature_batch, label_batch in titanic_batches.take(1):
  print("Survived: {}".format(label_batch))
  print("features: ")
  for key, value in feature_batch.items():
    print(" {!r:20s}: {}".format(key, value))

Survived: [0 0 0 0]
features: 
 'sex'               : [b'male' b'male' b'male' b'male']
 'age'               : [31. 43. 37. 17.]
 'n_siblings_spouses': [0 0 1 0]
 'parch'             : [0 0 0 0]
 'fare'              : [50.4958  8.05   26.      8.6625]
 'class'             : [b'First' b'Third' b'Second' b'Third']
 'deck'              : [b'A' b'unknown' b'unknown' b'unknown']
 'embark_town'       : [b'Southampton' b'Southampton' b'Southampton' b'Southampton']
 'alone'             : [b'y' b'y' b'n' b'y']


In [0]:
# Same reader restricted to three columns via select_columns; "survived" is
# still split off as the label. The second positional argument is the batch size.
titanic_batches = tf.data.experimental.make_csv_dataset(titanic_file, 4, label_name="survived", select_columns=['class', 'fare', 'survived'])

In [150]:
# Only the selected feature columns remain in the features dict.
for feature_batch, label_batch in titanic_batches.take(1):
  print("survived: {}".format(label_batch))
  for key, value in feature_batch.items():
    print("   {!r:20s}: {}".format(key, value))

survived: [0 0 0 0]
   'fare'              : [ 7.8958  9.4833 10.5    53.1   ]
   'class'             : [b'Third' b'Third' b'Second' b'First']


In [151]:
# One dtype per CSV column, in column order.
titanic_types = [tf.int32, tf.string, tf.float32, tf.int32, tf.int32, tf.float32, tf.string, tf.string, tf.string, tf.string]
# Lower-level reader: header=True skips the header row, and each element is a
# tuple of typed scalars (one per column) rather than a dict.
dataset = tf.data.experimental.CsvDataset(titanic_file, titanic_types, header=True)
for line in dataset.take(10):
  print([item.numpy() for item in line])

[0, b'male', 22.0, 1, 0, 7.25, b'Third', b'unknown', b'Southampton', b'n']
[1, b'female', 38.0, 1, 0, 71.2833, b'First', b'C', b'Cherbourg', b'n']
[1, b'female', 26.0, 0, 0, 7.925, b'Third', b'unknown', b'Southampton', b'y']
[1, b'female', 35.0, 1, 0, 53.1, b'First', b'C', b'Southampton', b'n']
[0, b'male', 28.0, 0, 0, 8.4583, b'Third', b'unknown', b'Queenstown', b'y']
[0, b'male', 2.0, 3, 1, 21.075, b'Third', b'unknown', b'Southampton', b'n']
[1, b'female', 27.0, 0, 2, 11.1333, b'Third', b'unknown', b'Southampton', b'n']
[1, b'female', 14.0, 1, 0, 30.0708, b'Second', b'unknown', b'Cherbourg', b'n']
[1, b'female', 4.0, 1, 1, 16.7, b'Third', b'G', b'Southampton', b'n']
[0, b'male', 20.0, 0, 0, 8.05, b'Third', b'unknown', b'Southampton', b'y']


In [152]:
%%writefile missing.csv
1,2,3,4
,2,3,4
1,,3,4
1,2,,4
1,2,3,
,,,

Writing missing.csv


In [154]:
# One default per column: an empty field is replaced by 999, and the int
# defaults make the columns int32.
record_defaults = [999, 999, 999, 999]
dataset = tf.data.experimental.CsvDataset("missing.csv",record_defaults=record_defaults)
# Stack the four per-column scalars of every row into one (4,) vector.
dataset = dataset.map(lambda *items: tf.stack(items))
dataset

<MapDataset shapes: (4,), types: tf.int32>

In [155]:
# Print every parsed row; empty CSV fields show up as the 999 default.
for row in dataset:
  print(row.numpy())

[1 2 3 4]
[999   2   3   4]
[  1 999   3   4]
[  1   2 999   4]
[  1   2   3 999]
[999 999 999 999]
