In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras


In [3]:
# TFRecord layout, from the outermost message inward:
#   tf.train.Example
#     -> tf.train.Features: a map {"key": tf.train.Feature}
#       -> tf.train.Feature: holds a tf.train.BytesList, FloatList or Int64List

# Titles are UTF-8 encoded to bytes before being placed in the BytesList.
favorite_books = [title.encode("utf-8") for title in ("Java", "How to", "cooking")]
favorite_books_bytelist = tf.train.BytesList(value=favorite_books)
# 2.3 is echoed back as 2.299999952316284 once it has been stored in the FloatList.
hours_floatlist = tf.train.FloatList(value=[1.0, 2.3, 6.0])
age_int64list = tf.train.Int64List(value=[43])

# Show the three typed lists in the order they were built.
for typed_list in (favorite_books_bytelist, hours_floatlist, age_int64list):
    print(typed_list)

# Wrap each typed list in a Feature and collect them under their keys.
features = tf.train.Features(
    feature={
        "favorite_books": tf.train.Feature(bytes_list=favorite_books_bytelist),
        "hours": tf.train.Feature(float_list=hours_floatlist),
        "age": tf.train.Feature(int64_list=age_int64list),
    }
)
print(features)

value: "Java"
value: "How to"
value: "cooking"

value: 1.0
value: 2.299999952316284
value: 6.0

value: 43

feature {
  key: "age"
  value {
    int64_list {
      value: 43
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "Java"
      value: "How to"
      value: "cooking"
    }
  }
}
feature {
  key: "hours"
  value {
    float_list {
      value: 1.0
      value: 2.299999952316284
      value: 6.0
    }
  }
}



In [4]:
# Wrap the feature map in an Example message, then serialize it to the byte
# string that the TFRecord writers consume.
example = tf.train.Example(features=features)
serialized_example = example.SerializeToString()

print(example)
print(serialized_example)

features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 43
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "Java"
        value: "How to"
        value: "cooking"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 1.0
        value: 2.299999952316284
        value: 6.0
      }
    }
  }
}

b'\nV\n\x19\n\x05hours\x12\x10\x12\x0e\n\x0c\x00\x00\x80?33\x13@\x00\x00\xc0@\n+\n\x0efavorite_books\x12\x19\n\x17\n\x04Java\n\x06How to\n\x07cooking\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01+'


In [5]:
# Write the same serialized Example four times into an uncompressed TFRecord file.
output_dir = "tfrecord_basic"
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race of testing os.path.exists() before calling os.mkdir().
os.makedirs(output_dir, exist_ok=True)

filename = "test.tfrecords"
file_path = os.path.join(output_dir, filename)
with tf.io.TFRecordWriter(file_path) as writer:
    for i in range(4):
        writer.write(serialized_example)

In [7]:
# Parsing spec: favorite_books and hours are variable-length (they come back
# as sparse tensors); age is a single scalar int64.
expected_features = {
    "favorite_books": tf.io.VarLenFeature(tf.string),
    "hours": tf.io.VarLenFeature(tf.float32),
    "age": tf.io.FixedLenFeature(shape=[], dtype=tf.int64),
}

dataset = tf.data.TFRecordDataset(filenames=[file_path])
for record in dataset:
    # NOTE: rebinds `example`, which previously held the tf.train.Example proto.
    example = tf.io.parse_single_example(record, expected_features)
    # Densify the variable-length feature so its entries can be iterated.
    books = tf.sparse.to_dense(example["favorite_books"], default_value=b"")
    for title in books:
        print(title.numpy().decode("utf-8"))

Java
How to
cooking
Java
How to
cooking
Java
How to
cooking
Java
How to
cooking


In [14]:
# Write two GZIP-compressed copies of the serialized Example.
# The path is derived from file_path (not the bare filename) so the output lands
# inside output_dir instead of the current working directory.
# NOTE: despite the ".zip" suffix, the content is a GZIP-compressed TFRecord
# stream, not a ZIP archive.
file_path_zip = file_path + ".zip"
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter(file_path_zip, options=options) as writer:
    for i in range(2):
        writer.write(serialized_example)

In [16]:
# Read the compressed file back; "GZIP" matches the compression type the
# file was written with.
dataset_zip = tf.data.TFRecordDataset(
    filenames=[file_path_zip],
    compression_type="GZIP",
)
for record in dataset_zip:
    example = tf.io.parse_single_example(record, expected_features)
    # Densify the variable-length feature so its entries can be iterated.
    books = tf.sparse.to_dense(example["favorite_books"], default_value=b"")
    for title in books:
        print(title.numpy().decode("utf-8"))

Java
How to
cooking
Java
How to
cooking
