In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Report the TensorFlow and interpreter versions, then the name and version
# of every library this notebook relies on.
print(tf.__version__)
print(sys.version_info)
for lib in (mpl, np, pd, sklearn, tf, keras):
    print(lib.__name__, lib.__version__)

2.0.0
sys.version_info(major=3, minor=6, micro=5, releaselevel='final', serial=0)
matplotlib 2.2.2
numpy 1.19.2
pandas 0.23.0
sklearn 0.23.2
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


tfrecord文件格式

    -> tf.train.Example
        -> tf.train.Features -> {"key": tf.train.Feature}
            -> tf.train.Feature -> tf.train.BytesList/FloatList/Int64List

In [2]:
# BytesList stores bytes, so each book title is UTF-8 encoded first.
favorite_books = [title.encode("utf-8") for title in ("maching learning", "cc150")]

# One typed list per value kind: bytes, floats, and 64-bit integers.
favorite_books_bytelist = tf.train.BytesList(value=favorite_books)
hours_floatlist = tf.train.FloatList(value=[15.5, 9.5, 7.0, 8.0])
age_int64list = tf.train.Int64List(value=[42])
for typed_list in (favorite_books_bytelist, hours_floatlist, age_int64list):
    print(typed_list)

# Wrap each typed list in a Feature and collect them under string keys.
features = tf.train.Features(feature={
    "favorite_books": tf.train.Feature(bytes_list=favorite_books_bytelist),
    "hours": tf.train.Feature(float_list=hours_floatlist),
    "age": tf.train.Feature(int64_list=age_int64list),
})

print(features)

value: "maching learning"
value: "cc150"

value: 15.5
value: 9.5
value: 7.0
value: 8.0

value: 42

feature {
  key: "age"
  value {
    int64_list {
      value: 42
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "maching learning"
      value: "cc150"
    }
  }
}
feature {
  key: "hours"
  value {
    float_list {
      value: 15.5
      value: 9.5
      value: 7.0
      value: 8.0
    }
  }
}



In [3]:
# Wrap the feature map in an Example message and show its text form.
example = tf.train.Example(features=features)
print(example)

# Serialize the message to the byte string that is later written to the
# TFRecord file.
serialized_example = example.SerializeToString()
print(serialized_example)

features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 42
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "maching learning"
        value: "cc150"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 15.5
        value: 9.5
        value: 7.0
        value: 8.0
      }
    }
  }
}

b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10maching learning\n\x05cc150\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*'


tf.io.TFRecordWriter(filename_fullpath) 输出文本为tfrecord格式

tf.data.TFRecordDataset([filename_fullpath]) 根据路径读取tfrecord格式文件

输出tfrecord文件

In [6]:
output_dir = 'tfrecord_basic'
# exist_ok=True keeps this cell safe to re-run and removes the gap between
# checking for the directory and creating it.
os.makedirs(output_dir, exist_ok=True)

filename = "test.tfrecords"
filename_fullpath = os.path.join(output_dir, filename)
# Write the same serialized Example three times so the file holds three records.
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    for i in range(3):
        writer.write(serialized_example)

读取tfrecord文件

In [7]:
# Iterate over the file written above; each element is printed as-is, i.e. as
# a scalar string tensor holding one still-serialized record.
record_paths = [filename_fullpath]
dataset = tf.data.TFRecordDataset(record_paths)
for serialized_example_tensor in dataset:
    print(serialized_example_tensor)

tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10maching learning\n\x05cc150\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*', shape=(), dtype=string)
tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10maching learning\n\x05cc150\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*', shape=(), dtype=string)
tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10maching learning\n\x05cc150\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*', shape=(), dtype=string)


为了解析每个输入样本每一列数据，需要定义一个解析字典。

tensorflow提供了三种方式：

    FixedLenFeature、VarLenFeature、FixedLenSequenceFeature，

    分别解析定长特征、变长特征、定长序列特征。

FixedLenFeature() 函数有三个参数：

（1）shape：输入数据的shape。

（2）dtype：输入的数据类型。

（3）default_value：如果样本缺少该特征，则使用该值。它必须与 dtype 和指定的 shape 兼容。


tf.io.parse_single_example(
    serialized,
    features,
    example_names=None,
    name=None
)

参数：

    serialized：一个标量字符串张量，单个序列化的例子。
    features：一个 dict，将特征键映射到 FixedLenFeature 或 VarLenFeature 值。
    name：此操作的名称（可选）。
    example_names:(可选）标量字符串张量，关联的名称。
    返回：

一个 dict，将特征键映射到 Tensor 和 SparseTensor 值。

tf.io.FixedLenFeature 这种方法解析的结果为一个 Tensor，tf.io.VarLenFeature 这种方法得到的解析结果为 SparseTensor，用于处理稀疏数据。

In [8]:
# Parsing spec: one entry per feature key stored in the records. The two
# VarLenFeature entries are declared variable-length; "age" is a scalar int64.
expected_features = {
    "favorite_books": tf.io.VarLenFeature(dtype=tf.string),
    "hours": tf.io.VarLenFeature(dtype=tf.float32),
    "age": tf.io.FixedLenFeature([], dtype=tf.int64),
}

dataset = tf.data.TFRecordDataset([filename_fullpath])
for serialized_example_tensor in dataset:
    # Decode one serialized record into a dict keyed by feature name.
    example = tf.io.parse_single_example(
        serialized=serialized_example_tensor,
        features=expected_features,
    )
    print(example)

{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000000012534B38>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000000012538E80>, 'age': <tf.Tensor: id=46, shape=(), dtype=int64, numpy=42>}
{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000000012506208>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x00000000126B8A58>, 'age': <tf.Tensor: id=55, shape=(), dtype=int64, numpy=42>}
{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000000000BD03780>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x00000000125061D0>, 'age': <tf.Tensor: id=64, shape=(), dtype=int64, numpy=42>}


In [11]:
for serialized_example_tensor in dataset:
    example = tf.io.parse_single_example(serialized_example_tensor, expected_features)

    # "favorite_books" is parsed as a sparse tensor; convert it to a dense one
    # (filling gaps with empty bytes) so it can be iterated element by element.
    sparse_books = example["favorite_books"]
    books = tf.sparse.to_dense(sparse_books, default_value=b"")

    for book in books:
        title = book.numpy().decode("utf-8")
        print(title)

maching learning
cc150
maching learning
cc150
maching learning
cc150


输出为压缩文件（GZIP 压缩，文件名沿用 .zip 后缀）

In [14]:
# The records are GZIP-compressed even though the path ends in ".zip".
filename_fullpath_zip = f"{filename_fullpath}.zip"
options = tf.io.TFRecordOptions(compression_type="GZIP")

# Write the same serialized Example three times through the compressing writer.
with tf.io.TFRecordWriter(filename_fullpath_zip, options=options) as writer:
    for i in range(3):
        writer.write(serialized_example)

读取压缩文件（GZIP）

In [15]:
# Read the compressed file back, using the same compression type ("GZIP")
# that was used to write it.
dataset_zip = tf.data.TFRecordDataset([filename_fullpath_zip],
                                      compression_type="GZIP")

for serialized_example_tensor in dataset_zip:
    # Keep the parsed result. Without this assignment the lines below would
    # silently reuse whatever `example` an earlier cell left in the kernel
    # instead of the record just read from the compressed file.
    example = tf.io.parse_single_example(
        serialized_example_tensor,
        expected_features
    )

    # Densify the sparse "favorite_books" feature so it can be iterated.
    books = tf.sparse.to_dense(example["favorite_books"],
                               default_value=b"")

    for book in books:
        print(book.numpy().decode("utf-8"))


maching learning
cc150
maching learning
cc150
maching learning
cc150
