In [1]:
import matplotlib as mlp
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras 
import warnings

warnings.filterwarnings('ignore')
print(tf.__version__)
for model in sklearn, pd, keras, np ,mlp:
    print(model.__name__, model.__version__)

2.1.0
sklearn 0.20.2
pandas 0.24.2
tensorflow_core.python.keras.api._v2.keras 2.2.4-tf
numpy 1.17.4
matplotlib 2.1.2


In [12]:
"""
tfrecord 格式
-> tf.train.Example
    -> tf.train.Features -> {"key": tf.train.Feature}
        -> tf.train.Feature -> tf.train.ByteList/ FloatList/ Int64.List
"""
favorite_books = [name.encode('utf-8') for name in ["Machine Learning", "AA"]]
favorite_books_byteList = tf.train.BytesList(value=favorite_books)
print(favorite_books_byteList)

hour_floatList = tf.train.FloatList(value=[15.3, 9.2, 8.5])
print(hour_floatList)

age_int64List = tf.train.Int64List(value=[23])
print(age_int64List)

features = tf.train.Features(
    feature = {
        "favorite_books": tf.train.Feature(bytes_list=favorite_books_byteList),
        "hour_floatList": tf.train.Feature(float_list=hour_floatList),
        "age_int64List": tf.train.Feature(int64_list=age_int64List)
    }
)
print(features)

value: "Machine Learning"
value: "AA"

value: 15.300000190734863
value: 9.199999809265137
value: 8.5

value: 23

feature {
  key: "age_int64List"
  value {
    int64_list {
      value: 23
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "Machine Learning"
      value: "AA"
    }
  }
}
feature {
  key: "hour_floatList"
  value {
    float_list {
      value: 15.300000190734863
      value: 9.199999809265137
      value: 8.5
    }
  }
}



In [13]:
# 有了features就可以生产Example
example = tf.train.Example(features=features)
print(example)

# 序列化
serialized_example = example.SerializeToString()
print(serialized_example)

features {
  feature {
    key: "age_int64List"
    value {
      int64_list {
        value: 23
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "Machine Learning"
        value: "AA"
      }
    }
  }
  feature {
    key: "hour_floatList"
    value {
      float_list {
        value: 15.300000190734863
        value: 9.199999809265137
        value: 8.5
      }
    }
  }
}

b'\nh\n"\n\x0ehour_floatList\x12\x10\x12\x0e\n\x0c\xcd\xcctA33\x13A\x00\x00\x08A\n*\n\x0efavorite_books\x12\x18\n\x16\n\x10Machine Learning\n\x02AA\n\x16\n\rage_int64List\x12\x05\x1a\x03\n\x01\x17'


In [14]:
# 将tfrecord保存到文件
output_dir = 'tfrecord_basic'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
filename = "test.tfrecord"
filename_fullpath = os.path.join(output_dir, filename)
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    for i in range(3):
        writer.write(serialized_example)

In [15]:
# 从文件中读取tfrecord
dataset = tf.data.TFRecordDataset([filename_fullpath])
for i in dataset:
    print(i)

tf.Tensor(b'\nh\n"\n\x0ehour_floatList\x12\x10\x12\x0e\n\x0c\xcd\xcctA33\x13A\x00\x00\x08A\n*\n\x0efavorite_books\x12\x18\n\x16\n\x10Machine Learning\n\x02AA\n\x16\n\rage_int64List\x12\x05\x1a\x03\n\x01\x17', shape=(), dtype=string)
tf.Tensor(b'\nh\n"\n\x0ehour_floatList\x12\x10\x12\x0e\n\x0c\xcd\xcctA33\x13A\x00\x00\x08A\n*\n\x0efavorite_books\x12\x18\n\x16\n\x10Machine Learning\n\x02AA\n\x16\n\rage_int64List\x12\x05\x1a\x03\n\x01\x17', shape=(), dtype=string)
tf.Tensor(b'\nh\n"\n\x0ehour_floatList\x12\x10\x12\x0e\n\x0c\xcd\xcctA33\x13A\x00\x00\x08A\n*\n\x0efavorite_books\x12\x18\n\x16\n\x10Machine Learning\n\x02AA\n\x16\n\rage_int64List\x12\x05\x1a\x03\n\x01\x17', shape=(), dtype=string)


In [25]:
# 将数据解析成可以理解的格式
    # 1. 定义期望解析的类型,(是否定长，什么类型 etc.)
    # 2. 解析出的结果会是稀疏的tensor，可以将其转化为非稀疏的tensor
expected_feature = {
    "favorite_books": tf.io.VarLenFeature(dtype=tf.string),
    "hour": tf.io.VarLenFeature(dtype=tf.float32),
    "age": tf.io.VarLenFeature(dtype=tf.int64),
}
dataset = tf.data.TFRecordDataset([filename_fullpath])
for serialized_example_tensor in dataset:
    example = tf.io.parse_single_example(serialized_example_tensor, expected_feature)
    print(example)
    # 稠密向量,0的位置默认填充
    books = tf.sparse.to_dense(example['favorite_books'], default_value='')
    for book in books:
        print(book.numpy().decode("utf-8"))

{'age': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f08ed30>, 'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f08e978>, 'hour': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f08ef28>}
Machine Learning
AA
{'age': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f08e7f0>, 'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f08e898>, 'hour': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f08e780>}
Machine Learning
AA
{'age': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f08ea58>, 'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f08e7b8>, 'hour': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f08ea20>}
Machine Learning
AA


In [26]:
# 保存成压缩文件
filename_fullpath_zip = filename_fullpath + ".zip"
options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter(filename_fullpath_zip, options=options) as writer:
    for i in range(3):
        writer.write(serialized_example)

In [27]:
# 压缩文件读取
dataset_zip = tf.data.TFRecordDataset([filename_fullpath_zip], compression_type="GZIP")
for serialized_example_tensor in dataset_zip:
    example = tf.io.parse_single_example(serialized_example_tensor, expected_feature)
    print(example)
    # 稠密向量,0的位置默认填充
    books = tf.sparse.to_dense(example['favorite_books'], default_value='')
    for book in books:
        print(book.numpy().decode("utf-8"))

{'age': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f098e80>, 'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f098d68>, 'hour': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f098d30>}
Machine Learning
AA
{'age': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f098cc0>, 'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f098c88>, 'hour': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f098c50>}
Machine Learning
AA
{'age': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f098cf8>, 'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f098c18>, 'hour': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x13f098be0>}
Machine Learning
AA
