In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [10]:
# Make sure the output directory exists so this cell also runs on a fresh
# checkout; tf.io.gfile.makedirs succeeds when the directory is already there.
tf.io.gfile.makedirs("tfRecordFile")

# Write two raw byte-string records into a GZIP-compressed TFRecord file.
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter("tfRecordFile/my_data.tfrecord", options) as f:
    f.write(b"this is the first record")
    f.write(b"this is the second record")

In [11]:
# Read the records back and print each one as a scalar string tensor.
# GZIP is passed here as well, mirroring the compression used when the file
# was written.
filepaths = ["tfRecordFile/my_data.tfrecord"]
dataset = tf.data.TFRecordDataset(
    filenames=filepaths,
    compression_type="GZIP",
)
for item in dataset:
    print(item)

tf.Tensor(b'this is the first record', shape=(), dtype=string)
tf.Tensor(b'this is the second record', shape=(), dtype=string)


In [16]:
%%writefile tfRecordFile/person.proto
// Schema of the Person message used by the protobuf examples in this notebook.
syntax = "proto3";
message Person {
  // Single string value, field number 1.
  string name = 1;
  // 32-bit integer value, field number 2.
  int32 id = 2;
  // Zero or more string values, field number 3.
  repeated string email = 3;
}

Overwriting tfRecordFile/person.proto


In [19]:
from tfRecordFile.person_pb2 import Person

In [21]:
# Build a Person message and look at a few of its fields.
person = Person(name="Al", id=123, email=["a@b.com"])
print(person)  # text rendering of the whole message
print(person.name)
print(person.email[0])

# The repeated `email` field supports list-style mutation; add a second address.
person.email.extend(["c@d.com"])

# Serialize to bytes, then parse those bytes into a fresh, empty Person.
s = person.SerializeToString()
print(s)
person2 = Person()
parse_result = person2.ParseFromString(s)
print(parse_result)  # whatever ParseFromString returned (24 in the recorded output)
print(person == person2)  # the parsed copy should compare equal to the original

name: "Al"
id: 123
email: "a@b.com"

Al
a@b.com
b'\n\x02Al\x10{\x1a\x07a@b.com\x1a\x07c@d.com'
24
True


In [22]:
# Person field name -> TensorFlow dtype to decode it as (insertion order is
# the order in which the fields are requested).
person_field_types = {"name": tf.string, "id": tf.int32, "email": tf.string}

# Decode the serialized Person `s` with a TensorFlow op instead of the
# generated Python class.
# NOTE(review): no visible cell creates tfRecordFile/person.desc — presumably
# it is generated from person.proto outside the notebook; confirm.
person_tf = tf.io.decode_proto(
    bytes=s,
    message_type="Person",
    field_names=list(person_field_types),
    output_types=list(person_field_types.values()),
    descriptor_source="tfRecordFile/person.desc",
)
print(person_tf.values)

[<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Al'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([123])>, <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'a@b.com', b'c@d.com'], dtype=object)>]


In [26]:
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example

In [28]:
%%writefile tfRecordFile/tfProto.proto
// Message definitions for Example / Features / Feature and the three value
// list types. The message names match the classes this notebook imports from
// tensorflow.train.
syntax = "proto3";
// List of byte strings.
message BytesList { repeated bytes value = 1; }
// List of floats, declared with packed encoding.
message FloatList { repeated float value = 1 [packed = true]; }
// List of 64-bit integers, declared with packed encoding.
message Int64List { repeated int64 value = 1 [packed = true]; }
// A Feature carries one of the three list types; `oneof` means at most one
// of the members is set at a time.
message Feature {
    oneof kind {
        BytesList bytes_list = 1;
        FloatList float_list = 2;
        Int64List int64_list = 3;
    }
};
// Map from feature name to Feature.
message Features { map<string, Feature> feature = 1; };
// An Example wraps a single Features message.
message Example { Features features = 1; };

Writing tfRecordFile/tfProto.proto


In [50]:
# Show that a b'...' literal is a bytes object while a plain quoted literal is a str.
print(f"type of b'123' is {type(b'123')}")
print(f"type of '123' is {type('123')}")

type of b'123' is <class 'bytes'>
type of '123' is <class 'str'>


In [54]:
# --- Build a tf.train.Example step by step --------------------------------
name_feature = Feature(bytes_list=BytesList(value=[b"Alice"]))
id_feature = Feature(int64_list=Int64List(value=[123]))
emails_feature = Feature(bytes_list=BytesList(value=[b"a@b.com", b"c@d.com"]))
feature_dict = {"name": name_feature, "id": id_feature, "emails": emails_feature}
person_features = Features(feature=feature_dict)
person_example = Example(features=person_features)

# --- The same Example written as a single nested expression ---------------
person_example_ = Example(
    features=Features(
        feature={
            "name": Feature(bytes_list=BytesList(value=[b"Alice"])),
            "id": Feature(int64_list=Int64List(value=[123])),
            "emails": Feature(bytes_list=BytesList(value=[b"a@b.com", b"c@d.com"])),
        }))
# Both construction styles must produce equal messages. This used to be a bare
# comparison in the middle of the cell, whose result was silently discarded.
assert person_example == person_example_

# --- Write the Example twice so the file holds two records -----------------
with tf.io.TFRecordWriter("tfRecordFile/my_contacts.tfrecord") as f:
    f.write(person_example.SerializeToString())
    f.write(person_example.SerializeToString())

# The fixed-length features are parsed as regular tensors, but the
# variable-length features are parsed as sparse tensors. You can convert a
# sparse tensor to a dense tensor using tf.sparse.to_dense().
feature_description = {
    "name": tf.io.FixedLenFeature([], tf.string, default_value=""),
    "id": tf.io.FixedLenFeature([], tf.int64, default_value=0),
    "emails": tf.io.VarLenFeature(tf.string),
}

# --- Parse one serialized record at a time ---------------------------------
feature_record = tf.data.TFRecordDataset(["tfRecordFile/my_contacts.tfrecord"])
print(feature_record)
for serialized_example in feature_record:
    parsed_example = tf.io.parse_single_example(serialized_example, feature_description)
    print(parsed_example)
    print(tf.sparse.to_dense(parsed_example["emails"], default_value=b""))
    print(parsed_example["emails"].values)
print('\n')

# --- Parse a whole batch of serialized records in one call -----------------
feature_record_batch = tf.data.TFRecordDataset(["tfRecordFile/my_contacts.tfrecord"]).batch(10)
for serialized_examples in feature_record_batch:
    parsed_examples = tf.io.parse_example(serialized_examples, feature_description)
    print(parsed_examples)
    # Inspect the *batched* result here. The previous version printed
    # `parsed_example`, the stale variable left over from the per-record loop.
    print(tf.sparse.to_dense(parsed_examples["emails"], default_value=b""))
    print(parsed_examples["emails"].values)

<TFRecordDatasetV2 shapes: (), types: tf.string>
{'emails': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000023FAC9419A0>, 'id': <tf.Tensor: shape=(), dtype=int64, numpy=123>, 'name': <tf.Tensor: shape=(), dtype=string, numpy=b'Alice'>}
tf.Tensor([b'a@b.com' b'c@d.com'], shape=(2,), dtype=string)
tf.Tensor([b'a@b.com' b'c@d.com'], shape=(2,), dtype=string)
{'emails': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000023FAC962910>, 'id': <tf.Tensor: shape=(), dtype=int64, numpy=123>, 'name': <tf.Tensor: shape=(), dtype=string, numpy=b'Alice'>}
tf.Tensor([b'a@b.com' b'c@d.com'], shape=(2,), dtype=string)
tf.Tensor([b'a@b.com' b'c@d.com'], shape=(2,), dtype=string)


{'emails': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000023FAC962D00>, 'id': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([123, 123], dtype=int64)>, 'name': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'Alice', b'Alice'], dtype=object)>}
tf.Te

In [52]:
%%writefile tfRecordFile/tfProtoSeq.proto
// Message definitions for sequence data: a SequenceExample pairs a Features
// context with named lists of Feature messages.
// NOTE(review): this file has no `syntax` line and neither defines nor imports
// Feature / Features, so it looks illustrative rather than compilable on its
// own — confirm.
// Ordered list of Feature messages.
message FeatureList { repeated Feature feature = 1; };
// Map from list name to FeatureList.
message FeatureLists { map<string, FeatureList> feature_list = 1; };
message SequenceExample {
    // Features message stored once per SequenceExample.
    Features context = 1;
    // The named feature lists.
    FeatureLists feature_lists = 2;
};

Overwriting tfRecordFile/tfProtoSeq.proto


In [60]:
from tensorflow.train import FeatureList, FeatureLists, SequenceExample
# Context features of the SequenceExample, built one at a time and then
# gathered under their feature names.
author_id_feature = Feature(int64_list=Int64List(value=[123]))
title_feature = Feature(bytes_list=BytesList(value=[b"A", b"desert", b"place", b"."]))
pub_date_feature = Feature(int64_list=Int64List(value=[1623, 12, 25]))
context = Features(feature={
    "author_id": author_id_feature,
    "title": title_feature,
    "pub_date": pub_date_feature,
})

# Tokenized text: every inner list is one sentence (or comment) split into
# word and punctuation tokens. The tokens are separated by single spaces in
# the literals below, so str.split() recovers them exactly.
content = [
    "When shall we three meet again ?".split(),
    "In thunder , lightning , or in rain ?".split(),
]
comments = [
    "When the hurlyburly 's done .".split(),
    "When the battle 's lost and won .".split(),
]

def words_to_feature(words):
    """Wrap an iterable of strings in a bytes-list Feature.

    Each word is UTF-8 encoded, and the encoded values are stored in their
    original order in the BytesList of the returned Feature.
    """
    encoded_words = [token.encode("utf-8") for token in words]
    return Feature(bytes_list=BytesList(value=encoded_words))

# One Feature per sentence and one per comment.
content_features = list(map(words_to_feature, content))
comments_features = list(map(words_to_feature, comments))

# Assemble the SequenceExample: the context plus two named feature lists.
sequence_feature_lists = FeatureLists(feature_list={
    "content": FeatureList(feature=content_features),
    "comments": FeatureList(feature=comments_features),
})
sequence_example = SequenceExample(
    context=context,
    feature_lists=sequence_feature_lists,
)
print(sequence_example)

# Store the same SequenceExample twice so the file contains two records.
with tf.io.TFRecordWriter("tfRecordFile/my_seq.tfrecord") as f:
    f.write(sequence_example.SerializeToString())
    f.write(sequence_example.SerializeToString())


    
serialized_sequence_examples = tf.data.TFRecordDataset(["tfRecordFile/my_seq.tfrecord"])
print(serialized_sequence_examples)

# How to parse the context part: a scalar int64, a variable-length string
# feature, and a fixed-length int64 feature of shape [3].
context_feature_descriptions = dict(
    author_id=tf.io.FixedLenFeature([], tf.int64, default_value=0),
    title=tf.io.VarLenFeature(tf.string),
    pub_date=tf.io.FixedLenFeature([3], tf.int64, default_value=[0, 0, 0]),
)
# Both feature lists are parsed as variable-length string features.
sequence_feature_descriptions = {
    list_name: tf.io.VarLenFeature(tf.string)
    for list_name in ("content", "comments")
}

# If the feature lists contain sequences of varying sizes (as in the preceding
# example), you may want to convert them to ragged tensors using
# tf.RaggedTensor.from_sparse().
for serialized_sequence_example in serialized_sequence_examples:
    parsed_context, parsed_feature_lists = tf.io.parse_single_sequence_example(
        serialized_sequence_example,
        context_features=context_feature_descriptions,
        sequence_features=sequence_feature_descriptions,
    )
    print(parsed_context)
    print(parsed_feature_lists)
    print(tf.RaggedTensor.from_sparse(parsed_feature_lists["content"]))
print('\n')


# Same file again, but grouped into batches of up to 10 serialized records so
# that a whole batch is parsed with a single parse_sequence_example call.
serialized_sequence_examples_batch = (
    tf.data.TFRecordDataset(["tfRecordFile/my_seq.tfrecord"])
    .batch(10)
)
print(serialized_sequence_examples_batch)
for serialized_sequence_example_batch in serialized_sequence_examples_batch:
    print(serialized_sequence_example_batch)
    parsed_sequence_example_batch = tf.io.parse_sequence_example(
        serialized_sequence_example_batch,
        context_features=context_feature_descriptions,
        sequence_features=sequence_feature_descriptions,
    )
    # Print the parse result as returned, followed by its Python type.
    print(parsed_sequence_example_batch)
    print(type(parsed_sequence_example_batch))
    

context {
  feature {
    key: "author_id"
    value {
      int64_list {
        value: 123
      }
    }
  }
  feature {
    key: "pub_date"
    value {
      int64_list {
        value: 1623
        value: 12
        value: 25
      }
    }
  }
  feature {
    key: "title"
    value {
      bytes_list {
        value: "A"
        value: "desert"
        value: "place"
        value: "."
      }
    }
  }
}
feature_lists {
  feature_list {
    key: "comments"
    value {
      feature {
        bytes_list {
          value: "When"
          value: "the"
          value: "hurlyburly"
          value: "\'s"
          value: "done"
          value: "."
        }
      }
      feature {
        bytes_list {
          value: "When"
          value: "the"
          value: "battle"
          value: "\'s"
          value: "lost"
          value: "and"
          value: "won"
          value: "."
        }
      }
    }
  }
  feature_list {
    key: "content"
    value {
      feature {
      