In [5]:
import tensorflow as tf
# Slice a 1-D tensor holding 0..9 into a dataset of ten scalar elements.
x = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(x)

# Iterating the dataset yields one tensor per element.
for element in dataset:
    print(element)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [4]:
# Chain three passes over the data, then group consecutive elements in sevens;
# the final batch holds whatever is left over.
repeated = dataset.repeat(3)
dataset = repeated.batch(7)
for batch in dataset:
    print(batch)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [6]:
# Double every value in the dataset.
dataset = dataset.map(lambda value: value * 2)

In [None]:
# Flatten the batches back into individual elements. Dataset.unbatch() replaces
# the deprecated tf.data.experimental.unbatch() transformation.
dataset = dataset.unbatch()

In [None]:
# Keep only the elements that are strictly below 10.
dataset = dataset.filter(lambda value: value < 10)

In [8]:
# Peek at the first three elements only.
first_three = dataset.take(3)
for element in first_three:
    print(element)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


In [None]:
# The shuffle buffer must be big enough to give effective shuffling, yet small
# enough not to cause memory issues.
dataset = tf.data.Dataset.range(10).repeat(3)
shuffled = dataset.shuffle(buffer_size=5, seed=42)
dataset = shuffled.batch(7)
for batch in dataset:
    print(batch)

In [None]:
# Interleave lines from multiple files; the files should be of equal or similar length.
train_filepaths = ['file1.csv', 'file2.csv', 'file3.csv']
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

# interleave() calls the lambda once per file path and reads from up to
# n_readers of the resulting line datasets at a time, mixing their lines together.
n_readers = 5
dataset = filepath_dataset.interleave(
    # skip(1) drops the first line of each file (assumed to be a header row)
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers,
    num_parallel_calls=5,
)

In [None]:
# scaling your data before training

n_inputs = 5
x_mean = 2.0
x_std = 1.0

def preprocess(line):
    """Parse one CSV line into a standardized feature vector and its target.

    Args:
        line: a string tensor holding one CSV record with ``n_inputs`` feature
            columns followed by one target column.

    Returns:
        A ``(x, y)`` pair: ``x`` is the stacked feature columns shifted by
        ``x_mean`` and divided by ``x_std``; ``y`` is the last column stacked
        into a one-element tensor.
    """
    # Feature columns default to 0.0 when missing; the empty float32 default on
    # the last column marks the target as required.
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    return (x - x_mean) / x_std, y

In [10]:
def csv_reader_dataset(filepaths,
                       repeat=1,
                       n_readers=5,
                       n_read_threads=None,
                       shuffle_buffer_size=10000,
                       n_parse_threads=5,
                       batch_size=32):
    """Build a shuffled, batched, prefetching dataset from CSV files.

    Args:
        filepaths: file paths or pattern(s) passed to ``Dataset.list_files``.
        repeat: number of times the shuffled data is repeated.
        n_readers: how many files are interleaved at a time (``cycle_length``).
        n_read_threads: ``num_parallel_calls`` used while reading files.
        shuffle_buffer_size: size of the shuffle buffer.
        n_parse_threads: ``num_parallel_calls`` used for ``preprocess``.
        batch_size: number of parsed records per batch.

    Returns:
        A dataset of batches produced by ``preprocess``, with one batch
        prefetched so the next batch is being prepared while the current one
        is consumed. ``.cache()`` can be added when the data is reused many times.
    """
    def read_lines(filepath):
        # one line dataset per file, without its first line
        return tf.data.TextLineDataset(filepath).skip(1)

    file_ds = tf.data.Dataset.list_files(filepaths)
    line_ds = file_ds.interleave(
        read_lines,
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads,
    )
    parsed_ds = line_ds.map(preprocess, num_parallel_calls=n_parse_threads)
    shuffled_ds = parsed_ds.shuffle(shuffle_buffer_size)
    repeated_ds = shuffled_ds.repeat(repeat)
    return repeated_ds.batch(batch_size).prefetch(1)

In [None]:
# Build the three splits with the shared CSV input pipeline.
# NOTE(review): valid_filepaths and test_filepaths are not defined in the
# visible cells — confirm they exist before running.
train_set = csv_reader_dataset(train_filepaths)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

# [...] is a placeholder for the real list of layers.
model = keras.models.Sequential([...])
# Drop the labels and keep the first 3 elements of test_set; these are batches,
# not single instances, because csv_reader_dataset batches its output.
new_set = test_set.take(3).map(lambda x, y: x) # pretending these are new, unlabeled data
model.predict(new_set)

In [None]:
@tf.function
def train(model, optimizer, loss_fn, n_epochs, train_filepaths):
    """Run a custom training loop over the CSV files for ``n_epochs`` passes.

    The dataset comes from ``csv_reader_dataset`` and is repeated ``n_epochs``
    times, so a single loop over it covers every epoch. Nothing is returned;
    the optimizer applies the gradients to the model's trainable variables.
    """
    batches = csv_reader_dataset(train_filepaths, repeat=n_epochs)
    for features, targets in batches:
        with tf.GradientTape() as tape:
            predictions = model(features)
            data_loss = tf.reduce_mean(loss_fn(targets, predictions))
            # add the model's own extra losses (model.losses) to the main loss
            total_loss = tf.add_n([data_loss] + model.losses)
        weights = model.trainable_variables
        gradients = tape.gradient(total_loss, weights)
        optimizer.apply_gradients(zip(gradients, weights))
    

In [None]:
# Write two raw byte strings to a TFRecord, a binary file format.
records = [b"This is the first record", b"And this is the second record"]
with tf.io.TFRecordWriter("my_data.tfrecord") as writer:
    for record in records:
        writer.write(record)

# Read the records back from the TFRecord file and print them.
filepaths = ["my_data.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths)
for serialized in dataset:
    print(serialized)

In [None]:
# Compress TFRecords when you need to send them over a network.

# TFRecordOptions carries the writer settings; the compression type chosen here
# must also be given to the reader below.
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter("my_compressed.tfrecord", options) as f:
    f.write(b"This is the first compressed record")
    f.write(b"And this is the second compressed record")

# Reading from the compressed file.

dataset = tf.data.TFRecordDataset(["my_compressed.tfrecord"], compression_type="GZIP")

In [None]:
# Protocol buffers (protobufs) are a serialization format developed at Google.
# NOTE(review): person_pb2 is presumably the module generated by protoc from a
# Person message definition — confirm it is on the import path.

from person_pb2 import Person

person = Person(name="Al", id=123, email=["a@b.com"])
print(person)
s = person.SerializeToString()  # serialize the message so it can be stored or sent

In [None]:
# TensorFlow ships its own protobufs for describing training examples.

# Alias the classes through tf.train rather than `from tensorflow.train import ...`:
# that import form fails on some TensorFlow 2.x releases, while attribute access
# on the already imported `tf` works.
BytesList, FloatList, Int64List = tf.train.BytesList, tf.train.FloatList, tf.train.Int64List
Feature, Features, Example = tf.train.Feature, tf.train.Features, tf.train.Example

# One Example holds a map from feature name to a list of bytes, floats or int64s.
person_example = Example(
    features=Features(
        feature={
            "name": Feature(bytes_list=BytesList(value=[b"Alice"])),
            "id": Feature(int64_list=Int64List(value=[123])),
            "emails": Feature(bytes_list=BytesList(value=[b"a@b.com", b"c@d.com"]))
        }
    )
)

# Now serialize it and write it to a TFRecord file.

with tf.io.TFRecordWriter("my_contacts.tfrecord") as f:
    f.write(person_example.SerializeToString())

In [None]:
# Describe how each feature stored in the serialized examples should be parsed.
feature_description = {
    "name": tf.io.FixedLenFeature([], tf.string, default_value=""),
    "id": tf.io.FixedLenFeature([], tf.int64, default_value=0),
    "emails": tf.io.VarLenFeature(tf.string),
}

contacts = tf.data.TFRecordDataset(["my_contacts.tfrecord"])
for record in contacts:
    parsed_example = tf.io.parse_single_example(record, feature_description)

# The fixed-length features (name, id) are parsed as regular tensors, while the
# variable-length one (emails) is parsed as a sparse tensor, which can be
# converted back to a regular (dense) tensor.
emails_sparse = parsed_example["emails"]
tf.sparse.to_dense(emails_sparse, default_value=b"")

In [None]:
# Parsing also works on a whole batch of serialized examples at once.

contacts = tf.data.TFRecordDataset(["my_contacts.tfrecord"])
dataset = contacts.batch(10)
for serialized_batch in dataset:
    parsed_examples = tf.io.parse_example(serialized_batch, feature_description)

In [None]:
# Lists of lists are handled by the SequenceExample protobuf, defined as:

# message FeatureList { repeated Feature feature = 1; }
# message FeatureLists { map<string, FeatureList> feature_list = 1; }
# message SequenceExample {
#     Features context = 1;
#     FeatureLists feature_lists = 2;  // lists of feature lists
# }

# NOTE(review): serialized_sequence_example and the two description dicts are
# not defined in the visible cells — confirm they exist before running.
parsed_context, parsed_feature_lists = tf.io.parse_single_sequence_example(
    serialized_sequence_example, context_feature_descriptions, sequence_feature_descriptions
)
# The "content" feature list is turned from a sparse tensor into a ragged tensor.
parsed_content = tf.RaggedTensor.from_sparse(parsed_feature_lists["content"])

In [None]:
# Standardize inside the model with a Lambda layer, using the mean and standard
# deviation of x_train computed along axis 0.
eps = keras.backend.epsilon()  # small constant added to the denominator
means = np.mean(x_train, axis=0, keepdims=True)
stds = np.std(x_train, axis=0, keepdims=True)
standardize = keras.layers.Lambda(lambda inputs: (inputs - means) / (stds + eps))
model = keras.models.Sequential([standardize])

#better way is to create a standardization layer

class Standardization(keras.layers.Layer):
    """Layer that standardizes its inputs with statistics stored by ``adapt``."""

    def adapt(self, data_sample):
        """Store the mean and standard deviation of ``data_sample`` along axis 0."""
        sample_mean = np.mean(data_sample, axis=0, keepdims=True)
        sample_std = np.std(data_sample, axis=0, keepdims=True)
        self.means_, self.stds_ = sample_mean, sample_std

    def call(self, inputs):
        """Return ``inputs`` centered and scaled; ``adapt`` must be called first."""
        centered = inputs - self.means_
        scale = self.stds_ + keras.backend.epsilon()
        return centered / scale
    
# Adapt the layer to a data sample before using it.

std_layer = Standardization()
std_layer.adapt(data_sample)

# Then make it the first layer of the model.

model = keras.Sequential([std_layer])

In [None]:
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]

# Out-of-vocabulary buckets: a key that is not found in the table is hashed
# into one of these extra buckets.
num_oov_buckets = 2

# Map each vocabulary entry to its position in the list.
indices = tf.range(len(vocab), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)

# Lookup table for our vocabulary.
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)

In [None]:
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
cat_indices = table.lookup(categories)
# One slot per vocabulary entry plus one per out-of-vocabulary bucket.
one_hot_depth = len(vocab) + num_oov_buckets
cat_one_hot = tf.one_hot(cat_indices, depth=one_hot_depth)


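# A Keras preprocessing layer can do this kind of string-to-index lookup for you.
# The experimental TextVectorization layer (signature below) is one option, but
# with the defaults shown it lowercases, strips punctuation and splits on
# whitespace, so a category such as "NEAR BAY" would become two tokens.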
# tf.keras.layers.experimental.preprocessing.TextVectorization(
#     max_tokens=None, standardize=LOWER_AND_STRIP_PUNCTUATION,
#     split=SPLIT_ON_WHITESPACE, ngrams=None, output_mode=INT,
#     output_sequence_length=None, pad_to_max_tokens=True, **kwargs
# )


In [None]:
# Represent each category as a trainable vector (an embedding).

embedding_dim = 2
# One row per vocabulary entry and per out-of-vocabulary bucket.
n_rows = len(vocab) + num_oov_buckets
embed_init = tf.random.uniform([n_rows, embedding_dim])
embedding_matrix = tf.Variable(embed_init)

categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
cat_indices = table.lookup(categories)
# Pick the embedding row that belongs to each category index.
tf.nn.embedding_lookup(embedding_matrix, cat_indices)

In [None]:
# Putting it together: a model with numeric inputs plus one embedded categorical input.

regular_inputs = keras.layers.Input(shape=[8])
categories = keras.layers.Input(shape=[], dtype=tf.string)
cat_indices = keras.layers.Lambda(lambda cats: table.lookup(cats))(categories)
# The table can return len(vocab) + num_oov_buckets distinct indices, so the
# embedding needs that many rows; a smaller input_dim leaves the last
# out-of-vocabulary index without a row.
cat_embed = keras.layers.Embedding(
    input_dim=len(vocab) + num_oov_buckets, output_dim=2
)(cat_indices)
encoded_inputs = keras.layers.concatenate([regular_inputs, cat_embed])
outputs = keras.layers.Dense(1)(encoded_inputs)
# A functional Model needs both its inputs and its outputs.
model = keras.models.Model(inputs=[regular_inputs, categories], outputs=[outputs])

In [None]:
# Normalization followed by discretization, chained into one adaptable pipeline.
# NOTE(review): keras.layers.PreprocessingStage and a no-argument
# Discretization() may not exist under these names in the installed Keras
# version — confirm against its API reference.
# NOTE(review): data_sample is not defined in the visible cells — confirm.

normalization = keras.layers.Normalization()
discretization = keras.layers.Discretization()
pipeline = keras.layers.PreprocessingStage([normalization, discretization])
pipeline.adapt(data_sample)

In [None]:
# transform lets you write a pre defined pre processing layer and ship it wherever your model goes as part of it
# creates a tf function for you that you can incorporate into your model
import tensorflow_transform as tft

def preprocess(inputs):
    """Standardize the median age and encode ocean proximity with tf.Transform.

    Reads ``inputs["housing_median_age"]`` and ``inputs["ocean_proximity"]``
    and returns a dict with the z-scored age and the vocabulary id of the
    proximity value.

    NOTE(review): this definition shadows the CSV ``preprocess`` defined in an
    earlier cell, and the output key is spelled "standardized_medium_age"
    (possibly meant to be "median") — confirm both are intended.
    """
    age, proximity = inputs["housing_median_age"], inputs["ocean_proximity"]
    return {
        "standardized_medium_age": tft.scale_to_z_score(age),
        "ocean_proximity_id": tft.compute_and_apply_vocabulary(proximity),
    }

In [None]:
# used to download common datasets

import tensorflow_datasets as tfds

# By default every element of the loaded dataset is a dict of features.
dataset = tfds.load(name="mnist")
mnist_train, mnist_test = dataset["train"], dataset["test"]
mnist_train = mnist_train.shuffle(10000).batch(32).prefetch(1)
for item in mnist_train:
    images = item["image"]
    labels = item["label"]
    # and so on...

# Keras expects (features, labels) tuples rather than dicts. Start again from
# the raw training split: re-shuffling and re-batching the pipeline built above
# would batch the batches a second time.
mnist_train = dataset["train"].shuffle(10000).batch(32)
mnist_train = mnist_train.map(lambda items: (items["image"], items["label"]))
mnist_train = mnist_train.prefetch(1)

# Simplest: let tfds batch the data and return (image, label) tuples itself.
dataset = tfds.load(name="mnist", batch_size=32, as_supervised=True)
mnist_train = dataset["train"].prefetch(1)
model = keras.models.Sequential([...])  # [...] is a placeholder for the layers
model.fit(mnist_train, epochs=5)