In [2]:
from GeneralTools.misc_fun import FLAGS

FLAGS.TENSORFLOW_VERSION


'1.13.0'

In [5]:
import tensorflow as tf
"""
Session

For TensorFlow 1.x versions, the default process is to declare a graph and 
call a session to run it.
"""

a = tf.constant([3.0, 2.0], dtype=tf.float32)
b = a * 2.0
b1 = tf.multiply(a, 2.0, name='b')
# rerun this block, you will see the results change, why?
print('The value of b is {}'.format(b))
print('The shape of b is {}'.format(b.get_shape().as_list()))  # is b a row vector or column?
print("The name/operation of b is '{}'".format(b.name))
print("The name/operation of b is '{}'".format(b1.name))


The value of b is Tensor("mul_3:0", shape=(2,), dtype=float32)
The shape of b is [2]
The name/operation of b is 'mul_3:0'
The name/operation of b is 'b_3:0'


In [2]:
sess = tf.Session()
bv = sess.run(b)
print('The value of b is {}'.format(bv))
# remember to call sess.close() by the end of code to release the resources
sess.close()


The value of b is [6. 4.]


In [6]:
"""
Interactive session saves the effort of calling sess.run many times
Instead, we call node.eval() for any node if the graph
"""
sess = tf.InteractiveSession()
print('The value of b is {}'.format(b.eval()))
sess.close()


The value of b is [6. 4.]
The value of b is [6. 4.]


In [2]:
# To avoid calling sess.close()
with tf.Session() as sess:
    bv = sess.run(b)
    print('The value of b is {}'.format(bv))
    print('The value of b is {}'.format(b.eval()))

print('whatever')

The value of b is [6. 4.]
The value of b is [6. 4.]


In [1]:
"""
Starting from TensorFlow 2.0, the default process is eager execution.
That is, TF evaluates operations immediately without declaring it in the graph.
For TensorFlow 1.x versions, we can manually call eager model.
See the below link for more info:
https://www.tensorflow.org/guide/eager

Eager execution has to be called at the program startup.
Once called, all following operations are executed in eager mode.

It is generally believed that graph mode is comparable or more efficient than eager mode.
"""
import tensorflow as tf
tf.enable_eager_execution()

a = tf.constant([3.0, 2.0], dtype=tf.float32)
b = a * 2.0
# rerun this block, you will see the results change, why?
print('The value of b is {}'.format(b))
print('The shape of b is {}'.format(b.shape))  # instead of b.get_shape().as_list()
# print("The name/operation of b is '{}'".format(b.name)) --> 
# Tensor.name is meaningless when eager execution is enabled.


The value of b is [6. 4.]
The shape of b is (2,)


In [1]:
import tensorflow as tf
import numpy as np
"""
Data pipeline

It is highly suggested to use the tf.data API to handle large datasets
    tf.data.Dataset 
        reads data from one or several source files
        does pre-processing
    tf.data.Iterator
        defines an iterator that reads a batch of data at each run

If the dataset is small, say, 200 MB, 
it is OK to load the dataset into memory (placeholder would be better).
If the dataset is large, almost 1 GB to many GBs,
it is highly encouraged to store the dataset in disk.
"""
# example 1, small dataset, save to graph
graph = tf.Graph()
with graph.as_default():
    mnist = tf.keras.datasets.mnist
    (x_train, y_train),(x_test, y_test) = mnist.load_data()
    # for this small dataset, we can do pre-processing here.
    # but for illustration purpose, we handle it later
    # x_train, x_test = x_train / 255.0, x_test / 255.0
    print('x_train has shape {}, data type {} and range {}'.format(
        x_train.shape, x_train.dtype, 
        (np.amin(x_train), np.amax(x_train))))
    print('y_train has shape {} and data type {}'.format(
        y_train.shape, y_train.dtype))
    print('The shape of x_test is {}'.format(x_test.shape))
    print('The shape of x_test is {}'.format(x_test.shape))


x_train has shape (60000, 28, 28), data type uint8 and range (0, 255)
y_train has shape (60000,) and data type uint8
The shape of x_test is (10000, 28, 28)
The shape of x_test is (10000, 28, 28)


In [2]:
with graph.as_default():
    """
    All tensors in a dataset must have the same number of samples
    float32 is often used in TF
    Input to from_tensor_slices does not have to be a dictionary; 
    tensor, tuple, list are acceptable.
    """
    dataset_tr = tf.data.Dataset.from_tensor_slices(
        {"features": tf.constant(x_train, dtype=tf.float32),
         "labels": tf.constant(y_train, dtype=tf.int32)})
    # apply parser or pre-processing
    # if we have not scaled the data before, we can scale it here
    dataset_tr = dataset_tr.map(
        lambda d: (d['features'] / 255.0, d['labels']))
    print(dataset_tr.output_types)
    print(dataset_tr.output_shapes)


(tf.float32, tf.int32)
(TensorShape([Dimension(28), Dimension(28)]), TensorShape([]))


In [3]:
"""
Schedule an iteration procedure

When the number of samples in datasets cannot be divided by batch size,
we may consider skip some samples...
"""
skip_count = 0
shuffle_data = True
batch_size = 400
buffer_size = 10000
num_epoch = 1  # set this to None or -1 will repeat the dataset infinite times
with graph.as_default():
    if skip_count > 0:
        print('Number of {} instances skipped.'.format(skip_count))
        dataset_tr = dataset_tr.skip(skip_count)
    # shuffle
    if shuffle_data:
        dataset_tr = dataset_tr.shuffle(buffer_size)
    # make batch
    dataset_tr = dataset_tr.batch(batch_size)
    # repeat datasets for num_epoch
    dataset_tr = dataset_tr.repeat(num_epoch)
    
    """
    Define an iterator that reads an element (here, a batch) each time
    
    Several iterators are available, see this link below for more details:
    https://www.tensorflow.org/guide/datasets
    """
    iterator = dataset_tr.make_one_shot_iterator()
    # read a batch
    x_batch, y_batch = iterator.get_next()
    # This is the end of input pipeline
    print(x_batch)


Tensor("IteratorGetNext:0", shape=(?, 28, 28), dtype=float32)


In [7]:
# call a session to actually read the data
with graph.as_default():
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        xv, yv = sess.run([x_batch, y_batch])
        
        print('Shape of x_batch is {}'.format(xv.shape))
        print('Shape of y_batch is {}'.format(yv.shape))


Shape of x_batch is (400, 28, 28)
Shape of y_batch is (400,)


In [8]:
import os.path, os, warnings, imageio
from GeneralTools.misc_fun import FLAGS
from tensorflow.contrib.tensorboard.plugins import projector
import numpy as np
"""
Let's visualize the data!

A tutorial can be found at:
https://www.easy-tensorflow.com/tf-tutorials/tensorboard/tb-embedding-visualization
"""
# prepare folder
folder = 'mnist'
subfolder = 'embedding_image_raw'
filename = 'mnist'
summary_folder = os.path.join(FLAGS.DEFAULT_OUT, folder, subfolder)
if not os.path.exists(summary_folder):
    os.makedirs(summary_folder)
embedding_path = os.path.join(summary_folder, filename + '_embedding.ckpt')
sprite_path = os.path.join(summary_folder, filename + '.png')
label_path = os.path.join(summary_folder, filename + '_label.tsv')

# prepare data, sprite images and files
embedding_data = np.reshape(xv, (batch_size, -1))
images = xv  # shape [400, 28, 28]
image_size = xv.shape[1:]  # [28, 28]
labels = yv

# write label to file
if os.path.isfile(label_path):
    warnings.warn(
        'Label file {} already exists, thus this step is ignored.'.format(label_path))
else:
    metadata_file = open(label_path, 'w')
    metadata_file.write('Name\tClass\n')
    for index, label in enumerate(labels):
            metadata_file.write('%06d\t%s\n' % (index, str(label)))
    metadata_file.close()

# write images to sprite
if os.path.isfile(sprite_path):
    warnings.warn(
        'Sprite file {} already exists, thus this step is ignored.'.format(sprite_path))
else:
    # extend image shapes to [batch size, height, width, 3]
    if len(images.shape) == 3:  # if dimension of image is 3, extend it to 4
        images = np.tile(images[..., np.newaxis], (1, 1, 1, 3))
        print('Shape of images has been changed to {}'.format(images.shape))
    if images.shape[3] == 1:  # if last dimension is 1, extend it to 3
        images = np.tile(images, (1, 1, 1, 3))
        print('Shape of images has been changed to {}'.format(images.shape))
        
    # scale image to range [0,1]
    # we have done this step in pre-processing so no worries
    
    # invert images for mnist
    images = 1 - images
    
    # Tile the individual thumbnails into an image
    mesh_num = (20, 20)
    new_shape = mesh_num + images.shape[1:]  # (20, 20, 28, 28, 3)
    images = images.reshape(new_shape).transpose((0, 2, 1, 3, 4))
    print('Shape of images has been changed to {}'.format(images.shape))
    images = images.reshape(
        (mesh_num[0] * images.shape[1], mesh_num[1] * images.shape[3]) + images.shape[4:])
    print('Shape of images has been changed to {}'.format(images.shape))
    images = (images * 255).astype(np.uint8)
    # save images to file
    imageio.imwrite(sprite_path, images)

# write data to ckpt
if os.path.isfile(embedding_path):
    warnings.warn(
        'Embedding file {} already exists, thus this step is ignored.'.format(embedding_path))
else:
    # register a session
    sess = tf.Session()
    # prepare a embedding variable
    # note this must be a variable, not a tensor/constant
    embedding_var = tf.Variable(embedding_data, name='em_data')
    sess.run(embedding_var.initializer)
    # configure the embedding projector
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = embedding_var.name
    # add metadata (label) to embedding
    if label_path is not None:
        embedding.metadata_path = label_path
    # add sprite image to embedding
    if sprite_path is not None:
        embedding.sprite.image_path = sprite_path
        embedding.sprite.single_image_dim.extend(image_size)
    # finalize embedding setting
    embedding_writer = tf.summary.FileWriter(summary_folder)
    projector.visualize_embeddings(embedding_writer, config)
    embedding_saver = tf.train.Saver([embedding_var], max_to_keep=1)
    embedding_saver.save(sess, embedding_path)
    # close all
    sess.close()




In [18]:
import tensorflow as tf
import numpy as np
"""
Data pipeline

It is highly suggested to use the tf.data API to handle large datasets
    tf.data.Dataset 
        reads data from one or several source files
        does pre-processing
    tf.data.Iterator
        defines an iterator that reads a batch of data at each run

If the dataset is small, say, 200 MB, 
it is OK to load the dataset into memory (placeholder would be better).
If the dataset is large, almost 1 GB to many GBs,
it is highly encouraged to store the dataset in disk.
"""
# example 2, small dataset, placeholder
graph = tf.Graph()
with graph.as_default():
    mnist = tf.keras.datasets.mnist
    (x_train, y_train),(x_test, y_test) = mnist.load_data()
    # for this small dataset, we can do pre-processing here.
    # but for illustration purpose, we handle it later
    # x_train, x_test = x_train / 255.0, x_test / 255.0
    print('x_train has shape {}, data type {} and range {}'.format(
        x_train.shape, x_train.dtype, 
        (np.amin(x_train), np.amax(x_train))))
    print('y_train has shape {} and data type {}'.format(
        y_train.shape, y_train.dtype))
    print('The shape of x_test is {}'.format(x_test.shape))
    print('The shape of x_test is {}'.format(x_test.shape))
    
    """
    All tensors in a dataset must have the same number of samples
    float32 is often used in TF
    Input to from_tensor_slices does not have to be a dictionary; 
    tensor, tuple, list are acceptable.
    """
    features_placeholder = tf.placeholder(tf.float32, x_train.shape)
    labels_placeholder = tf.placeholder(tf.int32, y_train.shape)
    dataset_tr = tf.data.Dataset.from_tensor_slices(
        {"features": features_placeholder,
         "labels": labels_placeholder})
    # apply parser or pre-processing
    # if we have not scaled the data before, we can scale it here
    dataset_tr = dataset_tr.map(
        lambda d: (d['features'] / 255.0, d['labels']))
    print(dataset_tr.output_types)
    print(dataset_tr.output_shapes)
    
    # pre-process the data
    skip_count = 0
    shuffle_data = True
    batch_size = 400
    buffer_size = 10000
    num_epoch = 1  # set this to None or -1 will repeat the dataset infinite times
    if skip_count > 0:
        print('Number of {} instances skipped.'.format(skip_count))
        dataset_tr = dataset_tr.skip(skip_count)
    # shuffle
    if shuffle_data:
        dataset_tr = dataset_tr.shuffle(buffer_size)
    # make batch
    dataset_tr = dataset_tr.batch(batch_size)
    # repeat datasets for num_epoch
    dataset_tr = dataset_tr.repeat(num_epoch)
    
    """
    Define an iterator that reads an element (here, a batch) each time
    
    Several iterators are available, see this link below for more details:
    https://www.tensorflow.org/guide/datasets
    """
    iterator = dataset_tr.make_initializable_iterator()
    # read a batch
    x_batch, y_batch = iterator.get_next()
    # This is the end of input pipeline
    print(x_batch)
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # must run initializer for iterator first that feeds in dataset
        sess.run(
            iterator.initializer, 
            feed_dict={features_placeholder: x_train, labels_placeholder: y_train})
        xv, yv = sess.run([x_batch, y_batch])
        
        print('Shape of x_batch is {}'.format(xv.shape))
        print('Shape of y_batch is {}'.format(yv.shape))


