In [1]:
import os
os.chdir('..')
os.getcwd()

'/home/lucaskawazoi/development/professional/lkk-gcloud-tpus/experiment_b'

## Import libraries

In [2]:
# -------------------------------
# Import libraries
# -------------------------------
# from scripts.utils import read_label, read_image, get_training_dataset, get_validation_dataset
import os
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import callbacks, Sequential
from tensorflow.keras.layers import Reshape, Conv2D, MaxPooling2D, BatchNormalization, Dropout, Activation, Dense, Flatten
print('Tensorflow version:', tf.__version__)

default_tpu_name = os.getenv('TPU_NAME')

Tensorflow version: 1.15.0


## Detect hardware

In [3]:
# -------------------------------
# Detect hardware
# -------------------------------
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # TPU detection
except:
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver(default_tpu_name) # TPU detection
    except ValueError:
        tpu = None
        gpus = tf.config.experimental.list_logical_devices("GPU")

# Select appropriate distribution strategy
if tpu:
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Going back and forth between TPU and host is expensive. Better to run 128 batches on the TPU before reporting back.
    strategy = tf.distribute.experimental.TPUStrategy(tpu, steps_per_run=128)
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
elif len(gpus) > 1:
    strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus])
    print('Running on multiple GPUs ', [gpu.name for gpu in gpus])
elif len(gpus) == 1:
    # default strategy that works on CPU and single GPU
    strategy = tf.distribute.get_strategy()
    print('Running on single GPU ', gpus[0].name)
else:
    # default strategy that works on CPU and single GPU
    strategy = tf.distribute.get_strategy()
    print('Running on CPU')
print("Number of accelerators: ", strategy.num_replicas_in_sync)

Running on CPU
Number of accelerators:  1


## Set variables

In [4]:
# -------------------------------
# Set variables
# -------------------------------
BATCH_SIZE = 64 * strategy.num_replicas_in_sync
LEARNING_RATE = 0.01
LEARNING_RATE_EXP_DECAY = 0.6 if strategy.num_replicas_in_sync == 1 else 0.7

## Input function to read TFRecordDataset

In [6]:
# FILEPATHS
filenames = !gsutil ls gs://lkk-experiment_b/data/cifar-10-batches-py | grep tfrecords
train_filenames = filenames[:5]
test_filenames = filenames[5]

In [18]:
def imgs_input_fn(filenames, perform_shuffle=False, repeat_count=1, batch_size=1):
    def _parse_function(serialized):
        features = \
        {
            'height': tf.io.FixedLenFeature([], tf.int64),
            'width': tf.io.FixedLenFeature([], tf.int64),
            'depth': tf.io.FixedLenFeature([], tf.int64),
            'label': tf.io.FixedLenFeature([], tf.int64),
            'image_raw': tf.io.FixedLenFeature([], tf.string),
        }
        # Parse the serialized data so we get a dict with our data.
        parsed_example = tf.parse_single_example(serialized=serialized,
                                             features=features)
        # Get the image as raw bytes.
        image_shape = [32, 32, 3]
        image_raw = parsed_example['image_raw']
        label = tf.cast(parsed_example['label'], tf.float32)
        
        # Decode the raw bytes so it becomes a tensor with type uint8 and then cast to float32
        # Currently, only the tf.float32, tf.int32, tf.bfloat16, and tf.bool data types are supported on the TPU.
        image = tf.decode_raw(image_raw, tf.uint8)
        image = tf.cast(image, tf.float32)
        image = tf.reshape(image, image_shape)       
        image = tf.subtract(image, 116.779) # Zero-center by mean pixel
        image = tf.reverse(image, axis=[2]) # 'RGB'->'BGR'
        d = dict(zip(['image_input'],[image])), [label]
        return d
    
    dataset = tf.data.TFRecordDataset(filenames=filenames)
    # Parse the serialized data in the TFRecords files.
    # This returns TensorFlow tensors for the image and labels.
    dataset = dataset.map(_parse_function)
    if perform_shuffle:
        # Randomizes input using a window of 256 elements (read into memory)
        dataset = dataset.shuffle(buffer_size=256)
    dataset = dataset.repeat(repeat_count)  # Repeats dataset this # times
    dataset = dataset.batch(batch_size)  # Batch size to use
    iterator = dataset.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()
    return batch_features, batch_labels

imgs_input_fn(train_filenames)

({'image_input': <tf.Tensor 'IteratorGetNext_2:0' shape=(?, 32, 32, 3) dtype=float32>},
 <tf.Tensor 'IteratorGetNext_2:1' shape=(?, 1) dtype=float32>)

## Start tf.Session and next batch to check the data

Obs.: The steps below change the pixel type, value and order, so in order to see the images, we need to change them back
    - image = tf.cast(image, tf.float32)
    - image = tf.subtract(image, 116.779) # Zero-center by mean pixel
    - image = tf.reverse(image, axis=[2]) # 'RGB'->'BGR'

In [19]:
# Check data
next_batch = imgs_input_fn(test_filenames, perform_shuffle=True, batch_size=20)
with tf.Session() as sess:
    first_batch = sess.run(next_batch)

In [21]:
print('Batch type (should be a tuple):', type(first_batch))
print(first_batch[0].keys())
print('Shape of images batch (n, height, width, channels):', first_batch[0]['image_input'].shape)
print('Shape of labels batch (n, 1)', first_batch[1].shape)

Batch type (should be a tuple): <class 'tuple'>
dict_keys(['image_input'])
Shape of images batch (n, height, width, channels): (20, 32, 32, 3)
Shape of labels batch (n, 1) (20, 1)


## Create a model

In [5]:
# -------------------------------
# Make model
# -------------------------------
def make_model():
    model = Sequential([
        Reshape(input_shape=(32*32*3,), target_shape=(32, 32, 3), name='image'),

        Conv2D(filters=12, kernel_size=3, padding='same', use_bias=False),
        BatchNormalization(scale=False, center=True),
        Activation('relu'),

        Conv2D(filters=24, kernel_size=6, padding='same',
               use_bias=False, strides=2),
        BatchNormalization(scale=False, center=True),
        Activation('relu'),

        Conv2D(filters=36, kernel_size=6, padding='same',
               use_bias=False, strides=2),
        BatchNormalization(scale=False, center=True),
        Activation('relu'),

        Flatten(),
        Dense(2000, use_bias=False),
        BatchNormalization(scale=False, center=True),
        Activation('relu'),
        Dropout(0.4),
        Dense(10, activation='softmax')
    ])

    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model

with strategy.scope():
    model = make_model()

model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
image (Reshape)              (None, 32, 32, 3)         0         
_________________________________________________________________
conv2d (Conv2D)              (None, 32, 32, 12)        324       
_________________________________________________________________
batch_normalization (BatchNo (None, 32, 32, 12)        36        
_________________________________________________________________
activation (Activation)      (None, 32, 32, 12)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 16, 16, 24)        10368     
_________________________________________________________________
batch_normalization_1 (Batch (None, 16, 16, 24)        72        
_________________________________________

## Create estimator

In [24]:
from pathlib import Path

model_dir = Path() / 'models'
model_dir.mkdir(exist_ok=True)
estimator = tf.keras.estimator.model_to_estimator(keras_model=model,
                                                    model_dir=model_dir)
estimator

INFO:tensorflow:Using default config.
INFO:tensorflow:Using the Keras model provided.
INFO:tensorflow:Using config: {'_model_dir': 'models', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f99281479e8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


<tensorflow_estimator.python.estimator.estimator.Estimator at 0x7f99282aebe0>

In [25]:
train_spec = tf.estimator.TrainSpec(input_fn=lambda: imgs_input_fn(train_filenames,
                                                                   perform_shuffle=True,
                                                                   repeat_count=5,
                                                                   batch_size=20), 
                                    max_steps=500)

eval_spec = tf.estimator.EvalSpec(input_fn=lambda: imgs_input_fn(test_filenames,
                                                                 perform_shuffle=False,
                                                                 batch_size=1))

In [26]:
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

TypeError: unsupported callable

In [220]:
# -------------------------------
# Learning rate decay
# -------------------------------
lr_decay = callbacks.LearningRateScheduler(
    lambda epoch: LEARNING_RATE * LEARNING_RATE_EXP_DECAY**epoch,
    verbose=True)

In [221]:
# -------------------------------
# MODEL.FIT
# -------------------------------
EPOCHS = 10
steps_per_epoch = 60000 // BATCH_SIZE
history = model.fit(imgs_input_fn,
                    steps_per_epoch=steps_per_epoch,
                    epochs=EPOCHS,
                    callbacks=[lr_decay]
                    )

AttributeError: 'function' object has no attribute 'shape'

In [50]:
final_stats = model.evaluate(validation_dataset, steps=1)
print('Validation accuracy: ',final_stats[1])



ValueError: No data provided for "image_input". Need data for each key in: ['image_input']

In [1]:
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

Tensorflow version 1.15.0


In [2]:
# Detect hardware
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # TPU detection
except ValueError:
  tpu = None
  gpus = tf.config.experimental.list_logical_devices("GPU")
    
# Select appropriate distribution strategy
if tpu:
  tf.tpu.experimental.initialize_tpu_system(tpu)
  strategy = tf.distribute.experimental.TPUStrategy(tpu, steps_per_run=128) # Going back and forth between TPU and host is expensive. Better to run 128 batches on the TPU before reporting back.
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])  
elif len(gpus) > 1:
  strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus])
  print('Running on multiple GPUs ', [gpu.name for gpu in gpus])
elif len(gpus) == 1:
  strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU
  print('Running on single GPU ', gpus[0].name)
else:
  strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU
  print('Running on CPU')
print("Number of accelerators: ", strategy.num_replicas_in_sync)

Running on CPU
Number of accelerators:  1


In [12]:
BATCH_SIZE = 64 * strategy.num_replicas_in_sync # Gobal batch size.
# The global batch size will be automatically sharded across all
# replicas by the tf.data.Dataset API. A single TPU has 8 cores.
# The best practice is to scale the batch size by the number of
# replicas (cores). The learning rate should be increased as well.

LEARNING_RATE = 0.01
LEARNING_RATE_EXP_DECAY = 0.6 if strategy.num_replicas_in_sync == 1 else 0.7
# Learning rate computed later as LEARNING_RATE * LEARNING_RATE_EXP_DECAY**epoch
# 0.7 decay instead of 0.6 means a slower decay, i.e. a faster learnign rate.

In [13]:
tfrecords = !gsutil ls gs://lkk-experiment_b/data/cifar-10-batches-py | grep tfrecords
tfrecords

['gs://lkk-experiment_b/data/cifar-10-batches-py/data_batch_1.tfrecords',
 'gs://lkk-experiment_b/data/cifar-10-batches-py/data_batch_2.tfrecords',
 'gs://lkk-experiment_b/data/cifar-10-batches-py/data_batch_3.tfrecords',
 'gs://lkk-experiment_b/data/cifar-10-batches-py/data_batch_4.tfrecords',
 'gs://lkk-experiment_b/data/cifar-10-batches-py/data_batch_5.tfrecords',
 'gs://lkk-experiment_b/data/cifar-10-batches-py/test_batch.tfrecords']

In [None]:
# raw_image_dataset = tf.data.TFRecordDataset('images.tfrecords')

# # Create a dictionary describing the features.
# image_feature_description = {
#     'height': tf.io.FixedLenFeature([], tf.int64),
#     'width': tf.io.FixedLenFeature([], tf.int64),
#     'depth': tf.io.FixedLenFeature([], tf.int64),
#     'label': tf.io.FixedLenFeature([], tf.int64),
#     'image_raw': tf.io.FixedLenFeature([], tf.string),
# }

# def _parse_image_function(example_proto):
#   # Parse the input tf.Example proto using the dictionary above.
#   return tf.io.parse_single_example(example_proto, image_feature_description)

# parsed_image_dataset = raw_image_dataset.map(_parse_image_function)
# parsed_image_dataset

In [17]:
training_dataset = tf.data.TFRecordDataset(tfrecords[0])
training_dataset

<TFRecordDatasetV1 shapes: (), types: tf.string>

In [4]:
def read_label(tf_bytestring):
    label = tf.io.decode_raw(tf_bytestring, tf.uint8)
    label = tf.reshape(label, [])
    label = tf.one_hot(label, 10)
    return label
  
def read_image(tf_bytestring):
    image = tf.io.decode_raw(tf_bytestring, tf.uint8)
    image = tf.cast(image, tf.float32)/255.0
    image = tf.reshape(image, [28*28])
    return image
  
def load_dataset(image_file, label_file):
    imagedataset = tf.data.FixedLengthRecordDataset(image_file, 28*28, header_bytes=16)
    imagedataset = imagedataset.map(read_image, num_parallel_calls=16)
    labelsdataset = tf.data.FixedLengthRecordDataset(label_file, 1, header_bytes=8)
    labelsdataset = labelsdataset.map(read_label, num_parallel_calls=16)
    dataset = tf.data.Dataset.zip((imagedataset, labelsdataset))
    return dataset 
  
def get_training_dataset(image_file, label_file, batch_size):
    dataset = load_dataset(image_file, label_file)
    dataset = dataset.cache()  # this small dataset can be entirely cached in RAM
    dataset = dataset.shuffle(5000, reshuffle_each_iteration=True)
    dataset = dataset.repeat() # Mandatory for Keras for now
    dataset = dataset.batch(batch_size, drop_remainder=True) # drop_remainder is important on TPU, batch size must be fixed
    dataset = dataset.prefetch(-1)  # fetch next batches while training on the current one (-1: autotune prefetch buffer size)
    return dataset
  
def get_validation_dataset(image_file, label_file):
    dataset = load_dataset(image_file, label_file)
    dataset = dataset.cache() # this small dataset can be entirely cached in RAM
    dataset = dataset.batch(10000, drop_remainder=True) # 10000 items in eval dataset, all in one batch
    dataset = dataset.repeat() # Mandatory for Keras for now
    return dataset

# instantiate the datasets
training_dataset = get_training_dataset(training_images_file, training_labels_file, BATCH_SIZE)
validation_dataset = get_validation_dataset(validation_images_file, validation_labels_file)

In [22]:
# This model trains to 99.4% accuracy in 10 epochs (with a batch size of 64)  

def make_model():
    model = tf.keras.Sequential(
      [
#         tf.keras.layers.Reshape(input_shape=(32*32,), target_shape=(32, 32, 1), name="image"),

        tf.keras.layers.Conv2D(filters=12, kernel_size=3, padding='same', use_bias=False), # no bias necessary before batch norm
        tf.keras.layers.BatchNormalization(scale=False, center=True), # no batch norm scaling necessary before "relu"
        tf.keras.layers.Activation('relu'), # activation after batch norm

        tf.keras.layers.Conv2D(filters=24, kernel_size=6, padding='same', use_bias=False, strides=2),
        tf.keras.layers.BatchNormalization(scale=False, center=True),
        tf.keras.layers.Activation('relu'),

        tf.keras.layers.Conv2D(filters=32, kernel_size=6, padding='same', use_bias=False, strides=2),
        tf.keras.layers.BatchNormalization(scale=False, center=True),
        tf.keras.layers.Activation('relu'),

        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(200, use_bias=False),
        tf.keras.layers.BatchNormalization(scale=False, center=True),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.Dropout(0.4), # Dropout on dense layer only

        tf.keras.layers.Dense(10, activation='softmax')
      ])

    model.compile(optimizer='adam', # learning rate will be set by LearningRateScheduler
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model
    
with strategy.scope():
    model = make_model()

In [27]:
# print model layers
model.summary()

ValueError: This model has not yet been built. Build the model first by calling `build()` or calling `fit()` with some data, or specify an `input_shape` argument in the first layer(s) for automatic build.

In [18]:
# set up learning rate decay
lr_decay = tf.keras.callbacks.LearningRateScheduler(
    lambda epoch: LEARNING_RATE * LEARNING_RATE_EXP_DECAY**epoch,
    verbose=True)

ValueError: This model has not yet been built. Build the model first by calling `build()` or calling `fit()` with some data, or specify an `input_shape` argument in the first layer(s) for automatic build.

In [6]:
EPOCHS = 10
steps_per_epoch = 60000//BATCH_SIZE  # 60,000 items in this dataset
print("Steps per epoch: ", steps_per_epoch)
  
# Little wrinkle: in the present version of Tensorfow (1.14), switching a TPU
# between training and evaluation is slow (approx. 10 sec). For small models,
# it is recommeneded to run a single eval at the end.
history = model.fit(training_dataset,
                    steps_per_epoch=steps_per_epoch, epochs=EPOCHS,
                    callbacks=[lr_decay])

final_stats = model.evaluate(validation_dataset, steps=1)
print("Validation accuracy: ", final_stats[1])

Steps per epoch:  937
Train for 937 steps

Epoch 00001: LearningRateScheduler reducing learning rate to 0.01.
Epoch 1/10

Epoch 00002: LearningRateScheduler reducing learning rate to 0.006.
Epoch 2/10

Epoch 00003: LearningRateScheduler reducing learning rate to 0.0036.
Epoch 3/10

Epoch 00004: LearningRateScheduler reducing learning rate to 0.0021599999999999996.
Epoch 4/10

Epoch 00005: LearningRateScheduler reducing learning rate to 0.001296.
Epoch 5/10

Epoch 00006: LearningRateScheduler reducing learning rate to 0.0007775999999999998.
Epoch 6/10

Epoch 00007: LearningRateScheduler reducing learning rate to 0.0004665599999999999.
Epoch 7/10

Epoch 00008: LearningRateScheduler reducing learning rate to 0.00027993599999999994.
Epoch 8/10

Epoch 00009: LearningRateScheduler reducing learning rate to 0.00016796159999999993.
Epoch 9/10

Epoch 00010: LearningRateScheduler reducing learning rate to 0.00010077695999999997.
Epoch 10/10
Validation accuracy:  0.9952


In [7]:
import os
os.chdir('..')

In [8]:
os.getcwd()

'/home/lucaskawazoi/development/professional/lkk-gcloud-tpus/experiment_a'

In [9]:
from scripts.utils import _images, _labels
from pathlib import Path
from datetime import datetime
import numpy as np

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten
from tensorflow.keras.optimizers import Adam

print('Hello world', tf.__version__)

Hello world 2.0.0


In [10]:
# Detect hardware
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
except ValueError:
    tpu = None
    gpus = tf.config.experimental.list_logical_devices("GPU")

# Select appropriate distribution strategy
if tpu:
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Going back and forth between TPU and host is expensive. Better to run 128 batches on the TPU before reporting back.
    strategy = tf.distribute.experimental.TPUStrategy(tpu, steps_per_run=128)
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
elif len(gpus) > 1:
    strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus])
    print('Running on multiple GPUs ', [gpu.name for gpu in gpus])
elif len(gpus) == 1:
    # default strategy that works on CPU and single GPU
    strategy = tf.distribute.get_strategy()
    print('Running on single GPU ', gpus[0].name)
else:
    # default strategy that works on CPU and single GPU
    strategy = tf.distribute.get_strategy()
    print('Running on CPU')
print("Number of accelerators: ", strategy.num_replicas_in_sync)

Running on CPU
Number of accelerators:  1


In [11]:
# Setup input pipeline
# DATA_PATH = 'gs://lkk-experiment_a/data'
DATA_PATH = Path()/'data'
TRAIN_IMAGES_PATH = DATA_PATH / 'train-images-idx3-ubyte.gz'
TRAIN_LABELS_PATH = DATA_PATH / 'train-labels-idx1-ubyte.gz'
TEST_IMAGES_PATH = DATA_PATH / 't10k-images-idx3-ubyte.gz'
TEST_LABELS_PATH = DATA_PATH / 't10k-labels-idx1-ubyte.gz'
bs = 32

images = _images(TRAIN_IMAGES_PATH)
labels = _labels(TRAIN_LABELS_PATH)
dataset = tf.data.Dataset.from_tensor_slices((images, labels)).batch(bs)

# Create model
with strategy.scope():
    model = Sequential([
        Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
        MaxPooling2D(),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(10, activation='softmax')
    ])

    model.compile(loss='mse',
                  optimizer=Adam(),
                  metrics=['accuracy']
                  )

# print model layers
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 26, 26, 32)        320       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 13, 13, 32)        0         
_________________________________________________________________
flatten (Flatten)            (None, 5408)              0         
_________________________________________________________________
dense (Dense)                (None, 64)                346176    
_________________________________________________________________
dense_1 (Dense)              (None, 10)                650       
Total params: 347,146
Trainable params: 347,146
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Fit model
model.fit(dataset, epochs=2)

# Save model
MODELS_PATH = Path() / 'models'
MODELS_PATH.mkdir(parents=True, exist_ok=True)
now = datetime.now().strftime("%Y%m%d_%H%M%S")
fname = MODELS_PATH / f'{now}_tfmodels'
model.save(str(fname.absolute()), save_format='tf')

Epoch 1/2
    236/Unknown - 2s 9ms/step - loss: 0.0224 - accuracy: 0.8528

KeyboardInterrupt: 