In [70]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [73]:
import tensorflow as tf
# Show which TensorFlow version this kernel is running.
print(tf.__version__)

2.15.0


In [78]:
import time, os, sys
import sagemaker, boto3
import numpy as np
import pandas as pd

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [84]:
import os
# Set the AWS_DEFAULT_REGION environment variable for this process; the
# SageMaker client created later in the notebook uses the same 'us-east-2' region.
os.environ['AWS_DEFAULT_REGION']='us-east-2'

In [None]:
# AWS handles shared by the rest of the notebook: a boto3 session, a low-level
# SageMaker client (used by the experiment-tracking calls) and a SageMaker SDK
# session built on the same boto3 session.
sess = boto3.Session()
sm   = sess.client('sagemaker', region_name='us-east-2')
# get execution role
# NOTE(review): `role` is referenced later when the estimator is created but is
# not assigned in any visible cell while the line below stays commented out.
#role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session(boto_session=sess)

# Upload the local 'cifar10' directory to the session's default bucket.
# The key prefix is 'datasets/cifar10-dataset' (plural) so the uploaded objects
# land under the same prefix the training code reads from
# (s3://<default-bucket>/datasets/cifar10-dataset/...).
datasets = sagemaker_session.upload_data(path='cifar10', key_prefix='datasets/cifar10-dataset')
datasets

In [91]:
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent

In [None]:
# Create the experiment that groups all CIFAR-10 training trials.
# Change the experiment_name if needed: it has to be unique within an AWS
# account and AWS region.
training_experiment = Experiment.create(
    experiment_name="sagemaker-experiments-v1",
    description="Experiment to track cifar10 training trials",
    sagemaker_boto_client=sm,
)

#### setup trials
l_experiment_name = training_experiment.experiment_name
print(l_experiment_name)

# trial name should be unique
single_gpu_trial = Trial.create(
    trial_name="sagemake-single-gpu-training-v1",
    experiment_name=l_experiment_name,
    sagemaker_boto_client=sm,
)

# Mapping handed to the training job so it is recorded under the experiment
# and trial created above. The key must be spelled "ExperimentName".
trial_comp_name = "single-gpu-training-job"
experiment_config = {"ExperimentName": l_experiment_name,
                     "TrialName": single_gpu_trial.trial_name,
                     "TrialComponentDisplayName": trial_comp_name}

In [94]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam, SGD
import argparse
import os
import re
import time

In [95]:
# Dimensions of the images in the CIFAR dataset, shared by the input pipeline
# and the model definition.
# Image height.
HEIGHT=32
# Image width.
WIDTH=32
# Number of channels per image.
DEPTH=3
# Number of label classes (size of the one-hot label and of the softmax output).
NUM_CLASSES=10

In [109]:
def single_example_parser(serialized_example):
  """Decode one serialized tf.Example into an (image, label) tensor pair.

  The record is expected to carry a raw-bytes 'image' feature and an int64
  'label' feature. The image bytes are decoded as uint8, reshaped to
  [DEPTH, HEIGHT, WIDTH], transposed to [HEIGHT, WIDTH, DEPTH], cast to
  float32 and then passed through train_preprocess_fn. The label is one-hot
  encoded over NUM_CLASSES.

  Note: train_preprocess_fn is applied unconditionally, so every dataset
  parsed with this function receives the same random augmentation.

  Args:
    serialized_example: a single serialized tf.Example record.

  Returns:
    Tuple (image, label): the augmented float32 image and its one-hot label.
  """
  feature_spec = {
      'image': tf.io.FixedLenFeature([], tf.string),
      'label': tf.io.FixedLenFeature([], tf.int64),
  }
  parsed = tf.io.parse_single_example(serialized_example, features=feature_spec)

  flat_pixels = tf.io.decode_raw(parsed['image'], tf.uint8)
  flat_pixels.set_shape([DEPTH*HEIGHT*WIDTH])

  # The bytes are laid out as [depth, height, width]; move the channel axis
  # last to obtain [height, width, depth].
  channels_first = tf.reshape(flat_pixels, [DEPTH, HEIGHT, WIDTH])
  channels_last = tf.transpose(channels_first, [1, 2, 0])
  image = tf.cast(channels_last, tf.float32)
  class_index = tf.cast(parsed['label'], tf.int32)

  image = train_preprocess_fn(image)
  label = tf.one_hot(class_index, NUM_CLASSES)
  return image, label

In [97]:
def train_preprocess_fn(image):
  """Apply random crop-and-flip augmentation to a single image.

  The image is first brought to (HEIGHT+8, WIDTH+8) with
  resize_with_crop_or_pad (for a HEIGHT x WIDTH input that is 4 extra pixels
  on each side), then a random [HEIGHT, WIDTH, DEPTH] window is cropped out
  and the result is randomly mirrored left-to-right.

  Args:
    image: image tensor to augment.

  Returns:
    The augmented image tensor of shape [HEIGHT, WIDTH, DEPTH].
  """
  enlarged = tf.image.resize_with_crop_or_pad(image, HEIGHT+8, WIDTH+8)
  window = tf.image.random_crop(enlarged, [HEIGHT, WIDTH, DEPTH])
  return tf.image.random_flip_left_right(window)

In [107]:
def get_dataset(filename, batch_size):
  """Build an endlessly repeating, shuffled, batched dataset from TFRecords.

  Args:
    filename: path (or paths) of the TFRecord file(s) to read.
    batch_size: number of examples per batch; incomplete batches are dropped.

  Returns:
    A tf.data.Dataset yielding (image, label) batches produced by
    single_example_parser. The dataset never ends, so callers must bound
    iteration themselves (e.g. with steps_per_epoch / steps).
  """
  autotune = tf.data.experimental.AUTOTUNE

  # Shuffle before repeating so each pass over the file is fully consumed
  # before the next one starts; repeating first would let records from
  # different epochs mix inside the shuffle buffer.
  dataset = tf.data.TFRecordDataset(filename).shuffle(10000).repeat()

  # Parse records in parallel.
  dataset = dataset.map(single_example_parser, num_parallel_calls=autotune)

  # Batch it up, then prefetch so input preparation overlaps with training.
  dataset = dataset.batch(batch_size, drop_remainder=True)
  return dataset.prefetch(autotune)

In [115]:
def get_model(input_shape, learning_rate, weight_decay, optimizer, momentum):
    """Build a ResNet50-based classifier with a NUM_CLASSES softmax head.

    The ResNet50 backbone is created without its top layers and with
    'imagenet' weights; its output is flattened and fed to a dense softmax
    layer. The returned model is not compiled.

    Args:
        input_shape: shape of one input image, passed to the Input layer and
            to ResNet50.
        learning_rate: accepted for interface compatibility; not used here.
        weight_decay: accepted for interface compatibility; not used here.
        optimizer: accepted for interface compatibility; not used here.
        momentum: accepted for interface compatibility; not used here.

    Returns:
        A keras Model mapping the backbone input to class probabilities.
    """
    image_input = Input(shape=input_shape)
    backbone = keras.applications.resnet50.ResNet50(
        include_top=False,
        weights='imagenet',
        input_tensor=image_input,
        input_shape=input_shape,
        classes=None,
    )
    flat_features = Flatten()(backbone.output)
    class_probs = Dense(NUM_CLASSES, activation='softmax')(flat_features)
    return Model(inputs=backbone.input, outputs=class_probs)

In [None]:
def main():
  """Train, evaluate and save the CIFAR-10 classifier with fixed settings.

  Builds the train/validation/eval datasets from TFRecord files in S3,
  creates the model with get_model, compiles it with SGD (or Adam), fits it,
  prints the evaluation loss and accuracy, and saves the model in TensorFlow
  SavedModel format under a timestamped sub-directory of model_dir.
  """
  # Hyper parameters
  epochs = 10
  lr = 0.01
  batch_size = 16
  # Same momentum / weight-decay values as the SageMaker job hyperparameters
  # defined in this notebook (momentum 0.9, weight decay 2e-4).
  momentum = 0.9
  weight_decay = 2e-4
  optimizer = 'sgd'
  model_dir = 's3://sagemaker-us-east-2-058199717680/models/cifar10/'

  # Sagemaker options
  training_dir   = 's3://sagemaker-us-east-2-058199717680/datasets/cifar10-dataset'
  validation_dir = 's3://sagemaker-us-east-2-058199717680/datasets/cifar10-dataset'
  eval_dir       = 's3://sagemaker-us-east-2-058199717680/datasets/cifar10-dataset'

  print(f"training: {training_dir} valid: {validation_dir} | eval: {eval_dir}")
  print(f"optimizer: {optimizer}")

  train_dataset = get_dataset(training_dir+'/train.tfrecords',  batch_size)
  val_dataset   = get_dataset(validation_dir+'/validation.tfrecords', batch_size)
  eval_dataset  = get_dataset(eval_dir+'/eval.tfrecords', batch_size)

  input_shape = (HEIGHT, WIDTH, DEPTH)
  model = get_model(input_shape, lr, weight_decay, optimizer, momentum)

  # optimizer
  # NOTE(review): `decay` on the legacy Keras optimizers appears to be a
  # learning-rate decay rather than L2 weight decay - confirm this is intended.
  if optimizer.lower() == 'sgd':
    opt = tf.keras.optimizers.legacy.SGD(learning_rate=lr, decay=weight_decay, momentum=momentum)
  else:
    opt = tf.keras.optimizers.legacy.Adam(learning_rate=lr, decay=weight_decay)

  # compile the model
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

  # train model; the datasets repeat forever, so every phase is bounded by an
  # explicit step count.
  history = model.fit(train_dataset, steps_per_epoch=40000 // batch_size,
                        validation_data=val_dataset,
                        validation_steps=10000 // batch_size,
                        epochs=epochs)

  # evaluate model performance
  # NOTE(review): only 1000 // batch_size steps are evaluated here, versus
  # 10000 // batch_size for validation - confirm the eval set size.
  score = model.evaluate(eval_dataset, steps=1000//batch_size, verbose=1)
  print("test loss: ", score[0])
  print("test accuracy: ", score[1])

  # save the model to model directory; strip the trailing '/' from model_dir
  # so the export path does not contain an empty '//' segment.
  timestamp = time.strftime("%m%d%H%M%S", time.gmtime())
  export_path = model_dir.rstrip('/') + '/' + timestamp
  model.save(export_path, save_format='tf')

if __name__ == "__main__":
  main()

In [118]:
from sagemaker.tensorflow import TensorFlow

In [None]:
# Hyperparameters passed to the training job.
# NOTE(review): the key is spelled 'optimizer' on the assumption that the
# entry-point script expects that name - confirm against the script.
hyperparams={'epochs': 30,
             'learning-rate': 0.01,
             'batch-size': 16,
             'weight-decay': 2e-4,
             'momentum': 0.9,
             'optimizer': 'adam'
}

bucket_name = sagemaker_session.default_bucket()
output_path = f's3://{bucket_name}/jobs'
# Capture validation accuracy from the training logs. The pattern accepts both
# 'val_acc: <n>' and 'val_accuracy: <n>'; the training code in this notebook
# compiles with metrics=['accuracy'] - confirm what the entry-point script logs.
metrics_definitions = [{'Name': 'val_acc', 'Regex': 'val_acc\\w*: ([0-9\\.]+)'}]

# NOTE(review): `role` is not assigned in any visible cell (the
# get_execution_role() call earlier in the notebook is commented out); it must
# hold an IAM role ARN before this cell runs.
# NOTE(review): SageMaker Python SDK v2 appears to name the instance arguments
# instance_count / instance_type and to require framework_version or
# image_uri - confirm against the installed SDK version.
tf_estimator = TensorFlow(entry_point = 'cifar10-training-sagemaker.py',
                          output_path = f'{output_path}/',
                          code_location = output_path,
                          role = role,
                          train_instance_count = 1,
                          train_instance_type = 'ml.g4dn.xlarge',
                          py_version = 'py3',
                          script_mode = True,
                          metric_definitions = metrics_definitions,
                          sagemaker_session = sagemaker_session,
                          hyperparameters = hyperparams
                          )


# Launch the training job; all three channels point at the same uploaded
# dataset prefix, and the run is recorded under the experiment/trial
# described by experiment_config.
job_name = f'tensorflow-single-gpu-{time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())}'
tf_estimator.fit({'training': datasets,
                  'validation': datasets,
                  'eval': datasets},
                 job_name = job_name,
                 experiment_config = experiment_config
                 )