In [None]:
!pip install sagemaker tensorflow opencv-python-headless

In [2]:
import tensorflow as tf
import sagemaker
from sagemaker.tensorflow import TensorFlow
import numpy as np
import os
import cv2
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers, models
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import time

# Report the library versions in use so the run can be reproduced later.
print(
    f"TensorFlow version: {tf.__version__}",
    f"SageMaker SDK version: {sagemaker.__version__}",
    sep="\n",
)

2024-02-28 06:47:00.161672: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-28 06:47:00.161755: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-28 06:47:00.163908: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-28 06:47:00.175187: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from pandas.core.computation.check import NUMEXPR_I

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
TensorFlow version: 2.15.0
SageMaker SDK version: 2.207.1


In [3]:
# S3 bucket and key prefix under which the training job output is stored.
bucket_name, prefix = 'distributedmachinelearning', 'distributedMLtrainingModel'

# Full S3 URI later passed to the estimator as its output location.
output_path = f's3://{bucket_name}/{prefix}/output'

print(output_path)

s3://distributedmachinelearning/distributedMLtrainingModel/output


In [4]:
def build_and_compile_model(num_classes, input_shape=(64, 64, 3), dropout_rate=0.5,
                            optimizer='adam'):
    """Build and compile a small CNN image classifier.

    The network is three 3x3 ReLU convolutions (32, 64, 128 filters) with 2x2
    max-pooling after the first two, followed by a 128-unit dense layer,
    dropout, and a softmax output layer.

    Args:
        num_classes: Number of output classes (units in the softmax layer).
        input_shape: Shape of a single input image as (height, width, channels).
            Defaults to (64, 64, 3). The convolutions are unpadded, so height
            and width must each be at least 18 for the last convolution to
            receive a non-empty feature map.
        dropout_rate: Fraction of dense-layer activations dropped during
            training. Defaults to 0.5.
        optimizer: Keras optimizer name or optimizer instance passed to
            ``model.compile``. Defaults to 'adam'.

    Returns:
        The compiled Keras ``Sequential`` model.

    Note:
        The loss is ``sparse_categorical_crossentropy``, which expects integer
        class labels, not one-hot vectors. Do not feed labels produced by
        ``to_categorical`` to this model.
    """
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu'),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(dropout_rate),
        layers.Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer=optimizer,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [5]:
# Settings handed to the SageMaker estimator as `hyperparameters`. Every value is
# a string. How test_script.py interprets each key is not visible from this
# notebook, so the notes below describe the intended meaning only.
hyperparameters = dict(
    learning_rate="0.001",       # optimizer step size
    dropout_rate="0.5",          # fraction of units dropped, for regularization
    batch_size="32",             # samples processed before each weight update
    epochs="5",                  # complete passes through the training dataset
    conv_layers="3",             # number of convolutional layers (model depth)
    filters="64",                # filters in the first Conv layer; may grow with depth
    kernel_size="3",             # side length of the convolutional filters
    pool_size="2",               # side length of the pooling window
    dense_neurons="128",         # units in the dense layer after the conv stack
    activation="relu",           # activation for the convolutional layers
    final_activation="softmax",  # output activation ('sigmoid' is the usual binary choice)
    optimizer="adam",            # optimization algorithm
)


In [None]:
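# Disabled alternative: the same estimator on two instances with parameter-server
# distribution. Left commented out; the single-instance cell that follows is the
# configuration actually used.
# role = sagemaker.get_execution_role()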
# estimator = TensorFlow(entry_point='test_script.py', 
#                        hyperparameters=hyperparameters,
#                        role=role,
#                        instance_count=2,
#                        instance_type='ml.m5.2xlarge',
#                        framework_version='2.3.1',
#                        py_version='py37',
#                        output_path=output_path,
#                        use_spot_instances=True,
#                        max_run=300,
#                        max_wait=600,
#                        distribution={'parameter_server': {'enabled': True}})

In [6]:
# Execution role whose permissions the training job runs with.
role = sagemaker.get_execution_role()

# Single-instance TensorFlow training job on managed spot capacity.
# NOTE(review): the training container uses TensorFlow 2.3.1 / Python 3.7, while
# this notebook's kernel reports a newer TensorFlow. Confirm test_script.py is
# written against the container's version.
estimator = TensorFlow(
    # What to run, and with which settings.
    entry_point='test_script.py',
    hyperparameters=hyperparameters,
    framework_version='2.3.1',
    py_version='py37',
    # Who runs it, and on what hardware.
    role=role,
    instance_type='ml.m5.2xlarge',
    instance_count=1,
    # Spot training limits, in seconds: max_run caps the training time and
    # max_wait caps the total time including waiting for spot capacity.
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
    # S3 location for the job's output artifacts.
    output_path=output_path,
)

In [7]:
# Fallback S3 location of the training images.
default_data_path = 's3://distributedmachinelearning/FaceAll/'

# Keep SM_CHANNEL_TRAIN if it is already set, otherwise set it to the fallback.
# Either way the variable is defined afterwards and its value is returned.
# NOTE(review): this changes the environment of the notebook process only.
# Confirm whether anything downstream actually relies on it.
input_data_path = os.environ.setdefault('SM_CHANNEL_TRAIN', default_data_path)

print("Input data path:", input_data_path)


Input data path: s3://distributedmachinelearning/FaceAll/


In [8]:
# Sanity check: report the current SM_CHANNEL_TRAIN value, or say it is missing.
try:
    print("SM_CHANNEL_TRAIN is set to:", os.environ['SM_CHANNEL_TRAIN'])
except KeyError:
    print("SM_CHANNEL_TRAIN is not set.")


SM_CHANNEL_TRAIN is set to: s3://distributedmachinelearning/FaceAll/


In [None]:
# Launch the training job on the FaceAll dataset. This call blocks and streams
# the job logs into the cell output.
# NOTE(review): per the SageMaker SDK, a bare S3 URI is exposed to the container
# as a channel named "training" (SM_CHANNEL_TRAINING), while the earlier cells in
# this notebook reference SM_CHANNEL_TRAIN. Confirm which variable
# test_script.py reads. If it is SM_CHANNEL_TRAIN, pass {'train': <uri>} instead.
estimator.fit(inputs='s3://distributedmachinelearning/FaceAll')

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: tensorflow-training-2024-02-28-06-47-10-381


2024-02-28 06:47:10 Starting - Starting the training job...
2024-02-28 06:47:24 Starting - Preparing the instances for training......
2024-02-28 06:48:31 Downloading - Downloading input data......
2024-02-28 06:49:12 Training - Training image download completed. Training in progress.[34m2024-02-28 06:49:30.555810: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2024-02-28 06:49:30.555977: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2024-02-28 06:49:30.586537: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2024-02-28 06:49:31,875 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2024-02-28 06:49:31,883 sagemaker-training-toolkit INFO     No GPUs detected (norma