In [None]:
pip install sagemaker

In [None]:
# SEBTAC's FINDINGs

OBJECTIVE:
    - DISTRIBUTED TENSORFLOW on AWS
    - DISTRIBUTED INPUT AND TRAINING
    - DATA INPUT MODES: "Pipe", "File", "FastFile"
    - TRAIN PARALLELIZATION APPROACH: "TF - Mirrored Strategy", "SMD - Sage Maker Distributed", "MPI"
    - DATA PARALLELIZATION APPROACH: data sharding (4 and 120 Shards)
    - DATA SOURCE: S3, FSX for Lustre

BEST:
    - with "MPI"
    - 8 GPUs x 1 Instance
    - 120 shards on input,
    - "FastFile",
    - "ShardedByS3Key" -- although small difference from "FullyReplicated"
    - with bigger datasets, there should be potential for "FSX for Lustre"
    
FINDINGS:
    - Earlier tests showed benefit of training on GPUs over CPUs
    - Training on an 8-GPU instance speeds up training 4x over a 1-GPU instance. Other tests suggest speed-ups in the range of 6x, thus:
        NEXT STEP: test if the 4x speedup is just a result of a stronger GPU instance and the training happens on only 1 GPU out of the 8 available on the EC2 instance
    - FSX for Lustre... slows down training
        - it might be due to small file sizes
            - NEXT STEP - test FSX for Lustre with larger files
    - DATA SHARDS
        - 120 Shards better than 4 Shards
            - training times comparable
            - the fit is higher when training on the same amount of data and within the same time
                - potentially, sharding offers an additional layer of randomized sampling which has a positive impact on training
            - this does not mean that increasing the number of shards indefinitely will improve training!!!
                - we will lose the benefits of batching and increase communication and control overhead
    - "FAST FILE" has an advantage over "FILE" and "PIPE" modes
    - "FullyReplicated" has advantage over "ShardedByS3Key" #(distribution = )
        - Small Input File size allows to load complete data on the instance and speed up initialization 
    - "SMD" 
        - Slowest!
        - Failed with multi-instance setup
    - "MPI" 
        - Fastest!
        - Works with Multi-Instance setup!!!
            

In [None]:
##################################
### "FastFile", "Pipe", "File" ###
##################################
# Launches one SageMaker TensorFlow training job whose "train" channel reads the
# sharded data from S3 using the input mode selected in `hyperparameters`.

import sagemaker
from sagemaker import get_execution_role
from sagemaker.tensorflow import TensorFlow

# Passed to the estimator as hyperparameters; "input_dist_mode" and "shards_on_input"
# are also used below to configure the training channel.
hyperparameters = {
    "input_dist_mode": "Pipe",  # "FastFile", "Pipe", "File"
    "shards_on_input": 120,     # 4, 120 (put the proper shards in shard-specific subfolders in S3 first!)
    "epochs": 3,                # 3: standard tests, 25: with early stopping
    "download_raw_data": 1,     # should we create TFRECORD files with MNIST data?
    "train_dist_mode": "TF",    # "TF", "SMD", "MPI"
}

# The S3 prefix is chosen by the number of shards, e.g. .../train/120
train_input = sagemaker.inputs.TrainingInput(
    s3_data=f"s3://mnist-tdrecords/train/{hyperparameters['shards_on_input']}",
    distribution="ShardedByS3Key",  # 'ShardedByS3Key', 'FullyReplicated'
    input_mode=hyperparameters["input_dist_mode"],
)

tf_estimator = TensorFlow(
    entry_point="AWS_DataPipping_TFMirroredStrategy.py",
    source_dir="./",
    role=get_execution_role(),
    framework_version="2.3",
    py_version="py37",
    instance_count=1,
    instance_type="ml.p3.2xlarge",  # "ml.p3.2xlarge", "ml.p3.8xlarge", "ml.p3.16xlarge"
    input_mode=hyperparameters["input_dist_mode"],  # "File", "Pipe", "FastFile"
    hyperparameters=hyperparameters,
    output_path="s3://mnist-tdrecords/output",
)

# Channel name -> input definition. A plain S3 URI string was used here previously:
#   {"train": "s3://mnist-tdrecords/train/{}".format(hyperparameters["shards_on_input"])}
# and a "validation" channel (f"s3://{bucket_name}/data/validation") was left disabled.
s3_data_channels = dict(train=train_input)

tf_estimator.fit(s3_data_channels)

# ------------------------------------------------------------------------------------
# IF SUDDENLY NO TRAINING HAPPENS, OR IT FINISHES VERY QUICKLY: DELETE THE RESULTS OF
# THE PREVIOUS RUNS, AS THE LIST_OBJECTS FUNCTION HAS A LIMIT ON THE LENGTH OF ITS OUTPUT.
# ------------------------------------------------------------------------------------

In [None]:
# Debug helper: show the input-channel object currently bound to `train_input`.
print(train_input)

In [None]:
# PIPE MODE
1 GPU, 4 shards on input, FullyReplicated
Epoch 3, Loss: 0.09680283069610596, Train Accuracy: 96.4366683959961, Test Accuracy: 0.0
TIME 279.08890867233276

1 GPU, 120 shards on input, FullyReplicated
Epoch 3, Loss: 0.004249399993568659, Train Accuracy: 99.88666534423828, Test Accuracy: 0.0
TIME 278.04257345199585

1 GPU, 120 shards on input, ShardedByS3Key
Epoch 3, Loss: 0.0002816336345858872, Train Accuracy: 99.99166870117188, Test Accuracy: 77.75
TIME 288.7442648410797

8 GPU, 4 shards on input, FullyReplicated
Epoch 3, Loss: 0.35395368933677673, Train Accuracy: 86.75333404541016, Test Accuracy: 0.0
TIME 71.81656908988953

8 GPU, 120 shards on input, FullyReplicated
Epoch 3, Loss: 0.0014360351487994194, Train Accuracy: 99.98833465576172, Test Accuracy: 0.0
TIME 72.16402530670166

8 GPU, 120 shards on input, ShardedByS3Key
Epoch 3, Loss: 0.003127657575532794, Train Accuracy: 99.9766616821289, Test Accuracy: 77.45000457763672
TIME 75.63692235946655

In [None]:
# FastFile

1 GPU, 4 shards on input, FullyReplicated
Epoch 3, Loss: 0.10208243876695633, Train Accuracy: 96.2316665649414, Test Accuracy: 0.0
TIME 275.73823285102844

1 GPU, 120 shards on input, FullyReplicated
Epoch 3, Loss: 0.007705101743340492, Train Accuracy: 99.80166625976562, Test Accuracy: 0.0
TIME 277.03554940223694

8 GPU, 4 shards on input, FullyReplicated
Epoch 3, Loss: 0.46059784293174744, Train Accuracy: 82.9566650390625, Test Accuracy: 0.0
TIME 68.45022320747375

8 GPU, 120 shards on input, FullyReplicated
Epoch 3, Loss: 0.04005113244056702, Train Accuracy: 98.80333709716797, Test Accuracy: 0.0
TIME 68.02924108505249

8 GPU, 120 shards on input, ShardedByS3Key
Epoch 3, Loss: 0.11077616363763809, Train Accuracy: 96.31666564941406, Test Accuracy: 0.0
TIME 68.31658935546875

In [None]:
##################################
######### FSX for Lustre #########
##################################
# Launches one SageMaker TensorFlow training job whose "train" channel is served from
# an FSx for Lustre file system (reached through the VPC settings below), with
# framework profiling enabled.

import sagemaker
from sagemaker import get_execution_role
from sagemaker.debugger import ProfilerConfig, FrameworkProfile
from sagemaker.inputs import FileSystemInput
from sagemaker.tensorflow import TensorFlow

hyperparameters = {
    "input_dist_mode": "File",  # "FastFile", "Pipe", "File" -- FOR FSx "File" ONLY!!!
    "shards_on_input": 120,     # 4, 120 -- shards in S3
    "epochs": 25,               # 3: standard tests, 25: with early stopping
    "download_raw_data": 1,     # should we create TFRECORD files with MNIST data?
    "train_dist_mode": "TF",    # "TF", "SMD", "MPI"
}

# Framework profiling: 7 steps, starting at step 2.
profiler_config = ProfilerConfig(
    framework_profile_params=FrameworkProfile(start_step=2, num_steps=7)
)

# --- File system channel --------------------------------------------------------
# File system id.
file_system_id = "fs-061783acdcbd8da72"

# File system type: "EFS" or "FSxLustre".
file_system_type = "FSxLustre"

# Directory inside the file system; must be a normalized, absolute path.
file_system_directory_path = f"/gllyrbev/Train_{hyperparameters['shards_on_input']}_Shards"

# Mount mode of that directory: 'ro' (read-only) or 'rw' (read-write).
file_system_access_mode = "rw"

# VPC resources that give the training job access to the file system.
security_groups_ids = ["sg-36929710"]
subnets = ["subnet-006ebd66"]

fs_train_input = FileSystemInput(
    file_system_id=file_system_id,
    file_system_type=file_system_type,
    directory_path=file_system_directory_path,
    file_system_access_mode=file_system_access_mode,
)

# S3 variant of the channel. It is still defined here, but it is NOT the input
# passed to fit() in this cell (the file-system input above is).
train_input = sagemaker.inputs.TrainingInput(
    s3_data=f"s3://mnist-tdrecords/train/{hyperparameters['shards_on_input']}",
    distribution="FullyReplicated",  # 'ShardedByS3Key', 'FullyReplicated'
    input_mode=hyperparameters["input_dist_mode"],
)

tf_estimator = TensorFlow(
    entry_point="AWS_DataPipping_TFMirroredStrategy.py",
    source_dir="./",
    role=get_execution_role(),
    framework_version="2.3",
    py_version="py37",
    instance_count=1,
    instance_type="ml.p3.16xlarge",  # "ml.p3.2xlarge", "ml.p3.8xlarge", "ml.p3.16xlarge"
    subnets=subnets,
    security_group_ids=security_groups_ids,
    input_mode=hyperparameters["input_dist_mode"],  # "File", "Pipe", "FastFile"
    hyperparameters=hyperparameters,
    profiler_config=profiler_config,
    output_path="s3://mnist-tdrecords/output",
)

# Channel name -> input definition. A plain S3 URI string was used here previously:
#   {"train": "s3://mnist-tdrecords/train/{}".format(hyperparameters["shards_on_input"])}
# and a "validation" channel (f"s3://{bucket_name}/data/validation") was left disabled.
s3_data_channels = dict(train=fs_train_input)

tf_estimator.fit(s3_data_channels)

In [None]:
# FSx for LUSTRE

1 GPU, 4 shards on input, FullyReplicated
Epoch 3, Loss: 0.10235357284545898, Train Accuracy: 96.22332763671875, Test Accuracy: 0.0
TIME 285.3839440345764

1 GPU, 120 shards on input, FullyReplicated
Epoch 3, Loss: 0.0075263879261910915, Train Accuracy: 99.82499694824219, Test Accuracy: 0.0
TIME 282.914315700531

8 GPU, 4 shards on input, FullyReplicated
Epoch 3, Loss: 0.47831735014915466, Train Accuracy: 82.04833221435547, Test Accuracy: 0.0
TIME 121.28670692443848

8 GPU, 120 shards on input, FullyReplicated
Epoch 3, Loss: 0.017312103882431984, Train Accuracy: 99.54166412353516, Test Accuracy: 0.0
TIME 119.6302547454834

In [3]:
###################################
####### SM_Distributed, MPI #######
###################################
# Launches one SageMaker TensorFlow training job whose distributed-training launcher
# is selected by hyperparameters["train_dist_mode"]:
#   "MPI" - MPI launcher with 8 processes per host (on SageMaker, MPI == Horovod),
#   "SMD" - SageMaker distributed data parallel (smdistributed),
#   "TF"  - no launcher configuration is passed to the estimator.

import sagemaker
from sagemaker import get_execution_role
from sagemaker.tensorflow import TensorFlow

hyperparameters = {"input_dist_mode": "FastFile", # "FastFile", "Pipe", "File"
                   "shards_on_input": 120, # 4, 120 (for values in between, create the subfolders in S3 first)
                   "train_dist_mode": "MPI", # "TF", "SMD", "MPI"
                   "epochs": 3, # 3: Standard Tests, 25: With Early Stopping
                   "download_raw_data": 1, # Should we create TFRECORD files with MNIST data?
                  }

train_input = sagemaker.inputs.TrainingInput("s3://mnist-tdrecords/train/{}".format(hyperparameters["shards_on_input"]),
                                             input_mode = hyperparameters["input_dist_mode"],
                                             distribution = 'FullyReplicated' # 'ShardedByS3Key', 'FullyReplicated'
                                            )

# Select the launcher configuration. Every documented mode assigns `distribution`
# explicitly, so the estimator below never hits an undefined name and never picks up
# a stale value left in the kernel by an earlier run of this cell.
if hyperparameters["train_dist_mode"] == "MPI": # @AWS SageMaker: MPI == HOROVOD!!!
    distribution = {"mpi": {"enabled": True,
                            "processes_per_host": 8,
                            'custom_mpi_options': '-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none'}}

elif hyperparameters["train_dist_mode"] == "SMD":
    distribution = {"smdistributed": {"dataparallel": {"enabled": True,}}}

elif hyperparameters["train_dist_mode"] == "TF":
    # No launcher configuration; None is passed through to the estimator.
    distribution = None

else:
    raise ValueError('Unknown train_dist_mode {!r}; expected "TF", "SMD" or "MPI"'
                     .format(hyperparameters["train_dist_mode"]))

# SMD as is runs 8 processes on each instance.
# It also works with "processes_per_host": 1 -- 8 processes per instance are executed too.
# With "processes_per_host": 8 errors pop up - probably due to multiple processes being executed on each GPU.
# With MPI the error is related to the fact that the code in the .py file uses smdistributed, which conflicts with MPI.

tf_estimator = TensorFlow(entry_point = "AWS_DataPipping_TFMirroredStrategy.py",
                          source_dir = "./",
                          framework_version = "2.3",
                          py_version = "py37",
                          instance_type = "ml.p3.16xlarge", # "ml.p3.2xlarge", "ml.p3.8xlarge", "ml.p3.16xlarge", "local_gpu"
                          instance_count = 1, # 2+ DOES NOT WORK ??!?!?!?!?
                          role = sagemaker.get_execution_role(),
                          hyperparameters = hyperparameters,
                          output_path = "s3://mnist-tdrecords/output",
                          input_mode = hyperparameters["input_dist_mode"], # "File", "Pipe", "FastFile"
                          distribution = distribution,
                         )

s3_data_channels = {"train": train_input}
#s3_data_channels = {"train": "s3://mnist-tdrecords/train/{}".format(hyperparameters["shards_on_input"])}
                    #"validation": f"s3://{bucket_name}/data/validation",}

tf_estimator.fit(s3_data_channels)

##########################################################################################################################
######## IF SUDDENLY NO TRAINING HAPPENS OR IT FINISHES VERY QUICKLY, ####################################################
######## DELETE THE RESULTS OF THE PREVIOUS RUNS AS LIST_OBJECTS FUNCTION HAS A LIMIT ON THE LENGTH OF THE OUTPUT ########
##########################################################################################################################

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


Using provided s3_resource


INFO:sagemaker:Creating training-job with name: tensorflow-training-2023-05-17-14-49-51-884


2023-05-17 14:49:52 Starting - Starting the training job......
2023-05-17 14:50:48 Starting - Preparing the instances for training.........
2023-05-17 14:52:06 Downloading - Downloading input data...
2023-05-17 14:52:26 Training - Downloading the training image.........
2023-05-17 14:53:57 Training - Training image download completed. Training in progress..[34m2023-05-17 14:54:26.923652: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2023-05-17 14:54:26.923855: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2023-05-17 14:54:26.931398: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.2[0m
[34m2023-05-17 14:54:26.970428: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker 

UnexpectedStatusException: Error for Training job tensorflow-training-2023-05-17-14-49-51-884: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "mpirun --host algo-1:8 -np 8 --allow-run-as-root --display-map --tag-output -mca btl_tcp_if_include eth0 -mca oob_tcp_if_include eth0 -mca plm_rsh_no_tree_spawn 1 -bind-to none -map-by slot -mca pml ob1 -mca btl ^openib -mca orte_abort_on_non_zero_status 1 -mca btl_vader_single_copy_mechanism none -x NCCL_MIN_NRINGS=4 -x NCCL_SOCKET_IFNAME=eth0 -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH -x LD_PRELOAD=/usr/local/lib/python3.7/site-packages/gethostname.cpython-37m-x86_64-linux-gnu.so -verbose -x OMPI_MCA_btl_vader_single_copy_mechanism=none -x SM_HOSTS -x SM_NETWORK_INTERFACE_NAME -x SM_HPS -x SM_USER_ENTRY_POINT -x SM_FRAMEWORK_PARAMS -x SM_RESOURCE_CONFIG -x SM_INPUT_DATA_CONFIG -x SM_OUTPUT_DATA_DIR -x SM_CHANNELS -x SM_CURRENT_HOST -x SM_MODULE_NAME -x SM_LOG_LEVEL -x SM_FRAMEWORK_MODULE -x SM_INPUT_DIR -x SM_INPUT_CONFIG_DIR -x SM_OUTPUT_DIR -x SM_NUM_CPUS -x SM_NUM_GPUS -x SM_MODEL_DIR -x SM_MODULE_DIR -x SM_TRAINING_ENV -x SM_USER_ARGS -x SM_OUTPUT_INTE

In [None]:
# Step-count sanity check: 60000 // 512 = 117 full batches (the MPI logs in this notebook show "1/117"),
# and the 60000 - 512*117 = 96 leftover samples expressed in units of 64.
# NOTE(review): presumably 60000 = number of training samples and 512 = 8 GPUs x 64 per-GPU batch -- confirm.
(60000//512), (60000- 512*117)/64

In [None]:
###############################
########### SMD ###############
###############################

8 GPU, 4 shards on input, FastFile, FullyReplicated, 1 Instance
OVERWRITTEN with MPI but comparable with the rest of SMD

8 GPU, 120 shards on input, FastFile, FullyReplicated, 1 Instance
[1,0]<stdout>:Epoch 3, Loss: 0.0011306742671877146, # Train Accuracy: 0.0, Test Accuracy: 0.0
[1,0]<stdout>:Training Time 193.52111554145813

8 GPU, 120 shards on input, FastFile, ShardedByS3Key, 1 Instance
[1,0]<stdout>:Epoch 3, Loss: 0.0017849248833954334, # Train Accuracy: 0.0, Test Accuracy: 0.0
[1,0]<stdout>:Training Time 195.88474798202515


8 GPU, 120 shards on input, Pipe, FullyReplicated, 1 Instance
FAILED with error: "tensorflow.python.framework.errors_impl.InternalError: CRC check on header failed. [Op:IteratorGetNext]"


# MULTIPLE INSTANCES DID NOT WORK - ??!?!?!??!
8 GPU x 2 Instances, 120 shards on input, FastFile, FullyReplicated
???

8 GPU x 2 Instances, 120 shards on input, FastFile, ShardedByS3Key, 2 Instances
???

In [None]:
###############################
########### MPI ###############
###############################

8 GPU, 4 shards on input, FastFile, FullyReplicated, 1 Instance
[1,0]<stdout>:  1/117 - 0s - loss: 0.4659 - accuracy: 0.7917 - batch: 1.0000
[1,0]<stdout>:RUN TIME 62.51244115829468

8 GPU, 120 shards on input, FastFile, FullyReplicated, 1 Instance
[1,0]<stdout>:  1/117 - 0s - loss: 0.0704 - accuracy: 0.9896 - batch: 1.0000
[1,0]<stdout>:RUN TIME 62.72578692436218

8 GPU, 120 shards on input, FastFile, ShardedByS3Key, 1 Instance
[1,0]<stdout>:  1/117 - 0s - loss: 0.0031 - accuracy: 1.0000 - batch: 1.0000
[1,0]<stdout>:RUN TIME 62.7252471446991


# KINDA WORKS BUT IT IS SLOW?!?!?!?!?!?
8 GPU x 2 Instances, 120 shards on input, FastFile, FullyReplicated, Batch 64
[1,0]<stdout>: 1/58 - 0s - loss: 0.3692 - accuracy: 0.8734 - batch: 1.0000
[1,0]<stdout>:RUN TIME 113.62855267524719

8 GPU x 2 Instances, 120 shards on input, FastFile, ShardedByS3Key, Batch 64
[1,0]<stdout>:30/58 - 45s - loss: 170.1098 - accuracy: 0.1032 - batch: 0.0000e+00
[1,0]<stdout>:RUN TIME 65.18587517738342


8 GPU, 120 shards on input, Pipe, ShardedByS3Key, 1 Instance
FAILED with error: "Internal:  CRC check on header failed."

possibly because only 1 GPU is visible!!!
"""
[1,7]<stdout>:7 Num GPUs: 1 [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[1,7]<stdout>:hvd.rank 7 7
[1,3]<stdout>:3 Num GPUs: 1 [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[1,3]<stdout>:hvd.rank 3 3
[1,6]<stdout>:6 Num GPUs: 1 [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[1,6]<stdout>:hvd.rank 6[1,6]<stdout>: 6
[1,2]<stdout>:2 Num GPUs: 1 [1,2]<stdout>:[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[1,2]<stdout>:hvd.rank 2[1,2]<stdout>: 2
[1,0]<stdout>:0 Num GPUs: 1 [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[1,0]<stdout>:hvd.rank 0 0
[1,1]<stdout>:1 Num GPUs: 1 [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[1,1]<stdout>:hvd.rank 1 1
[1,5]<stdout>:5 Num GPUs: 1 [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[1,5]<stdout>:hvd.rank 5 5
[1,4]<stdout>:4 Num GPUs: 1 [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[1,4]<stdout>:hvd.rank 4 4
"""
