Dataset size:

* Training Dataset - input: (1567893, 75), output: (1567893,)
* Validation Dataset - input: (174211, 75), output: (174211,)


In [2]:
import sagemaker
import os
import boto3
import sys
import pandas as pd
from IPython.core.display import display, HTML
from sagemaker.tensorflow import TensorFlow
from sagemaker.debugger import Rule, rule_configs, ProfilerConfig, ProfilerRule, FrameworkProfile, DetailedProfilingConfig, DataloaderProfilingConfig, PythonProfilingConfig, PythonProfiler, cProfileTimer, CollectionConfig, DebuggerHookConfig

In [3]:
# Resolve the AWS context shared by the cells below: region, execution role,
# SageMaker session and that session's default S3 bucket.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
# Reuse the session created above rather than constructing a second one.
bucket_name = sagemaker_session.default_bucket()

# Key prefix inside the bucket; the training inputs are read from
# s3://{bucket_name}/{prefix}/data/...
prefix = 'ipinyou-tf'
os.environ["AWS_REGION"] = region

print(f'Region:  {region}')
print(f'IAM Role:  {role}')
print(f'S3 Bucket:  {bucket_name}')

# Report the installed versions of the SageMaker SDK, Python, Boto3 and pandas
print(f'SageMaker Python SDK version:  {sagemaker.__version__}')
print(f'Python version:  {sys.version}')
print(f'Boto3 version : {boto3.__version__}')
print(f'Pandas Version:  {pd.__version__}')

Region:  us-east-1
IAM Role:  arn:aws:iam::431615879134:role/sagemaker-test-role
S3 Bucket:  sagemaker-us-east-1-431615879134
SageMaker Python SDK version:  2.99.0
Python version:  3.7.10 (default, Jun  4 2021, 14:48:32) 
[GCC 7.5.0]
Boto3 version : 1.24.27
Pandas Version:  1.3.5


In [22]:
# Supported values of config['data_type'] (compared lower-cased). For each one:
#   'train' / 'test' - folder under s3://{bucket_name}/{prefix}/data/ for that channel
#   'extra'          - additional keyword arguments passed to TrainingInput
_DATA_SOURCES = {
    # CSV files
    'csv': {
        'train': 'train',
        'test': 'test',
        'extra': {'content_type': 'csv'},
    },
    # group of parquet files
    'parquet': {
        'train': 'parquet_train/',
        'test': 'parquet_test/',
        'extra': {},
    },
    # group of tfrecord files
    'tfrecord': {
        'train': 'train_tfr',
        'test': 'test_tfr',
        'extra': {},
    },
    # group of parquet files, read with input_mode='FastFile'
    'parquetffm': {
        'train': 'parquet_train/',
        'test': 'parquet_test/',
        'extra': {'input_mode': 'FastFile'},
    },
    # group of tfrecord files, read with input_mode='FastFile'
    'tfrecordffm': {
        'train': 'train_tfr',
        'test': 'test_tfr',
        'extra': {'input_mode': 'FastFile'},
    },
}


def call_training(config):
    """Configure and launch one SageMaker TensorFlow training job.

    Parameters
    ----------
    config : dict
        Description of the job. Required keys:

        * ``'job_name'``  - used as the estimator's ``base_job_name`` and as the
          ``training_job_name_prefix`` of the stalled-training rule.
        * ``'compute'``   - SageMaker instance type (e.g. ``'ml.m5.2xlarge'``).
        * ``'data_type'`` - one of the keys of ``_DATA_SOURCES``
          (case-insensitive); selects the S3 input folders and input mode.
        * ``'code'``      - entry-point script inside the ``code`` directory.
        * ``'batchsize'`` - forwarded as the ``batchsize`` hyperparameter.

        Optional keys: ``'epochs'`` (default 5) and ``'lr'`` (default 4e-3).

    Returns
    -------
    str
        ``estimator.latest_training_job.job_name`` of the job that was started.

    Raises
    ------
    ValueError
        If ``config['data_type']`` is not a supported data type. The check runs
        before anything is sent to SageMaker.

    Notes
    -----
    Relies on the notebook-level names ``bucket_name``, ``prefix``, ``role``
    and ``sagemaker_session``.
    """
    print(config)

    # Fail fast on an unsupported data type instead of continuing with
    # undefined input channels.
    data_type = config['data_type'].lower()
    if data_type not in _DATA_SOURCES:
        raise ValueError(
            f"Unknown datatype specified: {config['data_type']!r}; "
            f"expected one of {sorted(_DATA_SOURCES)}"
        )
    source = _DATA_SOURCES[data_type]

    # configure debugger
    rules = [
        ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
        Rule.sagemaker(rule_configs.loss_not_decreasing()),
        Rule.sagemaker(rule_configs.overfit()),
        Rule.sagemaker(rule_configs.overtraining()),
        Rule.sagemaker(
            base_config=rule_configs.stalled_training_rule(),
            rule_parameters={
                # NOTE(review): threshold is presumably in seconds - confirm
                # against the SageMaker Debugger rule reference.
                "threshold": "1800",
                "stop_training_on_fire": "True",
                "training_job_name_prefix": config['job_name']
            },
            collections_to_save=[
                CollectionConfig(
                    name="losses",
                    parameters={
                        "save_interval": "500"
                    }
                )
            ]
        )
    ]
    debugger_hook_config = DebuggerHookConfig(
        hook_parameters={"save_interval": "2000"},  # steps
        collection_configs=[
            CollectionConfig(name="inputs"),
            CollectionConfig(name="outputs"),
            CollectionConfig(name="layers"),
            CollectionConfig(name="gradients")
        ])

    profiler_config = ProfilerConfig(
        system_monitor_interval_millis=200,
        framework_profile_params=FrameworkProfile()
    )

    # configure input data: every data type reads from the same bucket/prefix
    # and differs only in folder names and the extra TrainingInput arguments
    data_root = f's3://{bucket_name}/{prefix}/data'
    s3_input_train = sagemaker.inputs.TrainingInput(
        s3_data=f"{data_root}/{source['train']}",
        distribution='ShardedByS3Key',
        **source['extra']
    )
    s3_input_test = sagemaker.inputs.TrainingInput(
        s3_data=f"{data_root}/{source['test']}",
        distribution='ShardedByS3Key',
        **source['extra']
    )

    # specify hyper parameters (epochs / lr may be overridden through config)
    hyperparameters = {'epochs': config.get('epochs', 5),
                       'batchsize': config['batchsize'],
                       'lr': config.get('lr', 4e-3)
                      }

    # call training job
    estimator = TensorFlow(
        base_job_name=config['job_name'],
        source_dir="code",
        entry_point=config['code'],
        role=role,
        py_version="py37",
        framework_version="2.3.1",
        instance_count=1,
        instance_type=config['compute'],
        sagemaker_session=sagemaker_session,
        profiler_config=profiler_config,
        rules=rules,
        debugger_hook_config=debugger_hook_config,
        hyperparameters=hyperparameters
    )
    estimator.fit({'train': s3_input_train, 'validation': s3_input_test})
    print(estimator.latest_training_job.job_name)
    return estimator.latest_training_job.job_name

    

## Training with CSV

In [15]:
# CSV experiment grid: 2 entry-point scripts x 2 batch sizes x 4 instance types.
# The innermost loop varies the instance type, then the batch size, then the
# script, which fixes the order in which the jobs are submitted.
training_config = [
    {
        'job_name': f'ipinyou-{tag}-{short_name}-inmem{batch}',
        'compute': instance_type,
        'data_type': 'csv',
        'code': script,
        'batchsize': batch,
    }
    for tag, script in (
        ('csv', 'tf_profile_fit_csv_inmem.py'),
        ('csvdl', 'tf_profile_fit_csv.py'),
    )
    for batch in (2048, 4096)
    for short_name, instance_type in (
        ('m52xl', 'ml.m5.2xlarge'),
        ('c42xl', 'ml.c4.2xlarge'),
        ('g4dnxl', 'ml.g4dn.xlarge'),
        ('p32xl', 'ml.p3.2xlarge'),
    )
]

Note: the training jobs run sequentially, one after another.

In [None]:
# Submit every configured job in list order and collect the job name that
# call_training returns for each one.
namelist = []
for job_config in training_config:
    namelist.append(call_training(job_config))
    

{'job_name': 'ipinyou-csv-m52xl-inmem2048', 'compute': 'ml.m5.2xlarge', 'data_type': 'csv', 'code': 'tf_profile_fit_csv_inmem.py', 'batchsize': 2048}
2022-08-13 18:15:53 Starting - Starting the training job...
2022-08-13 18:16:18 Starting - Preparing the instances for trainingLossNotDecreasing: InProgress
Overfit: InProgress
Overtraining: InProgress
StalledTrainingRule: InProgress
ProfilerReport: InProgress
.........
2022-08-13 18:17:45 Downloading - Downloading input data
2022-08-13 18:17:45 Training - Downloading the training image...
2022-08-13 18:18:18 Training - Training image download completed. Training in progress.[34m2022-08-13 18:18:13.323088: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2022-08-13 18:18:13.327750: I tensorflow/core/profiler/internal/smprofiler_config_reader.cc:123] PID of the process that is writing to the timeline : 1[0m
[34m2022-08-13 18:18:13.328443: I tensorflow/core/profiler/internal/sm

In [11]:
# Write the collected job names to disk, one per line.
with open('trail_one.txt', 'w') as out_file:
    out_file.writelines(f"{job_name}\n" for job_name in namelist)

## Training with TF Record

In [26]:
# TFRecord experiment grid: 2 data types ('tfrecord', 'tfrecordffm') x 2 batch
# sizes x 4 instance types, all using the same entry-point script.
# The innermost loop varies the instance type, then the batch size, then the
# data type, which fixes the order in which the jobs are submitted.
training_config = [
    {
        'job_name': f'ipinyou-{tag}-{short_name}-{batch}',
        'compute': instance_type,
        'data_type': data_type,
        'code': 'tf_profile_fit_tfrecord.py',
        'batchsize': batch,
    }
    for tag, data_type in (
        ('tf', 'tfrecord'),
        ('tfffm', 'tfrecordffm'),
    )
    for batch in (2048, 4096)
    for short_name, instance_type in (
        ('m52xl', 'ml.m5.2xlarge'),
        ('c42xl', 'ml.c4.2xlarge'),
        ('g4dnxl', 'ml.g4dn.xlarge'),
        ('p32xl', 'ml.p3.2xlarge'),
    )
]

In [None]:
# Submit every configured job in list order and collect the job name that
# call_training returns for each one.
namelist = []
for job_config in training_config:
    namelist.append(call_training(job_config))
    

In [28]:
# Write the collected job names to disk, one per line.
with open('trail_two.txt', 'w') as out_file:
    out_file.writelines(f"{job_name}\n" for job_name in namelist)

## Training with Parquet

In [23]:
# Parquet experiment grid: 2 data types ('parquet', 'parquetffm') x 2 batch
# sizes x 4 instance types, all using the same entry-point script.
# The innermost loop varies the instance type, then the batch size, then the
# data type, which fixes the order in which the jobs are submitted.
training_config = [
    {
        'job_name': f'ipinyou-{tag}-{short_name}-{batch}',
        'compute': instance_type,
        'data_type': data_type,
        'code': 'tf_profile_fit_parquet.py',
        'batchsize': batch,
    }
    for tag, data_type in (
        ('pq', 'parquet'),
        ('pqffm', 'parquetffm'),
    )
    for batch in (2048, 4096)
    for short_name, instance_type in (
        ('m52xl', 'ml.m5.2xlarge'),
        ('c42xl', 'ml.c4.2xlarge'),
        ('g4dnxl', 'ml.g4dn.xlarge'),
        ('p32xl', 'ml.p3.2xlarge'),
    )
]

In [24]:
# Submit every configured job in list order and collect the job name that
# call_training returns for each one.
namelist = []
for job_config in training_config:
    namelist.append(call_training(job_config))
    

{'job_name': 'ipinyou-pq-m52xl-2048', 'compute': 'ml.m5.2xlarge', 'data_type': 'parquet', 'code': 'tf_profile_fit_parquet.py', 'batchsize': 2048}
2022-08-15 13:44:58 Starting - Starting the training job...
2022-08-15 13:45:22 Starting - Preparing the instances for trainingLossNotDecreasing: InProgress
Overfit: InProgress
Overtraining: InProgress
StalledTrainingRule: InProgress
ProfilerReport: InProgress
............
2022-08-15 13:47:30 Downloading - Downloading input data
2022-08-15 13:47:30 Training - Downloading the training image...
2022-08-15 13:48:00 Training - Training image download completed. Training in progress..[34m2022-08-15 13:47:55.022269: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2022-08-15 13:47:55.026051: I tensorflow/core/profiler/internal/smprofiler_config_reader.cc:123] PID of the process that is writing to the timeline : 1[0m
[34m2022-08-15 13:47:55.026732: I tensorflow/core/profiler/internal/sm

In [25]:
# Write the collected job names to disk, one per line.
with open('trail_three.txt', 'w') as out_file:
    out_file.writelines(f"{job_name}\n" for job_name in namelist)