# Dog Type Image Classification using AWS SageMaker
In this project, we use AWS Sagemaker to train a pretrained model that can perform image classification by using the Sagemaker profiling, debugger, hyperparameter tuning and other good ML engineering practices.


In [1]:
# TODO: Install any packages that you might need
# For instance, you will need the smdebug package
!pip install smdebug

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting smdebug
  Downloading smdebug-1.0.12-py2.py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.1/270.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pyinstrument==3.4.2
  Downloading pyinstrument-3.4.2-py2.py3-none-any.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.3/83.3 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyinstrument-cext>=0.2.2
  Downloading pyinstrument_cext-0.2.4.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pyinstrument-cext
  Building wheel for pyinstrument-cext (setup.py) ... [?25ldone
[?25h  Created wheel for pyinstrument-cext: filename=pyinstrument_cext-0.2.4-cp310-cp310-linux_x86_64.whl size=6298 sha256=5137e8a0c3055f7aef8dc032b46b86f80684e6994df132b3437d3c1074908ead
  Stored in directory: /home/ec2-use

In [1]:
from PIL import Image
import os
import sagemaker
import boto3
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)
#import smdebug
from sagemaker.debugger import Rule, DebuggerHookConfig
from sagemaker.image_uris import retrieve
import pandas as pd

## Dataset
TODO: Explain what dataset you are using for this project. Maybe even give a small overview of the classes, class distributions etc that can help anyone not familiar with the dataset get a better understand of it.

In [13]:
#Fetch and upload the data to AWS S3

# Command to download and unzip data
!wget https://s3-us-west-1.amazonaws.com/udacity-aind/dog-project/dogImages.zip
!unzip dogImages.zip

--2023-02-22 10:26:57--  https://s3-us-west-1.amazonaws.com/udacity-aind/dog-project/dogImages.zip
Resolving s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)... 52.219.117.128
Connecting to s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)|52.219.117.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1132023110 (1.1G) [application/zip]
Saving to: ‘dogImages.zip’


2023-02-22 10:27:22 (44.4 MB/s) - ‘dogImages.zip’ saved [1132023110/1132023110]

Archive:  dogImages.zip
   creating: dogImages/
   creating: dogImages/test/
   creating: dogImages/train/
   creating: dogImages/valid/
   creating: dogImages/test/001.Affenpinscher/
  inflating: dogImages/test/001.Affenpinscher/Affenpinscher_00003.jpg  
  inflating: dogImages/test/001.Affenpinscher/Affenpinscher_00023.jpg  
  inflating: dogImages/test/001.Affenpinscher/Affenpinscher_00036.jpg  
  inflating: dogImages/test/001.Affenpinscher/Affenpinscher_00047.jpg  
  inflating: dogImages/test/001.Affenpin

In [4]:
#get max size length among all dog imagrs to use in the hyperparameters tuning "resize"
filenames = []
for path, subdirs, files in os.walk("dogImages/train"):
    for name in files:
        filenames.append(os.path.join(path, name))
max_size = max([Image.open(f, 'r').size for f in filenames])
max_side_length = max(max_size)
max_side_length

4278

In [None]:
#to save dogImages into local S3:
sagemaker_session = sagemaker.Session()
inputs = sagemaker_session.upload_data(path="dogImages", bucket="dog-images-uploaded-from-sagemaker", key_prefix='data')

## Hyperparameter Tuning
**TODO:** This is the part where you will finetune a pretrained model with hyperparameter tuning. Remember that you have to tune a minimum of two hyperparameters. However you are encouraged to tune more. You are also encouraged to explain why you chose to tune those particular hyperparameters and the ranges.

**Note:** You will need to use the `hpo.py` script to perform hyperparameter tuning.

https://docs.aws.amazon.com/sagemaker/latest/dg/IC-TF-Hyperparameter.html

In [5]:
sess = sagemaker.Session()
role = get_execution_role()

imageclassification = sagemaker.estimator.Estimator(
    retrieve(region=sess.boto_region_name, framework="image-classification", version="1"), #https://sagemaker.readthedocs.io/en/stable/api/utility/image_uris.html
    role,
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    output_path='s3://dog-images-uploaded-from-sagemaker/training-job',
    sagemaker_session=sess,
)
#all parameters should be str
imageclassification.set_hyperparameters(
    num_layers='18',
    num_classes='133',
    num_training_samples='7980',#60 images fer each class * 133 classes
    mini_batch_size='128',
    epochs='10',
    top_k="2",
    precision_dtype="float32",
    augmentation_type="crop",
    #optimizer='sgd'
    #resize= str(max_side_length), #The number of pixels in the shortest side of an image after resizing it for training. If the parameter is not set, then the training data is used without resizing. The parameter should be larger than both the width and height components of image_shape to prevent training failure.
    #use_pretrained_model= "1",
    #multi_label='0'
)

In [6]:
#TODO: Declare your HP ranges, metrics etc.
#https://docs.aws.amazon.com/sagemaker/latest/dg/IC-Hyperparameter.html
#https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-define-ranges.html
#https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html
'''
ClientError: An error occurred (ValidationException) when calling the CreateHyperParameterTuningJob operation: The hyperparameter tuning job that you requested has the following untunable hyperparameters: 
[batch_size, model_name, lr, test_batch_size, num_layers, epochs]. 
For the algorithm, 811284229777.dkr.ecr.us-east-1.amazonaws.com/image-classification:1, you can tune only 
[beta_1, weight_decay, beta_2, optimizer, eps, learning_rate, gamma, momentum, mini_batch_size]. Delete untunable hyperparameters.
'''
hyperparameters_ranges = {
    #"eps": IntegerParameter(10, 30), gives error, to see the error: go to sagemaker-->training-->training jobs
    "eps": ContinuousParameter(0, 1),
    #"epochs": IntegerParameter(10, 30),
    #"batch_size": CategoricalParameter([32, 64, 128, 256, 512]),
    #"batch_size": CategoricalParameter(['32', '64', '128', '256', '512']),
    #"test_batch_size": CategoricalParameter([32, 64, 128, 256, 512]),
    #"test_batch_size": CategoricalParameter(['32', '64', '128', '256', '512']),
    #"lr": ContinuousParameter(0.001, 0.1),
    "learning_rate": ContinuousParameter(0.001, 0.1),
    #"multi_label": "0", #False, #each image has only one label
    "momentum": ContinuousParameter(0, 1),
    #"num_layers": CategoricalParameter([18, 34, 50, 101, 152, 200]),
    #"num_layers": CategoricalParameter(['18', '34', '50', '101', '152', '200']),
    "optimizer": CategoricalParameter(["adam", "sgd", "rmsprop"]), #"nesterov", "adagrad" , "adadelta"]),
    "weight_decay": ContinuousParameter(0.0, 0.99),
    #"model_name": CategoricalParameter(['resnet', 'alexnet', 'vgg', 'squeezenet', 'densenet', 'inception'])
}

#objective_metric_name = "average test loss"
#objective_type = "Minimize"
#metric_definitions = [{"Name": "average test loss", "Regex": "Test set: Average loss: ([0-9\\.]+)"}]


#objective_metric_name = "Accuracy"
#objective_type = "Maximize"
#metric_definitions = [{"Name": "Accuracy", "Regex": "Test set: Average loss: ([0-9\\.]+), Accuracy: ([0-9]+)/([0-9]+)"}]

objective_metric_name = "validation:accuracy"
objective_type = "Maximize"

In [7]:
#TODO: Create estimators for your HPs
'''
estimator = PyTorch(
    entry_point= "hpo.py",
    py_version= "py36",
    role= role,
    framework_version= "1.8",
    instance_count= 1,
    instance_type= "ml.m5.large",
    output_path= "s3://dog-images-uploaded-from-sagemaker/training-job"
)
'''
tuner = HyperparameterTuner(
    #estimator=estimator,
    estimator=imageclassification,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameters_ranges,
    #metric_definitions=metric_definitions,
    max_jobs=4,
    #max_parallel_jobs=2,
    max_parallel_jobs=1, #if more than one, gives Resource limit error
    objective_type=objective_type,
)

In [9]:
#inputs = "s3://dog-images-uploaded-from-sagemaker/data"
print(inputs)

s3://dog-images-uploaded-from-sagemaker/data


A .lst file is a tab-separated file with three columns that contains a list of image files. The first column specifies the image index, the second column specifies the class label index for the image, and the third column specifies the relative path of the image file.

In [10]:
def create_lst_file(train_or_valid, path):
    df_list = pd.DataFrame(columns=['class', 'img_path'])
    for (root,dirs,files) in os.walk(path+train_or_valid, topdown=True):
        for file in files:
            index = len(df_list)
            class_ = file.replace(file.split('_')[-1],"") #remove number at the end of the file name and keep class name
            class_ = class_[:-1] #remove last underscore
            path_img = root.replace("dogImages/","").replace(train_or_valid+'/',"") +'/'+ file #no folder dogImages in S3 bucket
            class_number = os.path.dirname(path_img).split('.')[0].replace(train_or_valid+'/',"")
            #df_list.loc[index] = [class_, path_img]
            df_list.loc[index] = [class_number, path_img]
    print(df_list.head())
    df_list.to_csv(path+train_or_valid+'.lst', sep='\t', index=True, header=False)
    

In [11]:
create_lst_file('train', 'dogImages/')
create_lst_file('valid', 'dogImages/')

  class                                           img_path
0   073  073.German_wirehaired_pointer/German_wirehaire...
1   073  073.German_wirehaired_pointer/German_wirehaire...
2   073  073.German_wirehaired_pointer/German_wirehaire...
3   073  073.German_wirehaired_pointer/German_wirehaire...
4   073  073.German_wirehaired_pointer/German_wirehaire...
  class                                           img_path
0   073  073.German_wirehaired_pointer/German_wirehaire...
1   073  073.German_wirehaired_pointer/German_wirehaire...
2   073  073.German_wirehaired_pointer/German_wirehaire...
3   073  073.German_wirehaired_pointer/German_wirehaire...
4   073  073.German_wirehaired_pointer/German_wirehaire...



I tried this function and got this error:ClientError: lst should at least has three parts, but only has 2 parts for ['073.German_wirehaired_pointer/German_wirehaired_pointer_05036.jpg', 'German_wirehaired_pointer_'], exit code: 2
format: path_of_image /t class
'''python
def create_lst_file(train_or_valid_or_test): 
    with open("dogImages/"+train_or_valid_or_test+"/"+train_or_valid_or_test+".lst", "a") as f:
        for dirs in os.listdir("dogImages/"+train_or_valid_or_test):
            folder = dirs
            if(folder != train_or_valid_or_test+'.lst'):
                for file in os.listdir("dogImages/"+train_or_valid_or_test+"/"+folder):
                    class_ = file.replace(file.split('_')[-1],"")
                    f.writelines([folder+"/"+file+'\t'+class_+'\n'])
    #f.close()
'''

create_lst_file('train')
create_lst_file('valid')

In [12]:
#upload to s3
#https://sagemaker.readthedocs.io/en/stable/api/utility/session.html
#https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.upload_file
#https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.S3Transfer.ALLOWED_UPLOAD_ARGS
sagemaker_session = sagemaker.Session()
train_lst = sagemaker_session.upload_data("dogImages/train.lst", bucket="dog-images-uploaded-from-sagemaker", key_prefix='data/lst_files', extra_args={'ContentType':'application/x-image'})
valid_lst = sagemaker_session.upload_data("dogImages/valid.lst", bucket="dog-images-uploaded-from-sagemaker", key_prefix='data/lst_files', extra_args={'ContentType':'application/x-image'})

In [13]:
print(train_lst)
print(valid_lst)

s3://dog-images-uploaded-from-sagemaker/data/lst_files/train.lst
s3://dog-images-uploaded-from-sagemaker/data/lst_files/valid.lst


In [14]:
#https://sagemaker-examples.readthedocs.io/en/latest/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-lst-format-highlevel.html

s3_input_train = sagemaker.inputs.TrainingInput(s3_data=inputs+'/train', content_type='application/jpeg')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=inputs+'/valid', content_type='application/jpeg')
train_lst = sagemaker.inputs.TrainingInput(s3_data=train_lst, content_type='application/x-image')#change the type of train_lst and attach content type
valid_lst = sagemaker.inputs.TrainingInput(s3_data=valid_lst, content_type='application/x-image')#change the type of valid_lst and attach content type



channels = {
    'train':s3_input_train,
    'validation':s3_input_validation,
    'train_lst': train_lst,
    'validation_lst': valid_lst
}

In [15]:
print(channels)

{'train': <sagemaker.inputs.TrainingInput object at 0x7f4a8de5c6d0>, 'validation': <sagemaker.inputs.TrainingInput object at 0x7f4a8de5c610>, 'train_lst': <sagemaker.inputs.TrainingInput object at 0x7f4a8de5c5e0>, 'validation_lst': <sagemaker.inputs.TrainingInput object at 0x7f4a8de7e6b0>}


In [16]:
print(role)
#arn:aws:iam::306523776574:role/service-role/AmazonSageMaker-ExecutionRole-20230220T065503
#let this role has full access to s3

arn:aws:iam::306523776574:role/service-role/AmazonSageMaker-ExecutionRole-20230220T065503


In [100]:
# https://stackoverflow.com/questions/54432761/amazon-sagemaker-hyperparameter-tuning-error-for-built-in-algorithm-using-the-py
tuner.fit(channels, include_cls_metadata=False, wait=True, job_name="hyperparameters-tuning-job-33", log=True) 

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


..................................................................................................................................................................................................................................................................................................................!


To see errors for individual trials: go to sagemaker-->training-->training jobs

## Model Profiling and Debugging

In [2]:
tuning_job_name = "hyperparameters-tuning-job-33"

tuner_parent_metrics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)
if not tuner_parent_metrics.dataframe().empty:
    df_parent = tuner_parent_metrics.dataframe().sort_values(
        ["FinalObjectiveValue"], ascending=False
    )

df_parent

Unnamed: 0,eps,learning_rate,momentum,optimizer,weight_decay,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,0.944849,0.046427,0.82949,adam,0.684004,hyperparameters-tuning-job-33-004-852e0853,Completed,0.029948,2023-02-25 15:33:27+00:00,2023-02-25 15:38:10+00:00,283.0
3,0.66971,0.054404,0.39747,adam,0.988658,hyperparameters-tuning-job-33-001-ac2a98c4,Completed,0.018229,2023-02-25 15:14:04+00:00,2023-02-25 15:21:37+00:00,453.0
1,0.362381,0.00309,0.079847,sgd,0.977134,hyperparameters-tuning-job-33-003-cb68f8fc,Completed,0.013021,2023-02-25 15:28:13+00:00,2023-02-25 15:33:01+00:00,288.0
2,0.210714,0.00227,0.093384,rmsprop,0.824209,hyperparameters-tuning-job-33-002-0cb7a40c,Completed,0.009115,2023-02-25 15:22:58+00:00,2023-02-25 15:27:46+00:00,288.0


In [17]:
import bokeh
import bokeh.io

bokeh.io.output_notebook()
from bokeh.plotting import figure, show
from bokeh.models import HoverTool

import pandas as pd

df_parent_objective_value = df_parent[df_parent["FinalObjectiveValue"] > -float("inf")]

p = figure(
    width=900,
    height=400,
    x_axis_type="datetime",
    x_axis_label="datetime",
    y_axis_label=objective_metric_name,
)
p.circle(
    source=df_parent_objective_value, x="TrainingStartTime", y="FinalObjectiveValue", color="black"
)

show(p)

The best score was 0.029948 which is considered too much bad!

In [25]:
sagemaker.Session().logs_for_job("hyperparameters-tuning-job-33-004-852e0853") #this is the best estimator

2023-02-25 15:38:12 Starting - Found matching resource for reuse
2023-02-25 15:38:12 Downloading - Downloading input data
2023-02-25 15:38:12 Training - Training image download completed. Training in progress.
2023-02-25 15:38:12 Uploading - Uploading generated training model
2023-02-25 15:38:12 Completed - Resource released due to keep alive period expiry[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34mNvidia gpu devices, drivers and cuda toolkit versions (only available on hosts with GPU):[0m
[34mSat Feb 25 15:34:09 2023       [0m
[34m+-----------------------------------------------------------------------------+[0m
[34m| NVIDIA-SMI 515.65.07    Driver Version: 515.65.07    CUDA Version: 11.7     |[0m
[34m|-------------------------------+----------------------+----------------------+[0m
[34m| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |[0m
[34m| Fan  Temp  Perf  Pwr:Us

It is noted that the last epoch didn't score the best validation-accuracy

In [None]:
# TODO: Create and fit an estimator

estimator = # TODO: Your estimator here

In [None]:
# TODO: Plot a debugging output.

**TODO**: Is there some anomalous behaviour in your debugging output? If so, what is the error and how will you fix it?  
**TODO**: If not, suppose there was an error. What would that error look like and how would you have fixed it?

In [None]:
# TODO: Display the profiler output

## Model Deploying

In [28]:
#Deploy your model to an endpoint
attached_estimator = sagemaker.estimator.Estimator.attach("hyperparameters-tuning-job-33-004-852e0853")
predictor=attached_estimator.deploy(initial_instance_count=1, instance_type="ml.m5.large", endpoint_name="Dog-Classification-Endpoint", use_compiled_model=False, wait=True) # TODO: Add your deployment configuration like instance type and number of instances


2023-02-25 15:38:12 Starting - Found matching resource for reuse
2023-02-25 15:38:12 Downloading - Downloading input data
2023-02-25 15:38:12 Training - Training image download completed. Training in progress.
2023-02-25 15:38:12 Uploading - Uploading generated training model
2023-02-25 15:38:12 Completed - Resource released due to keep alive period expiry
---------!

In [54]:
# https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html
#https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html
#https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-online-explainability-create-endpoint.html#clarify-online-explainability-create-endpoint-enable
'''ValidationError: An error occurred (ValidationError) when calling the InvokeEndpoint operation: Endpoint Dog-Classification-Endpoint does not support explanations. Please define an explainer in your endpoint configuration and try again.'''
import io
for (root, dirs, files) in os.walk("dogImages/test"):
    for file in files:
        path = root+os.path.dirname(file)+"/"+file
        with open(path, "rb") as f:
            image = Image.open(f)  
            with io.BytesIO() as output:
                image.save(output, format='JPEG')
                image_bytes = output.getvalue()
                response = boto3.client('sagemaker-runtime').invoke_endpoint(EndpointName="Dog-Classification-Endpoint", Body=image_bytes, ContentType='image/jpeg', Accept='application/json')
                print(path, response)

dogImages/test/073.German_wirehaired_pointer/German_wirehaired_pointer_05027.jpg {'ResponseMetadata': {'RequestId': '3fa7d948-a652-452c-99b9-c909c65b8c86', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '3fa7d948-a652-452c-99b9-c909c65b8c86', 'x-amzn-invoked-production-variant': 'AllTraffic', 'date': 'Mon, 27 Feb 2023 00:54:49 GMT', 'content-type': 'application/json', 'content-length': '2913'}, 'RetryAttempts': 0}, 'ContentType': 'application/json', 'InvokedProductionVariant': 'AllTraffic', 'Body': <botocore.response.StreamingBody object at 0x7f4a84ec8b20>}
dogImages/test/073.German_wirehaired_pointer/German_wirehaired_pointer_05060.jpg {'ResponseMetadata': {'RequestId': 'e183c897-0a22-40f3-8dcf-4533272f4f0c', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'e183c897-0a22-40f3-8dcf-4533272f4f0c', 'x-amzn-invoked-production-variant': 'AllTraffic', 'date': 'Mon, 27 Feb 2023 00:54:49 GMT', 'content-type': 'application/json', 'content-length': '2923'}, 'RetryAttempts

In [55]:
#Clean -up
predictor.delete_endpoint()

# Export logs into pdf/html file

In [6]:
!pip install timedelta

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting timedelta
  Downloading timedelta-2020.12.3.tar.gz (1.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: timedelta
  Building wheel for timedelta (setup.py) ... [?25ldone
[?25h  Created wheel for timedelta: filename=timedelta-2020.12.3-py3-none-any.whl size=1555 sha256=64d6c6e856062c633831bbff3c59fcbfd0e95f1e3fa7e63f482bec03eda4d7fe
  Stored in directory: /home/ec2-user/.cache/pip/wheels/8a/d1/d4/185632835c73a24d14d7989c1b37e5efcda78093173b7e6d4d
Successfully built timedelta
Installing collected packages: timedelta
Successfully installed timedelta-2020.12.3


In [47]:
'''
https://docs.aws.amazon.com/cli/latest/reference/logs/create-export-task.html
--from (long)

The start time of the range for the request, expressed as the number of milliseconds after Jan 1, 1970 00:00:00 UTC . Events with a timestamp earlier than this time are not exported.
--to (long)

The end time of the range for the request, expressed as the number of milliseconds after Jan 1, 1970 00:00:00 UTC . Events with a timestamp later than this time are not exported.

You must specify a time that is not earlier than when this log group was created.
'''
import datetime, timedelta, time
#jan 1, 1970 is from 53 years and 2 months = 53*365.25 + 31 days in jan + 28 days in feb + 2 days in march = 18868 + 60 = 18941.25 days
#end_time = datetime.datetime.now()-datetime.timedelta(days=19000)
#end_time = datetime.datetime.now()
#print(end_time)
#end_time_milliseconds = int(end_time.timestamp()*1000)
#end_time_milliseconds = int(datetime.datetime.now().astimezone().replace(microsecond=0).timestamp())
#start_time = end_time - datetime.timedelta(days=6)
#start_time_milliseconds = int(start_time.timestamp()*1000) 
#start_time_milliseconds = int((end_time-datetime.timedelta(days=10)).timestamp())
#print(start_time)
#print(end_time_milliseconds)
#print(start_time_milliseconds)
# Set the start and end time for the time range
start_time = datetime.datetime(2023, 2, 23, 0, 0)
end_time = datetime.datetime(2023, 2, 27, 0, 0)#start_time-datetime.timedelta(hours=1)

# Convert the start and end time to Unix epoch time values in milliseconds
from_time = int(time.mktime(start_time.timetuple()) * 1000)
to_time = int(time.mktime(end_time.timetuple()) * 1000)
!aws logs create-export-task --task-name export-cloud-watch-to-S3 --log-group-name /aws/sagemaker/TrainingJobs --from $from_time --to $to_time --destination dog-images-uploaded-from-sagemaker --destination-prefix logs/

{
    "taskId": "98091c28-09cd-49ff-ba0a-850805402f85"
}


In [55]:
!aws logs create-export-task --task-name export-cloud-watch-to-S3-text --log-group-name /aws/sagemaker/TrainingJobs --from $from_time --to $to_time --destination dog-images-uploaded-from-sagemaker --destination-prefix logs/ --output text --query taskId

263822bc-a81d-495d-b73f-f28b92ffd8f3


In [56]:
!aws logs describe-export-tasks --task-id 263822bc-a81d-495d-b73f-f28b92ffd8f3

{
    "exportTasks": [
        {
            "taskId": "263822bc-a81d-495d-b73f-f28b92ffd8f3",
            "taskName": "export-cloud-watch-to-S3-text",
            "logGroupName": "/aws/sagemaker/TrainingJobs",
            "from": 1677218408819,
            "to": 1677456000000,
            "destination": "dog-images-uploaded-from-sagemaker",
            "destinationPrefix": "logs/",
            "status": {
                "code": "COMPLETED",
                "message": "Completed successfully"
            },
            "executionInfo": {
                "creationTime": 1677758666343,
                "completionTime": 1677758670042
            }
        }
    ]
}
