<h1>C4 Solution</h1>

<h3>Get the data and copy it to S3</h3>

In [3]:
%%capture
!wget https://s3-us-west-1.amazonaws.com/udacity-aind/dog-project/dogImages.zip

In [2]:
%%capture
!unzip dogImages.zip

In [1]:
%%capture
!aws s3 cp dogImages s3://pro-4/dog-images --recursive

<h3>Install and import</h3>

In [2]:
%%capture
!pip install smdebug torch torchvision tqdm

In [3]:
import sagemaker
import boto3
from sagemaker.tuner import CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role
from sagemaker.debugger import Rule, DebuggerHookConfig, TensorBoardOutputConfig, CollectionConfig, ProfilerRule, rule_configs
from sagemaker.debugger import ProfilerConfig, FrameworkProfile
import os

<h3>Set up parameters, estimator, and tuner</h3>

In [4]:
hyperparameter_ranges = {
    "learning_rate": ContinuousParameter(0.001, 0.1),
    "batch_size": CategoricalParameter([32, 64, 128, 256, 512]),
}

role = sagemaker.get_execution_role()

objective_metric_name = "Test Loss"
objective_type = "Minimize"
metric_definitions = [{"Name": "Test Loss", "Regex": "Testing Loss: ([0-9\\.]+)"}]

In [5]:
estimator = PyTorch(
    entry_point="hpo.py",
    base_job_name='pytorch_dog_hpo',
    role=role,
    framework_version="1.4.0",
    instance_count=1,
    instance_type="ml.m5.xlarge",
    py_version='py3'
)

tuner = HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions,
    max_jobs=2,
    max_parallel_jobs=2,
    objective_type=objective_type
)

<h3>Fit the tuner</h3>

In [7]:
#os.environ['SM_CHANNEL_TRAINING']='s3://my-project-4-bucket/'
#os.environ['SM_MODEL_DIR']='s3://my-project-4-bucket/model/'
#os.environ['SM_OUTPUT_DATA_DIR']='s3://my-project-4-bucket/output/'
#tuner.fit({"training": "s3://my-project-4-bucket/"})

In [8]:
os.environ['SM_CHANNEL_TRAINING']='s3://pro-4/dog-images/'
os.environ['SM_MODEL_DIR']='s3://pro-4/dog-images/model/'
os.environ['SM_OUTPUT_DATA_DIR']='s3://pro-4/dog-images/output/'
tuner.fit({"training": "s3://pro-4/dog-images/"})

........................................................................................................................................................................................................................................................................................................!


<h3>Describe the tuning results</h3>


In [9]:
from sagemaker.analytics import HyperparameterTuningJobAnalytics

exp = HyperparameterTuningJobAnalytics(
  hyperparameter_tuning_job_name='pytorch-training-220212-1728')

jobs = exp.dataframe()

jobs.sort_values('FinalObjectiveValue', ascending=0)

Unnamed: 0,batch_size,learning_rate,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
1,"""32""",0.097414,pytorch-training-220212-1728-001-80445a1d,Completed,151.0,2022-02-12 17:31:15+00:00,2022-02-12 17:51:19+00:00,1204.0
0,"""64""",0.001316,pytorch-training-220212-1728-002-f90b8dec,Completed,113.0,2022-02-12 17:31:27+00:00,2022-02-12 17:51:48+00:00,1221.0


## Imp: If kernel dies, how to continue from a completed training job

In [8]:
#BetterTrainingJobName='pytorch-training-220209-1706'

In [None]:
#my_estimator = sagemaker.estimator.Estimator.attach(BetterTrainingJobName)


In [14]:
#my_estimator.hyperparameters()

In [15]:
#best_estimator=my_estimator

<h3>Prepare to perform Training on Best Estimator</h3>

In [10]:
best_estimator=tuner.best_estimator()


2022-02-12 17:51:48 Starting - Preparing the instances for training
2022-02-12 17:51:48 Downloading - Downloading input data
2022-02-12 17:51:48 Training - Training image download completed. Training in progress.
2022-02-12 17:51:48 Uploading - Uploading generated training model
2022-02-12 17:51:48 Completed - Training job completed


In [11]:
best_estimator.hyperparameters()

{'_tuning_objective_metric': '"Test Loss"',
 'batch_size': '"64"',
 'learning_rate': '0.0013163055386985181',
 'sagemaker_container_log_level': '20',
 'sagemaker_estimator_class_name': '"PyTorch"',
 'sagemaker_estimator_module': '"sagemaker.pytorch.estimator"',
 'sagemaker_job_name': '"pytorch_dog_hpo-2022-02-12-17-28-41-954"',
 'sagemaker_program': '"hpo.py"',
 'sagemaker_region': '"us-east-1"',
 'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-123669583115/pytorch_dog_hpo-2022-02-12-17-28-41-954/source/sourcedir.tar.gz"'}

In [12]:
hyperparameters = {"batch_size": int(best_estimator.hyperparameters()['batch_size'].replace('"', '')), \
                   "learning_rate": best_estimator.hyperparameters()['learning_rate']}
hyperparameters

{'batch_size': 64, 'learning_rate': '0.0013163055386985181'}

In [13]:
rules = [
    Rule.sagemaker(rule_configs.vanishing_gradient()),
    Rule.sagemaker(rule_configs.overfit()),
    Rule.sagemaker(rule_configs.overtraining()),
    Rule.sagemaker(rule_configs.poor_weight_initialization()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]

In [14]:
hook_config = DebuggerHookConfig(
    hook_parameters={
        "train.save_interval": "1",
        "eval.save_interval": "1"
    }
)

profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500, framework_profile_params=FrameworkProfile(num_steps=1)
)

<h2>Creating an Estimator</h2>

In [15]:
#adjust this cell to accomplish multi-instance training
estimator = PyTorch(
    entry_point='hpo.py',
    base_job_name='dog-pytorch',
    role=role,
    instance_count=2,
    instance_type='ml.m5.xlarge',
    framework_version='1.4.0',
    py_version='py3',
    hyperparameters=hyperparameters,
    ## Debugger and Profiler parameters
    rules = rules,
    debugger_hook_config=hook_config,
    profiler_config=profiler_config,
)

In [16]:
estimator.fit({"training": "s3://pro-4/dog-images/"}, wait=False)

<h2>Creating an Estimator - Multi-Instance Training,</h2>

In [35]:
###in this cell, create and fit an estimator using multi-instance training


In [17]:
containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/linear-learner:latest',
              'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:latest',
              'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/linear-learner:latest',
              'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/linear-learner:latest'}
container = containers[boto3.Session().region_name]

In [18]:
bucket = "pro-4"
prefix = "dog-images"
common_training_params = {
    "RoleArn": role,
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "ResourceConfig": {
        "InstanceCount": 2,
        "InstanceType": "ml.c4.2xlarge",#,#"ml.t3.medium",#"ml.c5.xlarge",#"ml.m4.xlarge",
        "VolumeSizeInGB": 10
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    #"S3Uri": "s3://{}/train/".format(bucket)
                    "S3Uri": "s3://{}/{}/train/".format(bucket, prefix)
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    #"S3Uri": "s3://{}/valid/".format(bucket),
                    "S3Uri": "s3://{}/{}/valid/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None"
        }

    ],
    "OutputDataConfig": {
        "S3OutputPath": "s3://{}/{}/".format(bucket, prefix)
        #"S3OutputPath": "s3://{}/".format(bucket)
    },
    "HyperParameters": {
        "feature_dim": "25",
        "mini_batch_size": "500",
        "predictor_type": "regressor",
        "epochs": "2",
        "num_models": "32",
        "loss": "absolute_loss"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 60 * 60
    }
}

In [20]:
print("s3://{}/{}/".format(bucket, prefix))

s3://pro-4/dog-images/


In [22]:
import io
import time
import copy
import json
import sys
import sagemaker.amazon.common as smac
import os

sharded_job = 'pytorch-training-220212-1728-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

print("Job name is:", sharded_job)

sharded_training_params = copy.deepcopy(common_training_params)
sharded_training_params['TrainingJobName'] = sharded_job
sharded_training_params['InputDataConfig'][0]['DataSource']['S3DataSource']['S3DataDistributionType'] = 'ShardedByS3Key'

Job name is: pytorch-training-220212-1728-2022-02-12-17-56-12


In [25]:
replicated_job = 'pytorch-training-220212-1728-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

print("Job name is:", replicated_job)

replicated_training_params = copy.deepcopy(common_training_params)
replicated_training_params['TrainingJobName'] = replicated_job
replicated_training_params['InputDataConfig'][0]['DataSource']['S3DataSource']['S3DataDistributionType'] = 'FullyReplicated'

Job name is: pytorch-training-220212-1728-2022-02-12-17-56-45


In [26]:
%%time

region = boto3.Session().region_name
sm = boto3.Session().client('sagemaker')


#sm.create_training_job(**sharded_training_params)

sm.create_training_job(**replicated_training_params)

status = sm.describe_training_job(TrainingJobName=replicated_job)['TrainingJobStatus']
print(status)
sm.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=replicated_job)
status = sm.describe_training_job(TrainingJobName=replicated_job)['TrainingJobStatus']
print("Training job ended with status: " + status)
if status == 'Failed':
    message = sm.describe_training_job(TrainingJobName=replicated_job)['FailureReason']
    print('Training failed with the following error: {}'.format(message))
    raise Exception('Training job failed')
    


InProgress


WaiterError: Waiter TrainingJobCompletedOrStopped failed: Waiter encountered a terminal failure state: For expression "TrainingJobStatus" we matched expected path: "Failed"

In [None]:
#print('Sharded:', sm.describe_training_job(TrainingJobName=sharded_job)['TrainingJobStatus'])
print('Replicated:', sm.describe_training_job(TrainingJobName=replicated_job)['TrainingJobStatus'])

<h2>Deployment</h2>

In [61]:
model_location=estimator.model_data

In [28]:
import sagemaker
import boto3
from sagemaker.tuner import CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role
from sagemaker.debugger import Rule, DebuggerHookConfig, TensorBoardOutputConfig, CollectionConfig, ProfilerRule, rule_configs
from sagemaker.debugger import ProfilerConfig, FrameworkProfile

from sagemaker.pytorch import PyTorchModel
from sagemaker.predictor import Predictor


In [29]:
jpeg_serializer = sagemaker.serializers.IdentitySerializer("image/jpeg")
json_deserializer = sagemaker.deserializers.JSONDeserializer()


class ImagePredictor(Predictor):
    def __init__(self, endpoint_name, sagemaker_session):
        super(ImagePredictor, self).__init__(
            endpoint_name,
            sagemaker_session=sagemaker_session,
            serializer=jpeg_serializer,
            deserializer=json_deserializer,
        )

In [30]:
pytorch_model = PyTorchModel(model_data=model_location, role=role, entry_point='infernce2.py',py_version='py3',
                             framework_version='1.4',
                             predictor_cls=ImagePredictor)

In [31]:
predictor = pytorch_model.deploy(initial_instance_count=1, instance_type='ml.m5.large')


---------------!

In [None]:
import requests
#request_dict={ "url": "https://cdn1-www.cattime.com/assets/uploads/2011/12/file_2744_british-shorthair-460x290-460x290.jpg" }
request_dict={ "url": "https://s3.amazonaws.com/cdn-origin-etr.akc.org/wp-content/uploads/2017/11/20113314/Carolina-Dog-standing-outdoors.jpg" }

img_bytes = requests.get(request_dict['url']).content
type(img_bytes)

In [None]:
from PIL import Image
import io
Image.open(io.BytesIO(img_bytes))

In [None]:
response=predictor.predict(img_bytes, initial_args={"ContentType": "image/jpeg"})

In [None]:
import json
response2=predictor.predict(json.dumps(request_dict), initial_args={"ContentType": "application/json"})

In [None]:
type(response2[0][0])

In [None]:
response2[0]

In [None]:
import torch
import numpy as np
np.argmax(response, 1)