# Save sample data for testing

In [21]:
import boto3
import os
from tqdm import tqdm

# Shared module-level S3 client: both list_all_objects and the download loop
# in this cell call methods on this object.
s3 = boto3.client("s3")

# Helper: list every object under an S3 prefix, following pagination.
def list_all_objects(bucket, prefix, client=None):
    """Return the metadata dicts of all objects under ``prefix`` in ``bucket``.

    The ``list_objects_v2`` paginator is used so that results spanning more
    than one page are all collected.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix used to filter the listing.
        client: Optional object exposing ``get_paginator`` (e.g. a boto3 S3
            client). When omitted, the notebook-level ``s3`` client is used,
            which preserves the original two-argument call.

    Returns:
        A list containing every entry of each page's ``'Contents'`` list, in
        the order the pages were returned. Pages without a ``'Contents'`` key
        contribute nothing, so an empty listing yields ``[]``.
    """
    # Resolve the global lazily so callers that inject a client never touch it.
    s3_client = s3 if client is None else client
    paginator = s3_client.get_paginator('list_objects_v2')

    return [
        obj
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix)
        for obj in page.get('Contents', [])
    ]

bucket = 'udacity-deeplearning-project'
prefix = 'data'
sample_prefix = 'sample'

# Opt-in switch: when True, each sample object is additionally copied to
# s3://{bucket}/{sample_prefix}/{key}. Off by default, so this cell only
# downloads files to local disk.
# NOTE(review): later cells read from s3://{bucket}/{sample_prefix}/data/...;
# presumably that prefix was populated by a one-off run of this copy — confirm.
COPY_TO_SAMPLE_PREFIX = False

all_files = list_all_objects(bucket, prefix)

# Keep only objects of one class folder. Keys ending in "/" are zero-byte
# folder placeholders; downloading them would target a directory path and fail.
sample_files = [
    file_meta
    for file_meta in all_files
    if '001.Affenpinscher' in file_meta["Key"]
    and not file_meta["Key"].endswith("/")
]

for file_meta in tqdm(sample_files):
    key = file_meta["Key"]

    # Mirror the S3 key layout locally: the object is saved at a relative path
    # equal to its key, so the parent directories must exist first.
    dirname = os.path.dirname(key)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    s3.download_file(bucket, key, key)

    if COPY_TO_SAMPLE_PREFIX:
        # The dict form of CopySource avoids having to URL-encode the key.
        s3.copy_object(
            Bucket=bucket,
            CopySource={"Bucket": bucket, "Key": key},
            Key=f"{sample_prefix}/{key}",
        )

100%|██████████| 80/80 [00:06<00:00, 12.80it/s]


## Testing Estimator Locally Prior to Deployment to ECR

In [22]:
from sagemaker.estimator import Estimator
from sagemaker import get_execution_role
import sagemaker
import subprocess

# Rebuild the training image before the local run. check=True raises
# CalledProcessError if the build script exits non-zero, so the cell stops
# instead of silently training against a previously built image.
subprocess.run(['sh', 'docker-build.sh'], check=True)

role = get_execution_role()

bucket = 'udacity-deeplearning-project'
sample_prefix = 'sample'
s3_output_location = f"s3://{bucket}/outputs"

# Hyperparameters passed to the training container.
hyperparameters = {
    'num-classes': 133,
    'batch-size': 32,
    'lr': 0.005070970373087015
}

# instance_type='local' runs the locally built image through SageMaker local
# mode instead of launching a managed training instance.
estimator = Estimator(
    image_uri='udacity-sagemaker-hpo',
    role=role,
    instance_count=1,
    instance_type='local',
    output_path=s3_output_location,
    hyperparameters=hyperparameters
)

# Channels are read from the local files downloaded earlier in the notebook.
# NOTE(review): the "train" channel points at ./data/test, not ./data/train —
# presumably to keep the local smoke test small; confirm this is intentional.
model_inputs = {
    "train": "file://./data/test",
    "test": "file://./data/valid"
}

estimator.fit(inputs=model_inputs, logs=True, wait=True)

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Login Succeeded


INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


sha256:6e54afc9f27d48bdfc45a27f9662fa1a075ae00ad672a43792131965265c53f5
REPOSITORY                                                           TAG       IMAGE ID       CREATED                  SIZE
udacity-sagemaker-hpo                                                latest    6e54afc9f27d   Less than a second ago   3.92GB
598308907998.dkr.ecr.us-east-1.amazonaws.com/udacity-sagemaker-hpo   latest    a351c2adb476   33 minutes ago           3.92GB
<none>                                                               <none>    6006234684a7   34 minutes ago           3.92GB
<none>                                                               <none>    fba9a75aee2f   37 minutes ago           3.92GB


INFO:sagemaker:Creating training-job with name: udacity-sagemaker-hpo-2024-11-24-20-19-25-597
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.local.image:'Docker Compose' is not installed. Proceeding to check for 'docker-compose' CLI.
INFO:sagemaker.local.image:'Docker Compose' found using Docker Compose CLI.
INFO:sagemaker.local.local_session:Starting training job
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.
INFO:sagemaker.local.i

 Container m4yq0lzjnr-algo-1-0hih5  Creating
 Container m4yq0lzjnr-algo-1-0hih5  Created
Attaching to m4yq0lzjnr-algo-1-0hih5
m4yq0lzjnr-algo-1-0hih5  | sed: can't read changehostname.c: No such file or directory
m4yq0lzjnr-algo-1-0hih5  | [01m[Kgcc:[m[K [01;31m[Kerror: [m[Kchangehostname.c: No such file or directory
m4yq0lzjnr-algo-1-0hih5  | [01m[Kgcc:[m[K [01;31m[Kfatal error: [m[Kno input files
m4yq0lzjnr-algo-1-0hih5  | compilation terminated.
m4yq0lzjnr-algo-1-0hih5  | [01m[Kgcc:[m[K [01;31m[Kerror: [m[Kchangehostname.o: No such file or directory
m4yq0lzjnr-algo-1-0hih5  | ERROR: ld.so: object '/libchangehostname.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
m4yq0lzjnr-algo-1-0hih5  | 2024-11-24 20:19:26,432 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
m4yq0lzjnr-algo-1-0hih5  | 2024-11-24 20:19:26,435 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus 

INFO:sagemaker.local.image:===== Job Complete =====


## Testing Deployed Estimator

In [23]:
# Run the project's push-container.sh script (builds, then pushes the image to ECR).
# NOTE(review): the captured output for this cell reports
# "failed to read dockerfile: open Dockerfile: no such file or directory" and
# then the push still proceeds — presumably pushing a previously built image.
# Confirm the script's working directory and consider making it exit on error.
!sh push-container.sh

push-container.sh: line 1: fg: no job control
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
[1A[1B[0G[?25l[+] Building 0.0s (0/1)                                          docker:default
[?25h[1A[0G[?25l[+] Building 0.1s (1/1) FINISHED                                 docker:default
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 2B                                         0.0s
[0m[?25hERROR: failed to solve: failed to read dockerfile: open Dockerfile: no such file or directory
The push refers to repository [598308907998.dkr.ecr.us-east-1.amazonaws.com/udacity-sagemaker-hpo]

[1Bbf18a086: Preparing 
[1Bc6a5eeb1: Preparing 
[1B0a42ae7f: Preparing 
[1B38d8788c: Preparing 
[1B95045e04: Preparing 
[1Be9708ca1: Preparing 
[1B8f6060c6: Preparing 
[1Ba3c12226: Preparing 
[1B62daa95e: Preparing 
[1B8fe1cb59: Preparing 
[1B061a5b0d: Preparing 
[1B3ff1

In [24]:
def _image_channel(s3_uri):
    """Build a TrainingInput for ``s3_uri`` declared as image content."""
    return sagemaker.inputs.TrainingInput(
        s3_data=s3_uri,
        content_type="application/x-image"
    )


# S3-backed channels for the remote training job: both live under the sample
# prefix of the project bucket; the "test" channel reads the "valid" folder.
model_inputs = {
    "train": _image_channel(f"s3://{bucket}/{sample_prefix}/data/train/"),
    "test": _image_channel(f"s3://{bucket}/{sample_prefix}/data/valid/"),
}

In [25]:
# Hyperparameters handed to the estimator for the remote training job.
# Keys are hyphenated, so they are spelled out as (name, value) pairs.
hyperparameters = dict([
    ('num-classes', 133),
    ('batch-size', 32),
    ('lr', 0.005070970373087015),
])

In [26]:
# Estimator settings for the managed run: the image is pulled from the ECR
# repository and training runs on a single ml.m5.xlarge instance.
remote_estimator_config = dict(
    image_uri='598308907998.dkr.ecr.us-east-1.amazonaws.com/udacity-sagemaker-hpo',
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    hyperparameters=hyperparameters,
)
estimator = Estimator(**remote_estimator_config)

# Launch the training job on the S3-backed channels and stream its logs.
estimator.fit(inputs=model_inputs, logs=True)

INFO:sagemaker:Creating training-job with name: udacity-sagemaker-hpo-2024-11-24-20-20-00-150


2024-11-24 20:20:00 Starting - Starting the training job...
2024-11-24 20:20:27 Starting - Preparing the instances for training...
2024-11-24 20:21:05 Downloading - Downloading the training image......
2024-11-24 20:21:46 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34msed: can't read changehostname.c: No such file or directory[0m
[34mgcc: error: changehostname.c: No such file or directory[0m
[34mgcc: fatal error: no input files[0m
[34mcompilation terminated.[0m
[34mgcc: error: changehostname.o: No such file or directory[0m
[34mERROR: ld.so: object '/libchangehostname.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.[0m
[34m2024-11-24 20:21:52,907 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-11-24 20:21:52,909 sagemaker-training-t

In [27]:
# Low-level SageMaker API client, used here to look up details of the job
# launched by the estimator above.
sagemaker_client = boto3.client("sagemaker")

# Name of the most recent training job started through `estimator`
training_job_name = estimator.latest_training_job.name
print(f"Training Job Name: {training_job_name}")

# Read the S3 location of the model artifact from the job description;
# `model_artifact` is consumed by the batch-inference section further down.
response = sagemaker_client.describe_training_job(TrainingJobName=training_job_name)
model_artifact = response["ModelArtifacts"]["S3ModelArtifacts"]
print(f"Model Artifact Location: {model_artifact}")


Training Job Name: udacity-sagemaker-hpo-2024-11-24-20-20-00-150
Model Artifact Location: s3://sagemaker-us-east-1-598308907998/udacity-sagemaker-hpo-2024-11-24-20-20-00-150/output/model.tar.gz


# Testing Batch Inference

In [28]:
from sagemaker.pytorch import PyTorchModel

# Wrap the trained artifact in a PyTorch serving model.
pytorch_model = PyTorchModel(
    # Serving container: the PyTorch 1.8.1 CPU inference image.
    image_uri="763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.8.1-cpu-py36-ubuntu18.04",
    framework_version="1.8.1",
    py_version="py3",
    # Inference script supplied alongside the model.
    entry_point="image/inference.py",
    # Artifact location looked up from the training job description.
    model_data=model_artifact,
    # Reuse the IAM role the estimator was created with.
    role=estimator.role,
)


In [29]:
# Create a batch-transform handle for the model. The output location is derived
# from `bucket` (rather than repeating the bucket name) so it stays consistent
# with the other S3 paths built in this notebook.
transformer = pytorch_model.transformer(
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{bucket}/inference/",  # S3 location for output
)


INFO:sagemaker:Repacking model artifact (s3://sagemaker-us-east-1-598308907998/udacity-sagemaker-hpo-2024-11-24-20-20-00-150/output/model.tar.gz), script artifact (None), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-598308907998/pytorch-inference-2024-11-24-20-26-25-124/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-inference-2024-11-24-20-26-29-479


In [30]:
# Run batch inference over the images under the sample prefix. The input URI is
# built from `bucket` and `sample_prefix`, matching how the training channels
# are addressed, instead of repeating the literal bucket and prefix.
transformer.transform(
    data=f"s3://{bucket}/{sample_prefix}/data/batch/",
    content_type="application/x-image",
    wait=True
)

print("Batch transform completed. Check the output in the specified S3 bucket.")

INFO:sagemaker:Creating transform job with name: pytorch-inference-2024-11-24-20-26-32-239


..............................[34m2024-11-24 20:31:35,030 [INFO ] main org.pytorch.serve.servingsdk.impl.PluginsManager - Initializing plugins manager...[0m
[34m2024-11-24 20:31:35,237 [INFO ] main org.pytorch.serve.ModelServer - [0m
[34mTorchserve version: 0.4.0[0m
[34mTS Home: /opt/conda/lib/python3.6/site-packages[0m
[34mCurrent directory: /[0m
[34mTemp directory: /home/model-server/tmp[0m
[34mNumber of GPUs: 0[0m
[34mNumber of CPUs: 2[0m
[34mMax heap size: 952 M[0m
[34mPython executable: /opt/conda/bin/python3.6[0m
[34mConfig file: /etc/sagemaker-ts.properties[0m
[34mInference address: http://0.0.0.0:8080[0m
[34mManagement address: http://0.0.0.0:8080[0m
[34mMetrics address: http://127.0.0.1:8082[0m
[34mModel Store: /.sagemaker/ts/models[0m
[34mInitial Models: model.mar[0m
[34mLog dir: /logs[0m
[34mMetrics dir: /logs[0m
[34mNetty threads: 0[0m
[34mNetty client threads: 0[0m
[34mDefault workers per model: 2[0m
[34mBlacklist Regex: N/A[0m


# END OF NOTEBOOK