In [1]:
import copy
import os
import re
import time
from time import gmtime, strftime

import boto3
import sagemaker as sage
from sagemaker import get_execution_role

# Execution role; handed to the Estimator further down as `role=`.
role = get_execution_role()

# Region name reported by the default boto3 session.
region = boto3.Session().region_name

# Put your s3 bucket name here -- customize to the bucket where you will store data.
bucket = 'keras-sagemaker-train'

# HTTPS form of the bucket address, printed as a quick sanity check.
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)
print(bucket_path)

https://s3-us-east-1.amazonaws.com/keras-sagemaker-train


In [2]:
%%sh

# Build the training image from Dockerfile.cpu and push it to ECR as
# <account>.dkr.ecr.<region>.amazonaws.com/<algorithm_name>:latest.

# The name of our algorithm
algorithm_name=keras-sagemaker-train

chmod +x src/*

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
region=${region:-us-west-2}

registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it. The region is passed
# explicitly so the us-west-2 fallback above also applies to these calls.
if ! aws ecr describe-repositories --region "${region}" --repository-names "${algorithm_name}" > /dev/null 2>&1
then
    aws ecr create-repository --region "${region}" --repository-name "${algorithm_name}" > /dev/null
fi

# Log Docker in to ECR. Prefer `get-login-password`, which hands the token to
# `docker login` over stdin instead of placing it on a command line. Fall back
# to the legacy `get-login` (removed in AWS CLI v2) when the installed CLI is
# too old to know the newer subcommand.
password=$(aws ecr get-login-password --region "${region}" 2> /dev/null)
if [ -n "${password}" ]
then
    printf '%s' "${password}" | docker login --username AWS --password-stdin "${registry}"
else
    $(aws ecr get-login --region "${region}" --no-include-email)
fi

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

# On a SageMaker Notebook Instance, the docker daemon may need to be restarted in order
# to detect your network configuration correctly.  (This is a known issue.)
if [ -d "/home/ec2-user/SageMaker" ]; then
  sudo service docker restart
fi

# Stop on a failed build: otherwise the tag/push below would publish whatever
# stale image is already tagged ${algorithm_name} locally.
docker build -t "${algorithm_name}" -f Dockerfile.cpu . || exit 1
# Comment the above line and uncomment the below line if you wish to run on a GPU
#docker build -t "${algorithm_name}" -f Dockerfile.gpu . || exit 1

docker tag "${algorithm_name}" "${fullname}"

docker push "${fullname}"

Login Succeeded
Stopping docker: [  OK  ]
Starting docker:	.[  OK  ]
Sending build context to Docker daemon  292.4kB
Step 1/6 : FROM phenompeople/centos-python:3.6.3
 ---> e3d7d8ca4a30
Step 2/6 : ENV PATH="/opt/program:${PATH}"
 ---> Using cache
 ---> 6a15e41a54a0
Step 3/6 : ADD requirements-cpu.txt /
 ---> Using cache
 ---> bfc923425753
Step 4/6 : RUN pip3 install -r requirements-cpu.txt
 ---> Using cache
 ---> f2091e806962
Step 5/6 : COPY src /opt/program
 ---> Using cache
 ---> 35c77aee8b92
Step 6/6 : WORKDIR /opt/program
 ---> Using cache
 ---> 894523ae9b5d
Successfully built 894523ae9b5d
Successfully tagged keras-sagemaker-train:latest
The push refers to repository [850021735523.dkr.ecr.us-east-1.amazonaws.com/keras-sagemaker-train]
fb4251a33e09: Preparing
82dd33955078: Preparing
b170cb69bfc9: Preparing
952e0784686f: Preparing
65c06ae44bbd: Preparing
f194f1dd3e8f: Preparing
ea264623c568: Preparing
c4cd48200f79: Preparing
bcc97fbfc9e1: Preparing
f194f1dd3e8f: Waiting
ea264623c5

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



In [3]:
# S3 locations derived from the bucket: where the training data is read from
# and where the Estimator is told to write its output.
data_location = 's3://{}/data'.format(bucket)
output_location = 's3://{}/output'.format(bucket)

print("data location - " + data_location)
print("output location - " + output_location)

data location - s3://keras-sagemaker-train/data
output location - s3://keras-sagemaker-train/output


In [4]:
# SageMaker session shared by the following cells (account/region lookup and
# the Estimator). `sage` is imported with the other imports in the first cell.
sess = sage.Session()

In [5]:
# Rebuild the ECR image URI from the session's own region and account id.
# No tag is appended here; presumably this resolves to the :latest tag pushed
# by the build cell -- confirm if other tags are ever published.
region = sess.boto_session.region_name
account = sess.boto_session.client('sts').get_caller_identity()['Account']
image = '{}.dkr.ecr.{}.amazonaws.com/keras-sagemaker-train'.format(account, region)

In [6]:
# Hyperparameters passed to the Estimator for the training run.
hyperparameters = {
    "batch_size": 128,
    "epochs": 30,
}

In [7]:
# Estimator for the custom training image: a single ml.c5.2xlarge instance,
# with output_path pointing at output_location.
classifier = sage.estimator.Estimator(
    image_name=image,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c5.2xlarge',
    hyperparameters=hyperparameters,
    output_path=output_location,
    sagemaker_session=sess,
)

In [8]:
# Start the training job, passing data_location as the input data location.
classifier.fit(data_location)

2019-06-12 08:34:47 Starting - Starting the training job...
2019-06-12 08:34:48 Starting - Launching requested ML instances......
2019-06-12 08:35:56 Starting - Preparing the instances for training...
2019-06-12 08:36:37 Downloading - Downloading input data
2019-06-12 08:36:37 Training - Downloading the training image....
[31m2019-06-12 08:37:12.363170: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA[0m
[31m2019-06-12 08:37:12.392455: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 3000000000 Hz[0m
[31m2019-06-12 08:37:12.393789: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x3f52440 executing computations on platform Host. Devices:[0m
[31m2019-06-12 08:37:12.393806: I tensorflow/compiler/xla/service/service.cc:158]   StreamExecutor device (0): <undefined>, <undefined>[0m
[31mUsing TensorFlow backend.[0m
[31mInstructions for upda

[31mEpoch 18/30

 128/8000 [..............................] - ETA: 0s - loss: 0.2149 - acc: 0.9297[0m
[31m1024/8000 [==>...........................] - ETA: 0s - loss: 0.2398 - acc: 0.9209[0m
[31mEpoch 19/30

 128/8000 [..............................] - ETA: 0s - loss: 0.2917 - acc: 0.9219[0m
[31m1152/8000 [===>..........................] - ETA: 0s - loss: 0.2325 - acc: 0.9219[0m
[31mEpoch 20/30

 128/8000 [..............................] - ETA: 0s - loss: 0.1857 - acc: 0.9375[0m
[31m1152/8000 [===>..........................] - ETA: 0s - loss: 0.2174 - acc: 0.9280[0m
[31mEpoch 21/30

 128/8000 [..............................] - ETA: 0s - loss: 0.2034 - acc: 0.9141[0m
[31m1152/8000 [===>..........................] - ETA: 0s - loss: 0.1792 - acc: 0.9410[0m
[31mEpoch 22/30
[0m
[31m 128/8000 [..............................] - ETA: 0s - loss: 0.3044 - acc: 0.8984[0m
[31m1152/8000 [===>..........................] - ETA: 0s - loss: 0.2028 - acc: 0.9384[0m
[31mEpoch 23/30


2019-06-12 08:37:40 Uploading - Uploading generated training model
2019-06-12 08:37:40 Completed - Training job completed
Billable seconds: 70
