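This notebook is a refreshed version of the AWS Machine Learning blog post [Preprocess input data before making predictions using Amazon SageMaker inference pipelines and Scikit-learn](https://aws.amazon.com/blogs/machine-learning/preprocess-input-data-before-making-predictions-using-amazon-sagemaker-inference-pipelines-and-scikit-learn/), updated to replace deprecated SageMaker Python SDK APIs and similar outdated details.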

In [1]:
# S3 bucket and key prefix used by this notebook
s3_bucket = "dans-ml"  # update this to your bucket name
prefix = "Scikit-LinearLearner-pipeline-abalone-example"  # key prefix inside the bucket

In [2]:
import sagemaker
from sagemaker import get_execution_role

# SageMaker session object reused by later cells (upload, estimators, predictor).
sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
# The session is passed explicitly so the role lookup goes through the same
# session as the rest of the notebook instead of an implicit second one.
role = get_execution_role(sagemaker_session=sagemaker_session)

In [3]:
!wget --directory-prefix=./abalone_data https://s3-us-west-2.amazonaws.com/sparkml-mleap/data/abalone/abalone.csv

--2020-12-30 06:07:22--  https://s3-us-west-2.amazonaws.com/sparkml-mleap/data/abalone/abalone.csv
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.218.178.136
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.218.178.136|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191873 (187K) [binary/octet-stream]
Saving to: ‘./abalone_data/abalone.csv.3’


2020-12-30 06:07:22 (28.6 MB/s) - ‘./abalone_data/abalone.csv.3’ saved [191873/191873]



In [4]:
WORK_DIRECTORY = 'abalone_data'

# Local file to upload and the key prefix it is stored under in the bucket
local_csv_path = f'{WORK_DIRECTORY}/abalone.csv'
train_key_prefix = f'{prefix}/train'

# Upload the CSV; the returned value is reused by the later fit/transform cells
train_input = sagemaker_session.upload_data(
    path=local_csv_path,
    bucket=s3_bucket,
    key_prefix=train_key_prefix,
)


In [25]:
from sagemaker.sklearn.estimator import SKLearn

script_path = 'preprocess.py'

# Scikit-learn estimator whose entry point is the preprocessing script
sklearn_preprocessor = SKLearn(
    entry_point=script_path,
    framework_version='0.23-1',
    instance_type='ml.c4.xlarge',
    role=role,
    sagemaker_session=sagemaker_session,
)

# Fit the preprocessor, passing the uploaded CSV as the 'train' channel
preprocessing_channels = {'train': train_input}
sklearn_preprocessor.fit(preprocessing_channels)

2020-12-30 07:32:00 Starting - Starting the training job...
2020-12-30 07:32:24 Starting - Launching requested ML instancesProfilerReport-1609313520: InProgress
......
2020-12-30 07:33:29 Starting - Preparing the instances for training.........
2020-12-30 07:34:45 Downloading - Downloading input data...
2020-12-30 07:35:26 Training - Downloading the training image..[34m2020-12-30 07:35:42,495 sagemaker-training-toolkit INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-12-30 07:35:42,496 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-12-30 07:35:42,506 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-12-30 07:35:43,146 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-12-30 07:35:43,160 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-12-30 07:35:43,171 sagemaker-training-t

In [26]:
# Build a batch Transformer from the fitted SKLearn estimator.
# It requests text/csv output and assembles the results line by line.
transformer = sklearn_preprocessor.transformer(
    instance_count=1,
    instance_type="ml.m4.xlarge",
    assemble_with="Line",
    accept="text/csv",
)

In [27]:
# Run the raw training CSV through the fitted preprocessor as a batch transform job
transformer.transform(data=train_input, content_type='text/csv')

transform_job_name = transformer.latest_transform_job.job_name
print(f'Waiting for transform job: {transform_job_name}')
transformer.wait()

# Output location of the transform job; used as the Linear Learner training input
preprocessed_train = transformer.output_path

..............................
[34m2020-12-30 07:41:31,160 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2020-12-30 07:41:31,162 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2020-12-30 07:41:31,163 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redire

In [28]:
import boto3
from sagemaker.image_uris import retrieve
# Look up the built-in Linear Learner container image for the default boto3 session's region
ll_image = retrieve(framework='linear-learner', region=boto3.Session().region_name)

In [29]:
# Where the Linear Learner training job writes its model artifacts
s3_ll_output_key_prefix = "ll_training_output"
s3_ll_output_location = f's3://{s3_bucket}/{prefix}/{s3_ll_output_key_prefix}/ll_model'

# Generic Estimator wrapping the Linear Learner container image
ll_estimator = sagemaker.estimator.Estimator(
    image_uri=ll_image,
    role=role,
    instance_count=1,
    instance_type='ml.m4.2xlarge',
    volume_size=20,
    max_run=3600,
    input_mode='File',
    output_path=s3_ll_output_location,
    sagemaker_session=sagemaker_session,
)

# NOTE(review): feature_dim=10 presumably matches the number of columns the
# preprocessing step emits per row -- confirm against preprocess.py.
ll_estimator.set_hyperparameters(
    feature_dim=10,
    predictor_type='regressor',
    mini_batch_size=32,
)

# Training channel: the preprocessed CSV output of the batch transform job
ll_train_data = sagemaker.inputs.TrainingInput(
    s3_data=preprocessed_train,
    distribution='FullyReplicated',
    content_type='text/csv',
    s3_data_type='S3Prefix',
)

data_channels = {'train': ll_train_data}
ll_estimator.fit(inputs=data_channels, logs=True)

The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


2020-12-30 07:41:53 Starting - Starting the training job...
2020-12-30 07:42:16 Starting - Launching requested ML instancesProfilerReport-1609314113: InProgress
......
2020-12-30 07:43:16 Starting - Preparing the instances for training......
2020-12-30 07:44:17 Downloading - Downloading input data...
2020-12-30 07:44:53 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[12/30/2020 07:44:59 INFO 139634832111424] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'b

In [30]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
import boto3
from time import gmtime, strftime

# A UTC timestamp (second resolution) is appended to the model and endpoint names
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = f'inference-pipeline-{timestamp_prefix}'
endpoint_name = f'inference-pipeline-ep-{timestamp_prefix}'

# Pipeline stages, listed as: scikit-learn preprocessing model, then Linear Learner model
scikit_learn_inferencee_model = sklearn_preprocessor.create_model()
linear_learner_model = ll_estimator.create_model()
pipeline_stages = [scikit_learn_inferencee_model, linear_learner_model]

sm_model = PipelineModel(name=model_name, role=role, models=pipeline_stages)

# Deploy the pipeline model to a single-instance endpoint
sm_model.deploy(
    initial_instance_count=1,
    instance_type='ml.c4.xlarge',
    endpoint_name=endpoint_name,
)

---------------!

In [31]:
import json

from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer

# One raw abalone record in CSV form, plus its known ring count for comparison
payload = 'M, 0.44, 0.365, 0.125, 0.516, 0.2155, 0.114, 0.155'
actual_rings = 10

# Client for the deployed pipeline endpoint; the payload is sent as CSV
predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=CSVSerializer())

# The endpoint replies with raw JSON bytes such as
# b'{"predictions": [{"score": ...}]}'; print them unchanged first.
response = predictor.predict(payload)
print(response)

# Decode the score so it can be read next to the known label
# (previously actual_rings was defined but never used).
predicted_rings = json.loads(response)['predictions'][0]['score']
print('predicted rings: {:.2f} (actual rings: {})'.format(predicted_rings, actual_rings))

# NOTE(review): the endpoint stays deployed after this cell; remember to delete it
# when finished (e.g. predictor.delete_endpoint()) -- confirm against your SDK version.

b'{"predictions": [{"score": 9.528051376342773}]}'
