# Import libraries

In [16]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker import KMeans
import pandas as pd
import io
from sklearn.preprocessing import MinMaxScaler
import argparse
import os
import warnings
from sagemaker.processing import ProcessingInput, ProcessingOutput
import numpy as np

# Set basic config

In [2]:
# Session context: the notebook's AWS region and the execution role that is
# handed to the processor and estimator further down.
region = boto3.session.Session().region_name
role = get_execution_role()

# Low-level S3 client used for the direct object reads/downloads later on.
s3 = boto3.client('s3')

# Location of the raw input CSV.
BUCKET = 'prabhat-ml'  # replace with your bucket name
KEY = 'diabetes/blog_synthetic/data/diabetes_data.csv'  # replace with your object key

input_data = 's3://{}/{}'.format(BUCKET, KEY)

# Preprocess data

In [3]:
# Processor that runs a scikit-learn script as a Processing job on a single
# ml.m5.xlarge instance, under the notebook's execution role.
sklearn_processor = SKLearnProcessor(
    framework_version='0.20.0',
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
)

In [4]:
%%time

from sagemaker.processing import ProcessingInput, ProcessingOutput

sklearn_processor.run(code='./preprocessing.py',
                      inputs=[ProcessingInput(
                        source=input_data,
                        destination='/opt/ml/processing/input')],
                      outputs=[ProcessingOutput(output_name='train_data',
                                                source='/opt/ml/processing/train'),
                               ProcessingOutput(output_name='scaler',
                                                source='/opt/ml/processing/scaler')], # We will also use this to transform inference data
                     )


Job Name:  sagemaker-scikit-learn-2020-05-25-21-16-53-909
Inputs:  [{'InputName': 'input-1', 'S3Input': {'S3Uri': 's3://prabhat-ml/diabetes/blog_synthetic/data/diabetes_data.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-107995894928/sagemaker-scikit-learn-2020-05-25-21-16-53-909/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train_data', 'S3Output': {'S3Uri': 's3://sagemaker-us-east-2-107995894928/sagemaker-scikit-learn-2020-05-25-21-16-53-909/output/train_data', 'LocalPath': '/opt/ml/processing/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'scaler', 'S3Output': {'S3Uri': 's3://sagemaker-us-east-2-10799589492

In [5]:
# Describe the most recent processing job and pull out the S3 URI of each
# named output.
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

output_config = preprocessing_job_description['ProcessingOutputConfig']

# Map output name -> S3 URI. Outputs without an 'S3Output' entry are skipped.
output_uris = {
    job_output['OutputName']: job_output['S3Output']['S3Uri']
    for job_output in output_config['Outputs']
    if 'S3Output' in job_output
}

# Look the two expected outputs up explicitly so that a missing or renamed
# output fails right here with a KeyError naming it, instead of surfacing as a
# NameError in a later cell.
preprocessed_training_data = output_uris['train_data']
preprocessed_scaler_data = output_uris['scaler']

# Get preprocessed (transformed) training data

In [6]:
# Show where the processing job stored the training data and the scaler.
print(preprocessed_training_data, preprocessed_scaler_data, sep='\n')

s3://sagemaker-us-east-2-107995894928/sagemaker-scikit-learn-2020-05-25-21-16-53-909/output/train_data
s3://sagemaker-us-east-2-107995894928/sagemaker-scikit-learn-2020-05-25-21-16-53-909/output/scaler


In [7]:
# Split the training-data URI (s3://<bucket>/<prefix>) into the bucket name and
# the object key of the transformed CSV that sits under that prefix.
S3_SCHEME_LEN = len('s3://')
bucket_end = preprocessed_training_data.find('/', S3_SCHEME_LEN)
BUCKET = preprocessed_training_data[S3_SCHEME_LEN:bucket_end]
# Skip the scheme, the bucket name and the '/' that follows it.
key_prefix = preprocessed_training_data[S3_SCHEME_LEN + len(BUCKET) + 1:]
KEY = key_prefix + '/diabetes_data_transformed.csv'

In [9]:
# Fetch the transformed CSV from S3 and parse it straight from memory; the file
# is read without a header row.
response = s3.get_object(Bucket=BUCKET, Key=KEY)
response_body = response["Body"].read()
df = pd.read_csv(
    io.BytesIO(response_body),
    header=None,
    delimiter=",",
    low_memory=False,
)

# Configure the SageMaker KMeans estimator

In [13]:
# S3 prefix where the generated model is stored.
output_location = 's3://prabhat-ml/diabetes/blog_synthetic/output'

# KMeans estimator set up for k=5 clusters, trained on one ml.c5.2xlarge instance.
kmeans = KMeans(
    role=role,
    k=5,
    train_instance_type='ml.c5.2xlarge',
    train_instance_count=1,
    output_path=output_location,
)

# Train the model

In [17]:
%%time

train_data = df.to_numpy()
train_data = np.float32(train_data) # alogorithm expects float32
kmeans.fit(kmeans.record_set(train_data))

2020-05-25 21:24:48 Starting - Starting the training job...
2020-05-25 21:24:50 Starting - Launching requested ML instances......
2020-05-25 21:25:55 Starting - Preparing the instances for training......
2020-05-25 21:27:11 Downloading - Downloading input data
2020-05-25 21:27:11 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/25/2020 21:27:26 INFO 140341438072640] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'_enable_profiler': u'false', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'_kvstore': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'true', u'epochs': u'1', u'init_method': u'random', u'local_lloyd_tol': u'0.0001', u'local_lloyd_max_iter': u'300', u'_disable_wait_to_read': u'false', u'extra_center_f

# Deploy the model - create an inference endpoint

In [18]:
%%time
kmeans_predictor = kmeans.deploy(initial_instance_count=1, 
                                 instance_type='ml.t2.medium')

---------------!CPU times: user 248 ms, sys: 9.98 ms, total: 258 ms
Wall time: 7min 32s


# Initiate sample inference

In [20]:
# One raw (not yet scaled) observation with six feature values; it is reshaped
# and passed through the downloaded scaler before being sent to the endpoint.
# NOTE(review): presumably the values are in the same column order as the
# training CSV — confirm against preprocessing.py.
test_sample = [291, 97, 41, 9, 0.82, 22.56]

# Download scaler for transforming the inference data

In [37]:
# Split the scaler URI (s3://<bucket>/<prefix>) into the bucket name and the
# object key of the scaler file stored under that prefix.
# The bucket boundary is located in the scaler URI itself. The previous version
# searched the training-data URI instead, which only produced the right bucket
# because both outputs happen to share one.
scaler_bucket_and_prefix = preprocessed_scaler_data[len('s3://'):]
BUCKET, _, scaler_prefix = scaler_bucket_and_prefix.partition('/')
KEY = scaler_prefix + '/scaler.gz'

# Save the scaler next to the notebook so it can be loaded locally.
s3.download_file(BUCKET, KEY, 'scaler.gz')

# Load the scaler

In [39]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone joblib package and fall back to the vendored
# copy only on old scikit-learn installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE: joblib.load unpickles the file, which can execute arbitrary code. Only
# load artifacts you produced yourself (here: the file downloaded from the
# preprocessing job's 'scaler' output).
scaler = joblib.load("scaler.gz")

# Transform inference data

In [40]:
# Shape the sample as a single row with one column per feature. The feature
# count is inferred (-1) rather than hard-coded, and the scaler is left to
# validate it.
t_data = np.array(test_sample).reshape(1, -1)
# Apply the same scaling that was fitted during preprocessing.
t_data = scaler.transform(t_data)
# Cast to float32, matching the dtype used for the training data.
t_data = np.float32(t_data)

In [41]:
# Display the scaled, float32 sample that will be sent to the endpoint.
# NOTE(review): the saved output below shows 5 columns although test_sample
# holds 6 values — the output looks stale; re-run the notebook to confirm.
t_data

array([[0.758517  , 0.64285713, 0.6666667 , 0.44288224, 0.0026005 ]],
      dtype=float32)

In [42]:
# Send the scaled sample to the deployed endpoint. Per the saved output below,
# each returned record carries a "closest_cluster" and a "distance_to_cluster" label.
result = kmeans_predictor.predict(t_data)

In [43]:
# Display the prediction returned by the endpoint.
result

[label {
   key: "closest_cluster"
   value {
     float32_tensor {
       values: 0.0
     }
   }
 }
 label {
   key: "distance_to_cluster"
   value {
     float32_tensor {
       values: 0.15986494719982147
     }
   }
 }]