# Required libraries

In [1]:
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker import KMeans
import io
import pandas as pd
import boto3
import numpy as np

# Get the data

In [None]:
# IAM role that SageMaker will use for the training job and the endpoint.
role = get_execution_role()

# S3 location of the input CSV.
BUCKET_NAME = 'prabhat-ml-virginia' # replace with your bucket name
KEY = 'diabetes/blog_synthetic/data/diabetes_data.csv' # replace with your object key

# Download the whole object into memory, then parse it as a CSV whose
# first row holds the column names.
s3 = boto3.client('s3')
response = s3.get_object(Bucket=BUCKET_NAME, Key=KEY)
response_body = response["Body"].read()
df = pd.read_csv(
    io.BytesIO(response_body),
    sep=",",
    header=0,
    low_memory=False,
)

# Configure the estimator

In [126]:
# Place to store the generated model. Derived from BUCKET_NAME so that
# changing the bucket in the data cell also redirects the training output,
# instead of silently writing to a separate hard-coded bucket.
output_location = 's3://{}/diabetes/blog_synthetic/output'.format(BUCKET_NAME)

# Built-in SageMaker k-means estimator: a single ml.c5.large training
# instance, partitioning the data into k=5 clusters.
# NOTE(review): SageMaker Python SDK v2 renamed train_instance_count /
# train_instance_type to instance_count / instance_type - confirm the
# installed SDK version before changing these keyword names.
kmeans = KMeans(role=role,
                train_instance_count=1,
                train_instance_type='ml.c5.large',
                output_path=output_location,
                k=5
               )

# Normalize data

Distance-based methods such as k-means are sensitive to the scale of each feature. We therefore standardize every numerical column (subtract its mean and divide by its standard deviation) so that distances along different feature columns are comparable.

In [127]:
# Z-score every column: subtract the column mean, divide by the column
# standard deviation (pandas aligns the per-column statistics by label).
column_means = df.mean()
column_stds = df.std()
df1 = (df - column_means) / column_stds

# Train the model

In [97]:
# The algorithm expects float32 input, so convert the normalized frame to a
# float32 numpy matrix before handing it to the estimator.
train_data = df1.to_numpy().astype(np.float32)

# Wrap the matrix as a record set and launch the training job.
training_records = kmeans.record_set(train_data)
kmeans.fit(training_records)

2020-05-16 05:19:20 Starting - Starting the training job...
2020-05-16 05:19:21 Starting - Launching requested ML instances.........
2020-05-16 05:20:52 Starting - Preparing the instances for training...
2020-05-16 05:21:40 Downloading - Downloading input data...
2020-05-16 05:22:21 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/16/2020 05:22:22 INFO 140425388091200] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'_enable_profiler': u'false', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'_kvstore': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'true', u'epochs': u'1', u'init_method': u'random', u'local_lloyd_tol': u'0.0001', u'local_lloyd_max_iter': u'300', u'_disable_wait_to_read'

# Deploy the model - create an inference endpoint

In [22]:
%%time
# Host the trained model behind a real-time endpoint backed by a single
# ml.t2.medium instance; the returned predictor is used for inference below.
# NOTE(review): the endpoint keeps running (and presumably billing) until it
# is deleted - consider adding a cleanup cell at the end of the notebook.
kmeans_predictor = kmeans.deploy(initial_instance_count=1, 
                                 instance_type='ml.t2.medium')

Using already existing model: kmeans-2020-05-16-03-23-14-106


-------------------!CPU times: user 300 ms, sys: 12.2 ms, total: 312 ms
Wall time: 9min 32s


# Do sample inference

In [103]:
# Per-column mean and standard deviation of the raw (un-normalized) data,
# as plain Python lists; new samples must be scaled with these same values.
data_mean = df.mean().tolist()
data_std = df.std().tolist()
print(data_mean, data_std, sep="\n")

[498.907, 50.0303, 29.4696, 2.289492000000002, 74.49937000000021]
[289.87433115711855, 28.657887384818498, 11.498821597442676, 1.306044393246921, 526.7579254124425]


In [95]:
# One raw observation, with values in the same column order as df.
test_sample = [758,64,36,2.83,50.28]

# Shape the sample as a single-row batch (1, n_features).
test_a1 = np.array(test_sample).reshape(1, -1)

# Apply the same z-score scaling that was used for the training data.
# The original cell normalized the list `test_sample` here, which discarded
# the reshape above and left a 1-D array; normalizing `test_a1` keeps the
# (1, n_features) batch shape.
test_a1 = (test_a1 - df.mean().to_numpy()) / df.std().to_numpy()

# The algorithm expects float32 input.
test_a1 = np.float32(test_a1)
test_a1

array([ 0.89381146,  0.4874644 ,  0.5679191 ,  0.41385117, -0.04597818],
      dtype=float32)

In [104]:
# Send the normalized sample to the deployed endpoint; the response is
# indexed per input record in the next cell.
result = kmeans_predictor.predict(test_a1)

In [129]:
# Take the first returned record and read the first value of its
# 'closest_cluster' label (stored as a float32 tensor).
cluster_to_which_data_belongs = result[0].label['closest_cluster'].float32_tensor.values[0]

In [130]:
# Display the cluster id assigned to the test sample.
cluster_to_which_data_belongs

1.0