### EXPERIMENTAL

In [1]:
# Local scratch directory where downloaded artifacts are stored.
base_dir = '/tmp/clustering'
# Remote location hosting the furniture dataset files.
dataset_dir = 'https://spock.cloud/ai-workshop/furniture'

In [2]:
import sagemaker
from sagemaker import get_execution_role

# Execution role returned by the SageMaker SDK; handed to the KMeans estimator below.
role = get_execution_role()

# The session is used here to resolve the default S3 bucket that holds the
# training data and the model artifacts.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

INFO:sagemaker:Created S3 bucket: sagemaker-us-east-1-715445047862


In [3]:
%%time
import os
# Make sure the local working directory exists (-p: no error when it already does).
!mkdir -p $base_dir
# Download the raw dataset only when no local copy is present, so re-running
# this cell skips the transfer.
# NOTE(review): the guard checks for existence only — a partial or failed
# download would not be retried; confirm whether curl should run with --fail.
if not os.path.exists(base_dir + '/furniture.raw.json'):
    !curl $dataset_dir/furniture.raw.json -o $base_dir/furniture.raw.json

CPU times: user 0 ns, sys: 12 ms, total: 12 ms
Wall time: 110 ms


In [4]:
import pandas as pd
import numpy as np

# Parse the downloaded JSON file into a DataFrame.
dataset = pd.read_json(base_dir + '/furniture.raw.json')

# Preview the first rows, restricted to the three columns used further down.
dataset.head()[['id', 'raw_hash', 'image_path']]

Unnamed: 0,id,raw_hash,image_path
0,0,"[0.6901253461837761, 0.564547598361969, 0.7025...",moveis/sofas/image_00000.jpg
1,1,"[0.5431836843490601, 0.567444086074829, 0.7415...",moveis/sofas/image_00001.jpg
2,2,"[0.647202372550964, 0.6822489500045771, 0.7340...",moveis/sofas/image_00002.jpg
3,3,"[0.5275084972381591, 0.617555320262908, 0.6901...",moveis/sofas/image_00003.jpg
4,4,"[0.5264348387718201, 0.5977515578269951, 0.604...",moveis/sofas/image_00004.jpg


In [5]:
# Pull the three columns of interest out of the frame as plain Python lists.
labels = dataset['id'].tolist()
hashes = dataset['raw_hash'].tolist()
images = dataset['image_path'].tolist()

# Stack the per-image vectors into one float32 matrix for training; the ids
# are kept alongside as an array as well.
train_set = np.asarray(hashes, dtype='float32')
labels_set = np.asarray(labels)

In [7]:
from sagemaker import KMeans

# S3 prefixes, both under the session's default bucket: one for the training
# input and one for the training output.
data_location = 's3://{}/clustering/data'.format(bucket)
output_location = 's3://{}/clustering/output'.format(bucket)

print('training data will be uploaded to: {}'.format(data_location))
print('training artifacts will be uploaded to: {}'.format(output_location))

# KMeans estimator: one ml.c4.8xlarge training instance, k=100 clusters.
# NOTE(review): load_cluster() samples cluster ids from range(0, 100), so the
# two values have to be kept in sync when k is changed.
kmeans = KMeans(role=role,
                train_instance_count=1,
                train_instance_type='ml.c4.8xlarge',
                output_path=output_location,
                k=100,
                data_location=data_location)

training data will be uploaded to: s3://sagemaker-us-east-1-715445047862/clustering/data
training artifacts will be uploaded to: s3://sagemaker-us-east-1-715445047862/clustering/output


In [8]:
%%time

# Wrap the float32 feature matrix as a record set and launch the training job.
# presumably record_set() uploads the data to data_location — verify against the SDK docs.
kmeans.fit(kmeans.record_set(train_set))

INFO:sagemaker:Creating training-job with name: kmeans-2018-06-05-22-39-56-119


.....................
[31mDocker entrypoint called with argument(s): train[0m
[31m[06/05/2018 22:43:14 INFO 140084259960640] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'_kvstore': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'true', u'epochs': u'1', u'init_method': u'random', u'local_lloyd_tol': u'0.0001', u'local_lloyd_max_iter': u'300', u'_disable_wait_to_read': u'false', u'extra_center_factor': u'auto', u'eval_metrics': u'["msd"]', u'_num_kv_servers': u'1', u'mini_batch_size': u'5000', u'half_life_time_size': u'0', u'_num_slices': u'1'}[0m
[31m[06/05/2018 22:43:14 INFO 140084259960640] Reading provided configuration from /opt/ml/input/config/hyperparameters.json: {u'feature_dim': u'2048', u'mini_batch_size': u'5000', u'k': u'100', u'force_dense': u'True'}[0m
[31m[06/05/2018 22

===== Job Complete =====
Billable seconds: 106
CPU times: user 2.75 s, sys: 196 ms, total: 2.94 s
Wall time: 3min 45s


In [9]:
%%time

# Deploy the trained model behind a real-time endpoint on a single
# ml.m4.xlarge instance. The returned predictor is used below for the smoke
# test and, at the end of the notebook, to delete the endpoint.
kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                 instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: kmeans-2018-06-05-22-43-38-477
INFO:sagemaker:Creating endpoint with name kmeans-2018-06-05-22-39-56-119


--------------------------------------------------------------------------!CPU times: user 252 ms, sys: 20 ms, total: 272 ms
Wall time: 6min 14s


In [10]:
# Keep the endpoint name so get_cluster() can invoke it through the boto3 runtime client.
endpoint_name=kmeans_predictor.endpoint

### Checking the endpoint

In [11]:
%%time
# Smoke test: send the first training vector as a one-row float32 batch and
# print the raw response (closest cluster and distance to it).
result = kmeans_predictor.predict(np.array([ train_set[0] ], dtype='float32') )
print(result)

[label {
  key: "closest_cluster"
  value {
    float32_tensor {
      values: 26.0
    }
  }
}
label {
  key: "distance_to_cluster"
  value {
    float32_tensor {
      values: 2.816901922225952
    }
  }
}
]
CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 355 ms


### Testing

In [12]:
# Fetch the image archive and unpack it under $base_dir/furniture by piping
# curl's output straight into tar (no intermediate archive file is kept).
# NOTE(review): unlike the JSON download there is no existence guard, so the
# archive is downloaded again every time this cell runs.
!mkdir -p $base_dir/furniture
!curl $dataset_dir/furniture.tar.gz | tar -xz -C $base_dir/furniture

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  285M  100  285M    0     0  57.2M      0  0:00:04  0:00:04 --:--:-- 64.5M


In [38]:
import boto3
import json
from sagemaker.predictor import json_serializer, json_deserializer

# Low-level runtime client; get_cluster() uses it to invoke the endpoint with JSON payloads.
sm = boto3.client('sagemaker-runtime')    

In [68]:
def get_cluster(features):
    """Ask the deployed KMeans endpoint which cluster a feature vector belongs to.

    Args:
        features: A single feature vector, either a NumPy array (anything that
            exposes ``.tolist()``) or a plain sequence of numbers.

    Returns:
        tuple: ``(closest_cluster, distance_to_cluster)`` — the cluster id cast
        to ``int`` and the distance exactly as returned by the endpoint.
    """
    # Serialize the argument itself rather than a variable that happens to
    # exist in the calling scope, so the function works for every caller.
    vector = features.tolist() if hasattr(features, 'tolist') else list(features)

    response = sm.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/json',
        Body=json_serializer({"instances": [{"features": vector}]})
    )

    # One instance was sent, so only the first prediction is relevant.
    prediction = json.loads(response['Body'].read())['predictions'][0]
    return int(prediction['closest_cluster']), prediction['distance_to_cluster']

In [73]:
%%time

# Map every cluster id to the list of image indices assigned to it by the endpoint.
images_inside_clusters = {}
for img_id, i in enumerate(train_set):
    cluster_id, distance = get_cluster(i)
    # setdefault() creates the bucket the first time a cluster id shows up.
    images_inside_clusters.setdefault(cluster_id, []).append(img_id)

CPU times: user 25.2 s, sys: 364 ms, total: 25.6 s
Wall time: 2min 35s


In [83]:
# Icon for the RETURN link emitted by load_cluster() (src="return.png");
# -O stores it under its remote name in the current directory, -s silences progress.
!curl -s https://workshopml.spock.cloud/images/return.png -O

In [84]:
# TagCanvas library that the HTML cell loads via <script src="tagcanvas.min.js">.
!curl -s https://workshopml.spock.cloud/js/tagcanvas.min.js -O

In [93]:
import base64
from PIL import Image
from io import BytesIO

def get_image_base64(filename, white_threshold=253):
    """Return an image as base64 PNG text with its near-white pixels made transparent.

    Args:
        filename: Path (or file object) accepted by ``PIL.Image.open``.
        white_threshold: A pixel whose R, G and B channels are all greater
            than or equal to this value is treated as background and replaced
            by fully transparent white. Defaults to 253.

    Returns:
        str: ASCII base64 encoding of the PNG-encoded RGBA image.
    """
    # The context manager closes the source file deterministically; convert()
    # hands back an independent in-memory image, so it stays usable afterwards.
    with Image.open(filename) as source:
        image = source.convert("RGBA")

    transparent = (255, 255, 255, 0)
    # min() over the colour channels >= threshold  <=>  every channel >= threshold.
    image.putdata([
        transparent if min(pixel[:3]) >= white_threshold else pixel
        for pixel in image.getdata()
    ])

    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode('ascii')
    

In [86]:
import random
from IPython.display import HTML, Javascript, display_javascript

def load_cluster(cluster_id=None):
    """Build the ``<li>`` markup consumed by the tag-cloud widget.

    Args:
        cluster_id: ``None`` for the overview (one representative image for up
            to 6 randomly chosen clusters), or a cluster id to list up to 10
            randomly chosen images of that cluster preceded by a RETURN link.

    Returns:
        str: HTML fragment made of ``<li>`` items, each embedding its image as
        a base64 ``data:`` URI and linking to ``loadCluster(<cluster id>)``.
    """
    import os  # local import keeps the function independent of other cells' imports

    if cluster_id is None:
        # Only sample clusters that actually received images: a cluster the
        # model left empty has no entry in the mapping and no image to show.
        populated = sorted(images_inside_clusters)
        clusters = random.sample(populated, min(6, len(populated)))
        ids_images = [images_inside_clusters[c][0] for c in clusters]
        parts = []
    else:
        members = images_inside_clusters.get(cluster_id, [])
        # random.sample() picks without reordering the shared list in place,
        # so repeated calls do not alter the cluster assignment data.
        ids_images = random.sample(members, min(10, len(members)))
        clusters = [cluster_id] * len(ids_images)
        parts = ['<li><a onclick="loadCluster()" href="#"><img width="10%" height="10%" src="return.png"></img>RETURN</a></li>']

    # The image archive is extracted under <base_dir>/furniture.
    image_root = os.path.join(base_dir, 'furniture')
    for c, i in zip(clusters, ids_images):
        encoded = get_image_base64(os.path.join(image_root, images[i]))
        parts.append('<li><a onclick="loadCluster({0})" href="#">'.format(c))
        parts.append('<img width="30%" height="30%" src="data:image/png;base64,{0}"></img>'.format(encoded))
        parts.append('Cluster: {0}</a></li>'.format(c))

    return ''.join(parts)

In [87]:
# Renders the tag-cloud canvas and defines two browser-side helpers:
#   loadCluster(id)    - asks the kernel to run load_cluster(id)
#   handle_output(out) - injects the returned markup into #tags and (re)starts TagCanvas
HTML('''
<script type="text/javascript" src="tagcanvas.min.js"></script>

<div id="myCanvasContainer">
 <canvas width="600" height="600" id="myCanvas">
  <p>Anything in here will be replaced on browsers that support the canvas element</p>
  <ul id='tags'></ul>
 </canvas>
</div>

<script type="text/javascript">

    function handle_output(out) {
        console.log(out)

        // The iopub "output" callback also fires for stream and error
        // messages, which carry no "data" payload. Report kernel errors and
        // skip such messages instead of throwing inside the callback.
        var data = out.content && out.content.data;
        if (!data || data["text/plain"] === undefined) {
            if (out.content && out.content.ename) {
                console.error(out.content.ename + ": " + out.content.evalue);
            }
            return;
        }

        document.getElementById('tags').innerHTML = data["text/plain"];

        TagCanvas.Start('myCanvas', 'myCanvasContainer', {
            textColour: '#000000',
            outlineColour: '#ff00ff',
            reverse: true,
            depth: 0.8,
            maxSpeed: 0.05,
            imageMode: 'both',
            imagePosition: 'bottom',
            wheelZoom: false
        });

    }

    function loadCluster(id) {
        id = id == undefined ? "":id;
        var kernel = IPython.notebook.kernel;
        var callbacks = { 'iopub' : {'output' : handle_output}};
        //console.log(id)
        var resp = kernel.execute("load_cluster(" + id + ")", callbacks, {silent:false});
    }
</script>

''')

In [94]:
# Trigger the first render: run the browser-side loadCluster() without an id,
# which makes the kernel execute load_cluster() for the cluster overview.
jso = Javascript("loadCluster();")
display_javascript(jso)

### Delete the endpoint

In [None]:
# Clean-up: delete the endpoint created by kmeans.deploy() so it does not keep
# running after the demo. Requires kmeans_predictor to still be defined in the kernel.
import sagemaker
sagemaker.Session().delete_endpoint(kmeans_predictor.endpoint)