In [None]:
# Notebook-wide configuration.
# Remote location that hosts the furniture dataset files.
dataset_dir = 'https://workshopml.spock.cloud/datasets/furniture'
# Local scratch directory where downloaded files are stored.
base_dir = '/tmp/clustering'

In [None]:
import sagemaker
from sagemaker import get_execution_role

# One shared SageMaker session for the notebook; its default bucket is used
# for every S3 location built further down.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Execution role handed to the estimator when it is constructed.
role = get_execution_role()

In [None]:
%%time
import os
!mkdir -p $base_dir
if not os.path.exists(base_dir + '/furniture.raw.json'):
    !curl $dataset_dir/furniture.raw.json -o $base_dir/furniture.raw.json

In [None]:
import pandas as pd

# Read the downloaded JSON file into a DataFrame.
dataset = pd.read_json(base_dir + '/furniture.raw.json')

# Preview the first rows of the two columns used in the rest of the notebook.
dataset.loc[:, ['id', 'raw_hash']].head()

In [None]:
# NumPy is used below (and in the prediction cell) but was never imported in
# this notebook, which raised NameError on a fresh kernel.
import numpy as np

# Extract the two columns of interest as plain Python lists.
labels = dataset['id'].values.tolist()
hashes = dataset['raw_hash'].values.tolist()

# Training data as a float32 array built from the raw_hash values.
# NOTE(review): assumes every raw_hash entry is an equal-length numeric
# sequence, so this becomes a 2-D matrix with one row per item - confirm.
train_set = np.array(hashes, dtype='float32')
# Item ids as an array, in the same order as the rows of train_set.
labels_set = np.array(labels)

In [None]:
from sagemaker import KMeans

# S3 locations inside the session's default bucket: one prefix for the
# uploaded training data, one for the training artifacts.
output_location = 's3://{}/clustering/output'.format(bucket)
data_location = 's3://{}/clustering/data'.format(bucket)

print('training data will be uploaded to: {}'.format(data_location))
print('training artifacts will be uploaded to: {}'.format(output_location))

# K-Means estimator: k=100, trained on a single ml.c4.8xlarge instance.
kmeans = KMeans(
    role=role,
    k=100,
    train_instance_type='ml.c4.8xlarge',
    train_instance_count=1,
    data_location=data_location,
    output_path=output_location,
)

In [None]:
%%time

kmeans.fit(kmeans.record_set(train_set))

In [None]:
%%time

kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                 instance_type='ml.m4.xlarge')

In [None]:
%%time
result = kmeans_predictor.predict(np.array([ train_set[1000] ], dtype='float32') )
clusters = [r.label['closest_cluster'].float32_tensor.values[0] for r in result]
print(clusters)

In [None]:
# Print the predictor's endpoint attribute - the same value that is handed to
# delete_endpoint in the cleanup cell.
print(kmeans_predictor.endpoint)

In [None]:
import sagemaker

# Cleanup: delete the endpoint created by the deploy cell, using a freshly
# created session.
cleanup_session = sagemaker.Session()
cleanup_session.delete_endpoint(kmeans_predictor.endpoint)