# Clustering images for automatic classification

A given company has lots of unclassified images stored on their servers. They want to use these images to create a UI where their final customers can navigate through the products by selecting pictures. So, in this notebook we will use an unsupervised clustering algorithm called K-Means, and then we will select a sample from each cluster and send it to Rekognition to extract the top-5 tags.

- For each image in our dataset, we need to pass it through the Visual Search CNN and get the image feature encoding vector
- Then, we will train a K-Means model with the vectors
- After that we will get a sample of each cluster and then call Rekognition for tagging each cluster

In [None]:
# Local scratch directory for the downloaded dataset files and images
base_dir = '/tmp/clustering'
# Remote location hosting the furniture dataset
dataset_dir = 'https://spock.cloud/ai-workshop/furniture'

In [None]:
import sagemaker
from sagemaker import get_execution_role

# The session provides the default S3 bucket used for the training data and artifacts
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Execution role of this notebook; passed to the KMeans estimator
role = get_execution_role()

## Download the dataset

In [None]:
%%time
import os
!mkdir -p $base_dir
if not os.path.exists(base_dir + '/furniture.raw.json'):
    !curl $dataset_dir/furniture.raw.json -o $base_dir/furniture.raw.json

## Visualize the dataset

In [None]:
import pandas as pd
import numpy as np

# Load the dataset description from the local scratch directory
raw_json_file = base_dir + '/furniture.raw.json'
dataset = pd.read_json(raw_json_file)

# Preview the three columns used by the rest of the notebook
preview_columns = ['id', 'raw_hash', 'image_path']
dataset[preview_columns].head()

## Prepare the dataset for the built-in K-Means

In [None]:
# Number of clusters (k) requested from K-Means
num_clusters = 100

# Extract the id, feature-vector and image-path columns as plain Python lists
labels, hashes, images = (
    dataset[column].values.tolist()
    for column in ('id', 'raw_hash', 'image_path')
)

# K-Means input: the 'raw_hash' vectors as a float32 array, one row per dataset entry
train_set = np.array(hashes, dtype='float32')
labels_set = np.array(labels)

## Download the images for the UI

In [None]:
!mkdir -p $base_dir/furniture
!curl $dataset_dir/furniture.tar.gz | tar -xz -C $base_dir/furniture

## Training the model

In [None]:
from sagemaker import KMeans

# S3 prefixes for the training input and the model artifacts
data_location = 's3://{}/clustering/data'.format(bucket)
output_location = 's3://{}/clustering/output'.format(bucket)

for message, location in (
    ('training data will be uploaded to: {}', data_location),
    ('training artifacts will be uploaded to: {}', output_location),
):
    print(message.format(location))

# Estimator configuration for the built-in K-Means algorithm
kmeans_settings = dict(
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.8xlarge',
    output_path=output_location,
    k=num_clusters,
    data_location=data_location,
)
kmeans = KMeans(**kmeans_settings)

In [None]:
%%time

# Convert the feature matrix into the record set the estimator trains on, then train
training_records = kmeans.record_set(train_set)
kmeans.fit(training_records)

In [None]:
%%time

# Host the trained model behind a real-time endpoint
deploy_settings = {'initial_instance_count': 1, 'instance_type': 'ml.m4.xlarge'}
kmeans_predictor = kmeans.deploy(**deploy_settings)

In [None]:
# Keep the endpoint name for the direct runtime calls made by get_cluster
endpoint_name = kmeans_predictor.endpoint

## Checking the endpoint

In [None]:
%%time
# Smoke test: ask the endpoint to cluster the first training vector
first_vector = train_set[0]
result = kmeans_predictor.predict(np.array([first_vector], dtype='float32'))
print(result)

## Testing

In [None]:
import boto3
import json
import random
import base64
import os

from PIL import Image
from io import BytesIO
from IPython.display import HTML, Javascript, display_javascript
from sagemaker.predictor import json_serializer, json_deserializer

# sm: runtime client used to invoke the K-Means endpoint; reko: Rekognition client used for labelling
sm, reko = (
    boto3.client(service_name)
    for service_name in ('sagemaker-runtime', 'rekognition')
)

In [None]:
def get_cluster(features):
    """Ask the K-Means endpoint which cluster a feature vector belongs to.

    Args:
        features: feature vector exposing ``tolist()`` (the callers pass rows
            of ``train_set``).

    Returns:
        tuple: ``(closest_cluster, distance_to_cluster)`` taken from the first
        prediction in the response, with the cluster id cast to ``int``.
    """
    payload = {"instances": [{"features": features.tolist()}]}
    response = sm.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/json',
        Body=json.dumps(payload),
    )
    prediction = json.loads(response['Body'].read())['predictions'][0]
    cluster_id = int(prediction['closest_cluster'])
    return cluster_id, prediction['distance_to_cluster']

In [None]:
def get_image_base64(filename):
    """Return a 224x224 PNG thumbnail of an image file as a base64 ASCII string.

    Pixels whose red, green and blue channels are all >= 253 (near-white) are
    replaced by fully transparent white.

    Args:
        filename: path of the image file to read.

    Returns:
        str: the base64-encoded PNG data.
    """
    # Close the file handle kept by Image.open once the pixels have been read.
    # resize() loads the data and returns a new image, so the thumbnail remains
    # usable after the source image is closed.
    with Image.open(filename) as source:
        image = source.resize((224, 224)).convert("RGBA")

    # Make the near-white pixels transparent and keep every other pixel as is
    pixels = [
        (255, 255, 255, 0) if min(pixel[:3]) >= 253 else pixel
        for pixel in image.getdata()
    ]
    image.putdata(pixels)

    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode('ascii')
    

In [None]:
def load_cluster(cluster_id=None):
    """Build the ``<li>`` markup that is loaded into the tag canvas.

    Args:
        cluster_id: when ``None``, up to six randomly chosen non-empty clusters
            are listed, each represented by the first image recorded for it.
            Otherwise up to ten random images of that cluster are listed,
            preceded by a RETURN link.

    Returns:
        str: the concatenated HTML list items.
    """
    if cluster_id is None:
        furniture_links = ""
        # Only sample clusters that actually received images: a cluster id with
        # no entry in images_inside_clusters would raise a KeyError, and asking
        # random.sample for more items than available raises a ValueError.
        populated = [c for c in range(0, num_clusters) if images_inside_clusters.get(c)]
        clusters = random.sample(populated, min(6, len(populated)))
        ids_images = [images_inside_clusters[c][0] for c in clusters]
    else:
        members = images_inside_clusters.get(cluster_id, [])
        # random.sample picks the images without reordering the shared list
        # (random.shuffle changed images_inside_clusters in place).
        ids_images = random.sample(members, min(len(members), 10))
        clusters = [cluster_id] * len(ids_images)
        furniture_links = '<li><a onclick="loadCluster()" href="#"><img width="10%" height="10%" src="https://spock.cloud/ai-workshop/misc/images/return.png"></img>RETURN</a></li>'

    for c, (img_id, distance) in zip(clusters, ids_images):
        image_file = os.path.join(base_dir, 'furniture', images[img_id])
        furniture_links += '<li><a onclick="loadCluster({0})" href="#">'.format(c)
        furniture_links += '<img width="30%" height="30%" src="data:image/png;base64,{0}"></img>'.format(get_image_base64(image_file))
        # tags.get: a cluster without a tag entry is shown with an empty tag list
        furniture_links += 'C. {0} - {1} imgs. - dist. {2:.2f} - tags. {3}</a></li>'.format(
            c, len(images_inside_clusters[c]), distance, " ".join(tags.get(c, []))
        )

    return furniture_links

In [None]:
def load_image(filename):
    """Read an image file from disk and return its raw content as a ``bytearray``."""
    with open(filename, "rb") as image_file:
        return bytearray(image_file.read())

## Building the UI

### First, let's classify all the images and see which clusters they belong to

In [None]:
%%time

# Map each cluster id to the [image index, distance] pairs the endpoint assigned to it
images_inside_clusters = {}
for img_id, feature_vector in enumerate(train_set):
    cluster_id, distance = get_cluster(feature_vector)
    images_inside_clusters.setdefault(cluster_id, []).append([img_id, distance])

In [None]:
# Flatten the cluster -> members mapping into one row per image.
# The rows are collected first and the DataFrame is built once: DataFrame.append
# copied the whole frame on every call and was removed in pandas 2.0.
# Iterating over the cluster ids that exist (instead of range(len(...))) avoids a
# KeyError when some cluster id received no image.
records = [
    {'cluster_id': cluster_id, 'image_id': img_id, 'image_dist': distance}
    for cluster_id in sorted(images_inside_clusters)
    for img_id, distance in images_inside_clusters[cluster_id]
]
stats = pd.DataFrame(records, columns=['cluster_id', 'image_id', 'image_dist'])
stats.head()

In [None]:
# Per-cluster summary statistics of the image distances
distance_by_cluster = stats.drop(columns='image_id').groupby('cluster_id')
distance_by_cluster.describe()

## Get the labels from Rekognition

In [None]:
# Tag every cluster with the Rekognition labels of one representative image.
tags = {}
for cluster_id, members in images_inside_clusters.items():
    # Representative = the member with the smallest distance. A column-wise
    # groupby().min() would pair the smallest image id with the smallest
    # distance, which are not necessarily the same image.
    img_id = min(members, key=lambda member: member[1])[0]
    # img_id is the row position produced by enumerate(train_set), so the file
    # is resolved through the positional `images` list, as load_cluster does.
    filename = images[img_id]
    response = reko.detect_labels(
        Image={'Bytes': load_image(os.path.join(base_dir, 'furniture', filename))},
        MaxLabels=5,
        MinConfidence=75
    )
    # Keyed by the real cluster id so the lookup in load_cluster stays aligned
    tags[cluster_id] = [label['Name'] for label in response['Labels']]

## Now, render the UI

In [None]:
# Renders the tag-canvas container plus the JavaScript glue for the UI:
#  - loadCluster(id) asks the kernel to execute load_cluster(<id>); without an
#    id it executes load_cluster().
#  - handle_output copies the "text/plain" output of that execution into the
#    #tags list and (re)starts TagCanvas on the canvas.
# NOTE(review): relies on the global `IPython.notebook.kernel` object, which is
# presumably only available in the classic Notebook front end — confirm before
# running this in another front end.
HTML('''
<script type="text/javascript" src="https://spock.cloud/ai-workshop/misc/js/tagcanvas.min.js"></script>

<div id="myCanvasContainer">
 <canvas width="600" height="600" id="myCanvas">
  <p>Anything in here will be replaced on browsers that support the canvas element</p>
  <ul id='tags'></ul>
 </canvas>
</div>

<script type="text/javascript">

    function handle_output(out) {
        if ( !out.content || !out.content.data || !out.content.data["text/plain"] ) {
            console.log( out )
            return;
        } // if
        
        document.getElementById('tags').innerHTML = out.content.data["text/plain"];
        
        TagCanvas.Start('myCanvas', 'myCanvasContainer', {
            textColour: '#000000',
            outlineColour: '#ff00ff',
            reverse: true,
            depth: 0.8,
            maxSpeed: 0.05,
            imageMode: 'both',
            imagePosition: 'bottom',
            wheelZoom: false
        });

    }

    function loadCluster(id) {
        console.log("ClusterID: " + id)
        id = id == undefined ? "":id;
        var kernel = IPython.notebook.kernel;
        
        var callbacks = { 'iopub' : {'output' : handle_output}};
        
        resp = kernel.execute("load_cluster(" + id + ")", callbacks, {silent:false});
    }
</script>
''')

In [None]:
# Kick off the UI by calling loadCluster() without an id
jso = Javascript(data="loadCluster();")
display_javascript(jso)

### Delete the endpoint

In [None]:
import sagemaker

# Tear down the hosted endpoint created by kmeans.deploy
cleanup_session = sagemaker.Session()
cleanup_session.delete_endpoint(kmeans_predictor.endpoint)