# Information

Perform unsupervised image clustering of apples vs. cabbages: image features are extracted with a pre-trained VGG16 model (transfer learning from ImageNet weights) and then grouped with K-Means.

# References

- [Image clustering using Transfer learning](https://towardsdatascience.com/image-clustering-using-transfer-learning-df5862779571)
- [Clustering](https://ml-with-tensorflow.info/2017/03/11/clustering/)
- [Image Clustering Using k-Means](https://towardsdatascience.com/image-clustering-using-k-means-4a78478d2b83)

# Setup

In [1]:
import os
import requests
import pathlib
import PIL
import numpy as np
import tensorflow as tf
import shutil
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from sklearn.cluster import KMeans

In [2]:
# Constants
# Parameters for Deep Learning
# Size every image is resized to when it is loaded from disk.
# NOTE(review): 224x224 matches the default ImageNet input size of Keras' VGG16 - confirm before changing.
IMAGE_HEIGHT=224
IMAGE_WIDTH=224

# Test / Code constants
# Location of the 7z archive containing the apple / cabbage images
DATASET_URL='https://github.com/rencete/computer-vision-datasets/raw/master/apple-cabbage/Dataset.7z'
# Folders (relative to the working directory) expected to hold one sub-folder per class
TRAIN_DATASET_BASEPATH='Dataset/train'
TEST_DATASET_BASEPATH='Dataset/test'
# Names of the class sub-folders that images are read from
CLASSES=['apple', 'cabbage']
# Folder the clustered images are written to (one sub-folder per cluster label)
CLUSTERED_OUTPUT_DIR_BASEPATH='Output'
# Base name (without extension) of the zip archive built from the output folder
CLUSTERED_OUTPUT_ARCHIVE_BASEFILENAME='clustered_outputs'

# Download and Extract Images for clustering

In [3]:
# Retrieve the filename of the dataset from the URL
dataset_filename = DATASET_URL.split('/')[-1]

# Download the dataset from Github and save in current directory.
# The archive is a regular file, so its presence must be tested with isfile();
# an isdir() test is always False and would re-download on every run.
if not os.path.isfile(dataset_filename):
  # Fetch first and only create the file once the request succeeded, so that a
  # failed download never leaves an empty / HTML error page behind as "Dataset.7z".
  response = requests.get(DATASET_URL, timeout=60)
  response.raise_for_status()
  with open(dataset_filename, 'wb') as f:
    f.write(response.content)

In [4]:
# Check if the dataset has been uncompressed by checking if directory exists
# (extraction is skipped whenever the train folder is already present)
if not os.path.isdir(TRAIN_DATASET_BASEPATH):
  # IPython shell escape: needs the `7z` command line tool to be installed.
  # NOTE(review): the archive name is hard-coded here instead of being taken
  # from DATASET_URL - keep both in sync if the URL ever changes.
  !7z x Dataset.7z

# Functions

In [5]:
def load_ds_from_path(base_path: str, augment: bool = True):
  '''
  Load the images of every class found under base_path into a single array.

  For each name in CLASSES the folder "<base_path>/<class>" is scanned for
  "*.jpg" files; every file is loaded as RGB, resized to
  (IMAGE_HEIGHT, IMAGE_WIDTH) and converted to an array. No labels are
  produced: the images are meant for unsupervised clustering.

  Parameters:
    base_path: folder containing one sub-folder per class.
    augment: currently unused; kept so existing callers keep working.

  Returns:
    Array of shape (n_images, IMAGE_HEIGHT, IMAGE_WIDTH, 3), classes in the
    order of CLASSES. An empty array of that shape (n_images == 0) is
    returned when no image is found.
  '''
  per_class = []

  # Iterate through the different classes
  for c in CLASSES:
    path = pathlib.Path(base_path) / c

    # Load images from files; sorted() makes the order independent of the
    # file system's directory listing order
    ds = [img_to_array(load_img(p, color_mode='rgb', target_size=(IMAGE_HEIGHT, IMAGE_WIDTH))) for p in sorted(path.glob('*.jpg'))]

    # A class folder without images would yield a shape (0,) array that
    # cannot be concatenated with 4-D image batches, so skip it
    if ds:
      per_class.append(np.array(ds))

  if not per_class:
    # Keep the 4-D layout so callers can still concatenate / index the result
    # (assumes the Keras defaults: channels-last, float32 - TODO confirm)
    return np.empty((0, IMAGE_HEIGHT, IMAGE_WIDTH, 3), dtype='float32')

  # Concatenate / merge the per-class datasets for output
  return np.concatenate(per_class)

In [6]:
def get_images_for_clustering():
  '''
  Collect every image (train and test) as one randomly ordered batch.

  Returns:
    The train images followed by the test images, shuffled along the first
    axis with tf.random.shuffle.
  '''
  # Clustering is unsupervised, so the train / test split is irrelevant:
  # load both folders (train first, then test) and stack them
  splits = [
    load_ds_from_path(folder)
    for folder in (TRAIN_DATASET_BASEPATH, TEST_DATASET_BASEPATH)
  ]
  merged = np.concatenate(splits)

  # Randomise the order so the classes are no longer grouped together
  return tf.random.shuffle(merged)

In [7]:
def get_features_from_vgg16(images):
  '''
  Extract one feature vector per image with a pre-trained VGG16 network.

  Parameters:
    images: batch of RGB images (array or tensor), indexed by image along
      the first axis. The argument itself is left unmodified.

  Returns:
    The output of model.predict: a 2-D array with one row of globally
    average-pooled convolutional features per image.
  '''
  # Prepare pre-trained VGG16 model (no classifier head, global average pooling)
  model = VGG16(include_top=False, weights='imagenet', pooling='avg')
  # Model layer weights should not be trainable
  for layer in model.layers:
    layer.trainable = False

  # Preprocess images. Work on a float copy: preprocess_input may modify a
  # numpy array in place, and the caller still needs the raw pixels to save
  # the clustered images afterwards.
  X = preprocess_input(np.array(images, dtype='float32'))

  # Predict on the preprocessed batch (not on the raw images), since the
  # ImageNet weights expect the VGG16-specific input normalisation
  return model.predict(X)

In [8]:
def cluster_features(features, n_clusters: int = 2):
  '''
  Group the feature vectors with K-Means.

  Parameters:
    features: 2-D array-like, one row of features per image.
    n_clusters: number of clusters to form; defaults to 2, the number of
      clusters this function always used before the parameter existed.

  Returns:
    Array with the cluster index assigned to each row of features.
  '''
  # Use K-Means clustering to cluster the image features
  kmeans = KMeans(n_clusters=n_clusters).fit(features)
  return kmeans.labels_

In [9]:
def save_clustered_images(images, labels):
  '''
  Write every image into the folder of its cluster and zip the result.

  Images are saved as "<CLUSTERED_OUTPUT_DIR_BASEPATH>/<label>/<index>.jpg".
  Files left over from a previous run are removed first, and the whole
  output folder is finally packed into
  "<CLUSTERED_OUTPUT_ARCHIVE_BASEFILENAME>.zip".

  Parameters:
    images: batch of images, indexed by image along the first axis.
    labels: cluster label of each image, in the same order as images.
  '''
  # Create directories if they do not yet exist. '0' and '1' are always
  # created (the original layout); any further label present in `labels`
  # gets its own folder too, so saving cannot fail on a missing directory.
  cluster_dirs = {'0', '1'} | {str(label) for label in labels}
  for c in sorted(cluster_dirs):
    os.makedirs(os.path.join(CLUSTERED_OUTPUT_DIR_BASEPATH, c), exist_ok=True)

  # Delete existing files (in case ran multiple times)
  for root, _dirs, files in os.walk(CLUSTERED_OUTPUT_DIR_BASEPATH):
    for file in files:
      os.remove(os.path.join(root, file))

  # Save clustered images
  for i in range(images.shape[0]):
    img = tf.keras.utils.array_to_img(images[i])
    img.save(os.path.join(CLUSTERED_OUTPUT_DIR_BASEPATH, str(labels[i]), f'{str(i)}.jpg'))

  # Zip directory for downloading
  shutil.make_archive(CLUSTERED_OUTPUT_ARCHIVE_BASEFILENAME, 'zip', CLUSTERED_OUTPUT_DIR_BASEPATH)

# Actual Code Run

In [10]:
# Step 1: load the train and test images as one shuffled batch
images = get_images_for_clustering()

In [11]:
# Step 2: turn every image into a VGG16 feature vector
features = get_features_from_vgg16(images)

In [12]:
# Step 3: assign each feature vector (and thus each image) to a cluster
labels = cluster_features(features)

In [13]:
# Step 4: write the images into per-cluster folders and zip them
save_clustered_images(images, labels)