# **Image Similarity**

Finding similar images

Author: Shreyash Gupta

Organization: IndiaMART InterMESH Pvt. Ltd.

# **Referring to image dataset**

Mounting Google Drive

In [0]:
# Colab-only step: mount Google Drive at /content/drive so later cells can
# read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Changing directory to data path

In [0]:
# NOTE(review): zipfile is not used in the visible part of this notebook.
import zipfile
import os
# Make the Drive notebook folder the working directory; the relative paths
# used later in this notebook ('auto', "../image.jpg") are resolved against it.
os.chdir("/content/drive/My Drive/Colab Notebooks")

# **Building the model**

Importing necessary modules

In [0]:
import keras
from keras.models import Model

Loading the VGG16 model

In [0]:
# Full VGG16 network with ImageNet weights. The top (dense) layers are kept
# because the "fc2" layer is looked up on this model further down.
model = keras.applications.VGG16(include_top=True, weights="imagenet")

Creating the feature-extraction model (VGG16 truncated at its `fc2` layer)

In [0]:
# Sub-model that shares VGG16's input but ends at the "fc2" layer, so
# predict() returns that layer's output instead of the final model output.
fc2_output = model.get_layer("fc2").output
feat_extractor = Model(inputs=model.input, outputs=fc2_output)

In [0]:
# Print the layer-by-layer summary of the full VGG16 model (not of feat_extractor).
model.summary()

# Image feature extraction

Importing necessary modules

In [0]:
from keras.preprocessing import image
import numpy as np
import matplotlib.pyplot as plt
from keras.applications.imagenet_utils import decode_predictions, preprocess_input
from sklearn.decomposition import PCA

Defining function to load image

In [0]:
def load_image(path):
  """Load an image file and prepare it as a one-image batch for the network.

  The image is resized to the spatial size given by ``model.input_shape[1:3]``.

  Args:
    path: Location of the image file.

  Returns:
    A ``(img, x)`` pair: ``img`` is the resized image as returned by
    ``image.load_img`` and ``x`` is the preprocessed array with a leading
    batch axis of length 1.
  """
  img = image.load_img(path, target_size=model.input_shape[1:3])
  batch = image.img_to_array(img)[np.newaxis, ...]
  return img, preprocess_input(batch)

Extracting images

In [0]:
import os
import random
images_path = 'auto'
image_extensions = ['.jpg', '.png', '.jpeg']
max_num_images = 5000

# Collect every file under images_path whose extension (case-insensitive) is
# listed above. os.walk yields entries in a filesystem-dependent order, so the
# result is sorted to keep the index -> file mapping stable between runs.
images = sorted(
    os.path.join(dir_path, file_name)
    for dir_path, _, file_names in os.walk(images_path)
    for file_name in file_names
    if os.path.splitext(file_name)[1].lower() in image_extensions
)

# Cap the workload: keep a random subset of max_num_images files, preserving
# the sorted order of the files that remain.
# NOTE(review): the sample is unseeded, so the subset differs on each run.
if max_num_images < len(images):
    kept = sorted(random.sample(range(len(images)), max_num_images))
    images = [images[i] for i in kept]

print("Keeping %d images to analyze" % len(images))

Extracting features

In [0]:
import time
# time.clock() was removed in Python 3.8; perf_counter() is the documented
# replacement for measuring elapsed intervals.
tic = time.perf_counter()


features = []
for i, image_path in enumerate(images):
    # Every 500 images, report the time taken since the previous report.
    if i % 500 == 0:
        toc = time.perf_counter()
        elap = toc - tic
        print("Analyzing image %d / %d. Time: %4.4f seconds." % (i, len(images), elap))
        tic = time.perf_counter()
    img, x = load_image(image_path)
    # x holds a single image, so [0] takes the one row predict() returns.
    feat = feat_extractor.predict(x)[0]
    features.append(feat)

print('Finished extracting features for %d images' % len(images))

Reducing to principal components

In [0]:
features = np.array(features)
# PCA cannot produce more components than min(n_samples, n_features), so the
# requested 300 is capped to keep small image sets from raising in fit().
pca = PCA(n_components=min(300, *features.shape))
pca.fit(features)

In [0]:
# Project every image's feature vector with the PCA object fitted on `features`.
pca_features = pca.transform(features)

# Finding similar images

Importing necessary modules

In [0]:
from scipy.spatial import distance

Defining function to find similar images

In [0]:
def get_closest_images(query_image_idx, num_results=5):
    """Return the indices of the images most similar to a query image.

    Similarity is the cosine distance between rows of the global
    ``pca_features`` array (smaller distance = more similar).

    Args:
        query_image_idx: Row index of the query image in ``pca_features``.
        num_results: Maximum number of neighbours to return.

    Returns:
        A list of up to ``num_results`` row indices ordered from most to least
        similar. The query's own index is never included.
    """
    query = pca_features[query_image_idx]
    distances = [distance.cosine(query, feat) for feat in pca_features]
    # Normalise a negative index so it can be compared with positions below.
    query_pos = query_image_idx if query_image_idx >= 0 else query_image_idx + len(distances)
    # Exclude the query by position rather than by dropping the first sorted
    # entry: when another image is at distance 0 as well (e.g. an exact
    # duplicate), the query is not guaranteed to sort first.
    candidates = [k for k in range(len(distances)) if k != query_pos]
    candidates.sort(key=lambda k: distances[k])
    return candidates[:num_results]

Defining function to build a side-by-side strip of thumbnails for a list of image indexes (used to display the most similar images)

In [0]:

def get_concatenated_images(indexes, thumb_height):
    """Build one horizontal strip of thumbnails for the given image indexes.

    Args:
        indexes: Positions in the global ``images`` list of the files to show.
        thumb_height: Height in pixels every thumbnail is resized to; the
            width is scaled by the same factor.

    Returns:
        The thumbnails joined side by side (along axis 1) as a single array.
    """
    def make_thumb(idx):
        picture = image.load_img(images[idx])
        scaled_width = int(picture.width * thumb_height / picture.height)
        return picture.resize((scaled_width, thumb_height))

    return np.concatenate([np.asarray(make_thumb(idx)) for idx in indexes], axis=1)


Querying on a random image

In [0]:
# Pick a random position in the image list and look up its nearest neighbours.
query_image_idx = int(random.random() * len(images))
idx_closest = get_closest_images(query_image_idx)
query_image = get_concatenated_images([query_image_idx], 300)
results_image = get_concatenated_images(idx_closest, 200)

# One figure per strip: the query on a small canvas, its matches on a wide one.
panels = [
    ((5, 5), query_image, "query image (%d)" % query_image_idx),
    ((16, 12), results_image, "result images"),
]
for panel_size, panel_pixels, panel_title in panels:
    plt.figure(figsize=panel_size)
    plt.imshow(panel_pixels)
    plt.title(panel_title)

# Storing results

Importing necessary modules

In [0]:
import pandas as pd


Creating result data frame

In [0]:
# Empty results table: a query column followed by five ranked match columns.
df = pd.DataFrame(columns=["Query Image", "Closest Match"] + ["Match %d" % rank for rank in range(2, 6)])

Storing results to data frame

In [0]:
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat instead. Matches are paired with their columns by rank; if fewer
# than five matches exist, the remaining columns are left empty (NaN).
match_columns = ["Closest Match", "Match 2", "Match 3", "Match 4", "Match 5"]
result_row = {"Query Image": images[query_image_idx]}
result_row.update({column: images[idx] for column, idx in zip(match_columns, idx_closest)})
df = pd.concat([df, pd.DataFrame([result_row], columns=df.columns)], ignore_index=True)

# Prediction

Loading image and extracting features

In [0]:
# Load "../image.jpg" (a path outside the 'auto' folder that was indexed) and
# run it through the same preprocessing and feature extractor.
# The batch axis is kept on new_features; it is indexed away only after the
# PCA transform (pca.transform(new_features)[0]).
new_image, x = load_image("../image.jpg")
new_features = feat_extractor.predict(x)

PCA transformation

In [0]:
# Apply the already-fitted PCA to the new image's features; [0] takes the
# single resulting row.
new_pca_features = pca.transform(new_features)[0]

Getting result

In [0]:
# Rank every indexed image by cosine distance to the new image and keep the
# five nearest, starting from rank 0.
distances = [distance.cosine(new_pca_features, feat) for feat in pca_features]
ranking = sorted(range(len(distances)), key=distances.__getitem__)
idx_closest = ranking[:5]
results_image = get_concatenated_images(idx_closest, 200)

# One figure for the query image, one wide figure for the strip of matches.
panels = [
    ((5, 5), new_image, "query image"),
    ((16, 12), results_image, "result images"),
]
for panel_size, panel_pixels, panel_title in panels:
    plt.figure(figsize=panel_size)
    plt.imshow(panel_pixels)
    plt.title(panel_title)