# Generate similarity strips

This notebook loads fc2 features from a VGG classifier, reduces them with PCA, and then uses the resulting data to find the most similar images in the dataset.

In [None]:
import sqlite3
import random
import itertools
import subprocess
import os
import shlex
import time
import pickle

In [None]:
import bz2

In [None]:
from PIL import Image, ImageDraw, ImageFont

# from pillow import Image

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.cm as cm
from matplotlib.colors import Normalize

In [None]:
from keras.preprocessing import image

In [None]:
from scipy.spatial import distance

In [None]:
def load_image(path):
    """Load an image from disk and turn it into a single-item model batch.

    The image is resized to ``model.input_shape[1:3]`` while loading.

    NOTE(review): ``model`` and ``preprocess_input`` are read from the
    notebook namespace; neither is defined in the cells visible here.

    Args:
        path: Location of the image file.

    Returns:
        A ``(img, x)`` tuple: the loaded image and its preprocessed array
        with an extra leading axis.
    """
    target_size = model.input_shape[1:3]
    pil_image = image.load_img(path, target_size=target_size)
    # add a leading axis so the array holds exactly one item
    batch = np.expand_dims(image.img_to_array(pil_image), axis=0)
    return pil_image, preprocess_input(batch)

In [None]:
# Path of a single feature archive (not read in this cell).
p = "/home/rte/re-imaging/visualisation/features/v1/features_0_50000_vgg.pkl.pbz2"
folder = "/home/rte/re-imaging/visualisation/features/v1/"

# Collect the regular files in the feature folder; sorting keeps the load
# order the same on every run.
files = sorted(f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f)))

features = []

# Only the first two archives are loaded.
for f in files[:2]:
    archive_path = os.path.join(folder, f)
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file -- only load archives that come from a trusted source.
    with bz2.open(archive_path, "rb") as read_file:
        # extend() appends in place instead of rebuilding the whole list
        # for every archive.
        features.extend(pickle.load(read_file))
    print("loaded:", archive_path)
    print("features size:",len(features))

In [None]:
# Sanity check: number of loaded feature vectors and the length of the first one
print(len(features), len(features[0]))

In [None]:
# Overlay the first ten feature vectors as line plots on one wide figure
plt.figure(figsize=(16,4))
for p in features[:10]:
    plt.plot(p)

In [None]:
# Convert the accumulated features to a NumPy array
# (later cells use features.shape and np.argsort on it)
features = np.array(features)

In [None]:
# Work on at most the first 100,000 feature rows
some_features = features[:100000]
print(len(some_features))

In [None]:
from sklearn.decomposition import PCA

# Fit a 300-component PCA on the selected feature subset
pca = PCA(n_components=300)
pca.fit(some_features)

In [None]:
# Project the same subset into the fitted PCA space.
# NOTE(review): pca_features has len(some_features) rows, while `images` is
# sized by len(features) -- confirm the two stay aligned when more than
# 100,000 features are loaded.
pca_features = pca.transform(some_features)

In [None]:
# Read the image listing, one entry per line. A later cell pairs the entries
# with the feature rows by position.
with open("/home/rte/data/paths/all_converted_jpg_images_shuf.txt", "r") as f:
    lines = f.readlines()

print("length:",len(lines))
if lines:
    # guard: an empty listing would otherwise raise IndexError here
    print(lines[0])

# Strip the trailing newline and surrounding whitespace from every entry.
# Blank lines are kept on purpose so that positions are not shifted.
filepaths = [line.strip() for line in lines]

In [None]:
# Build the full path of every image that has a feature vector:
# only the first len(features) listing entries are used.
image_folder = "/mnt/hd2/images/all/"
feature_count = len(features)
images = [image_folder + relative_path for relative_path in filepaths[:feature_count]]

print(len(images))
print(images[:10])

In [None]:
# grab a random query image
# (random.random() lies in [0, 1), so the truncated product is a valid index
# into a non-empty `images` list)
query_image_idx = int(len(images) * random.random())

# let's display the image
img = image.load_img(images[query_image_idx])
plt.imshow(img)

In [None]:
# get distance from PCA
def get_closest_images(query_image_idx, num_results=5):
    """Return the indexes of the images nearest to the query in PCA space.

    Distances are Manhattan (cityblock) distances between the query row and
    every row of the global ``pca_features`` array.

    NOTE: another cell defines a function with the same name that works on
    the raw features; whichever cell ran last is the one in effect.

    Args:
        query_image_idx: Row index of the query image in ``pca_features``.
        num_results: Maximum number of neighbours to return.

    Returns:
        A list of row indexes ordered from nearest to farthest. The query
        itself is never part of the result.

    Raises:
        IndexError: If ``query_image_idx`` is out of range.
    """
    vectors = np.asarray(pca_features)
    # Normalises negative indexes and raises IndexError for invalid ones.
    query = range(len(vectors))[query_image_idx]
    # One vectorised pass instead of a Python-level loop over every row.
    dists = np.abs(vectors - vectors[query]).sum(axis=1)
    # A stable sort keeps equally distant rows in index order.
    order = np.argsort(dists, kind="stable")
    # Remove the query by index rather than by position: when another row is
    # identical to the query, the query is not guaranteed to sort first.
    order = order[order != query]
    return order[:num_results].tolist()

In [None]:
# get distance between features
def get_closest_images(query_image_idx, num_results=5):
    """Return the indexes of the images nearest to the query by cosine distance.

    Distances are computed between the query row and every row of the global
    ``features`` array.

    NOTE: another cell defines a function with the same name that works on
    the PCA features; whichever cell ran last is the one in effect.

    Args:
        query_image_idx: Row index of the query image in ``features``.
        num_results: Maximum number of neighbours to return.

    Returns:
        A list of row indexes ordered from nearest to farthest. The query
        itself is never part of the result.

    Raises:
        IndexError: If ``query_image_idx`` is out of range.
    """
    vectors = np.asarray(features)
    # Normalises negative indexes and raises IndexError for invalid ones.
    query = range(len(vectors))[query_image_idx]
    # Row-wise dot products and norms in vectorised form; einsum avoids
    # materialising a second array as large as the feature matrix.
    dots = vectors @ vectors[query]
    norms = np.sqrt(np.einsum("ij,ij->i", vectors, vectors))
    dists = 1.0 - dots / (norms * norms[query])
    # A stable sort keeps equally distant rows in index order.
    order = np.argsort(dists, kind="stable")
    # Remove the query by index rather than by position: when another row
    # points in the same direction, the query is not guaranteed to sort first.
    order = order[order != query]
    return order[:num_results].tolist()

### grid version

In [None]:
target = "similarity_strips_1m_features"
# exist_ok avoids the separate existence check
os.makedirs(target, exist_ok=True)

xdim = 5  # grid columns
ydim = 5  # grid rows; row 0 is reserved for the query image

# one neighbour for every cell below the query row
num_neighbours = xdim * (ydim - 1)

for index, image_path in enumerate(images[:20]):
    print("path:",image_path)
    start = time.time()
    plt.close('all')

    # get similarity for current image
    query_image_idx = index
    idx_closest = get_closest_images(query_image_idx, num_results=num_neighbours)
    query_image = mpimg.imread(image_path)

    # subplots() takes (nrows, ncols); squeeze=False keeps `ax` two-dimensional
    # even when the grid has a single row or column
    fig, ax = plt.subplots(ydim, xdim, squeeze=False)
    fig.set_size_inches(10, 12)
    fig.patch.set_facecolor('0.98')
    image_id = filepaths[index].split(".")[0]
    title_string = "similarity strip {:04d} | image ID: {:08d}".format(index, int(image_id))
    print(title_string)
    fig.suptitle(title_string, y=1.0)

    # display the query image in the top-left cell
    ax[0, 0].imshow(query_image, cmap='Greys_r')

    # fill the rows below the query row left-to-right, top-to-bottom
    for slot, neighbour_idx in enumerate(idx_closest):
        row, col = divmod(slot, xdim)
        current_image = mpimg.imread(images[neighbour_idx])
        ax[row + 1, col].imshow(current_image, cmap='Greys_r')

    # hide the axes of every cell, including those left empty
    for cell in ax.flat:
        cell.axis('off')

    plt.tight_layout(pad=0.5)
    savename = target + "/" + f'{query_image_idx:04}' + "_" + image_id + "_strip_1m.jpg"
    plt.savefig(savename, dpi=150, bbox_inches='tight')
    print("completed", index)
    print("time taken", "{:.4f}".format(time.time() - start))

In [None]:
# Peek at the first five image paths
print(images[:5])

### line version

In [None]:
def get_concatenated_images(indexes, thumb_height):
    """Join the selected images side by side into one array.

    Each image is looked up in the global ``images`` list, loaded, and
    resized to ``thumb_height`` with its width scaled by the same factor.

    Args:
        indexes: Positions in ``images`` of the pictures to join.
        thumb_height: Height in pixels every picture is resized to.

    Returns:
        The thumbnails concatenated along axis 1.
    """
    def _thumbnail(idx):
        picture = image.load_img(images[idx])
        scaled_width = int(picture.width * thumb_height / picture.height)
        return np.asarray(picture.resize((scaled_width, thumb_height)))

    return np.concatenate([_thumbnail(idx) for idx in indexes], axis=1)

In [None]:
target = "similarity_strips_100k"
if not os.path.exists(target):
    os.mkdir(target)

for index, image_name in enumerate(images[:20]):
    plt.close('all')
    # the query is the image at the current position
    query_image_idx = index
    idx_closest = get_closest_images(query_image_idx, num_results=20)
    query_image = get_concatenated_images([query_image_idx], 512)

    fig, ax = plt.subplots(5, 1)
    fig.set_size_inches(10, 10)
    fig.patch.set_facecolor('0.98')

    # first row: the query image, titled with its index and ID
    query_axis, *result_axes = ax
    image_id = filepaths[index].split(".")[0]
    query_axis.imshow(query_image)
    query_axis.set_title("query image (%d)" % query_image_idx + " - image ID: " + image_id)
    query_axis.axis('off')

    # remaining four rows: consecutive groups of five neighbours each
    for row_axis, first in zip(result_axes, range(0, 20, 5)):
        results_image = get_concatenated_images(idx_closest[first:first + 5], 512)
        row_axis.imshow(results_image)
        row_axis.axis('off')

    plt.tight_layout(pad=2.5)
    savename = target + "/" + f'{query_image_idx:04}' + "_" + image_id + "_strip_100k.jpg"
    plt.savefig(savename, dpi=150, bbox_inches='tight')

In [None]:
# do a query on a random image
query_image_idx = int(len(images) * random.random())
# no num_results given, so the function's default of 5 applies
idx_closest = get_closest_images(query_image_idx)
# query thumbnail at height 300; the neighbours share one strip at height 200
query_image = get_concatenated_images([query_image_idx], 300)
results_image = get_concatenated_images(idx_closest, 200)

# display the query image
plt.figure(figsize = (5,5))
plt.imshow(query_image)
plt.title("query image (%d)" % query_image_idx)

# display the resulting images
plt.figure(figsize = (16,12))
plt.imshow(results_image)
plt.title("result images")

### Min and max feature activation

In [None]:
# Position within min_index that the next cell displays
selection = 2

In [None]:
# NOTE(review): min_index is not defined in any cell visible here -- it has to
# come from elsewhere in the session.
# Show the image found at position `selection` of min_index
min_img = image.load_img(images[min_index[selection]])
plt.imshow(min_img)

# Plot that image's feature vector (at most the first max_range values)
# on a separate wide figure
plt.figure(figsize=(16,4))
max_range = 4096
# for p in features[top_index[0]]:
#     plt.plot(p)
plt.plot(features[min_index[selection]][:max_range])
# plt.xticks(range(max_range), range(max_range))

In [None]:
# NOTE(review): top_index is not defined in any cell visible here -- it has to
# come from elsewhere in the session.

# at most this many leading feature values are plotted per image
max_range = 4096

for index in top_index[:10]:
    # Open a fresh figure for the image. Without it, imshow() draws into the
    # current figure, which from the second iteration on is the previous
    # iteration's line plot.
    plt.figure()
    max_img = image.load_img(images[index])
    plt.imshow(max_img)

    # plot the image's feature vector on its own wide figure
    plt.figure(figsize=(16,4))
    plt.plot(features[index][:max_range])

In [None]:
# Inspect the dimensions of the feature array
features.shape

In [None]:
# For every feature column, rank the row indexes by ascending value
sorted_indexes = np.argsort(features, axis=0)
# sorted_indexes = np.argsort(pca_features, axis=0)

In [None]:
# Reverse along axis 0 so the rows with the largest values come first, then
# transpose so that sorted_indexes[k] holds the ranking for feature column k
sorted_indexes = np.flip(sorted_indexes, axis=0)
sorted_indexes = sorted_indexes.transpose()

In [None]:
# First 20 entries of the ranking stored at position 0
sorted_indexes[0][:20]

In [None]:
# Inspect the dimensions of the ranking array
sorted_indexes.shape

In [None]:
# Display the ranking array
sorted_indexes

In [None]:
target = "top_feature_activations"
# exist_ok avoids the separate existence check
os.makedirs(target, exist_ok=True)

xdim = 5  # grid columns
ydim = 5  # grid rows

# One figure per entry of sorted_indexes (at most the first 300), showing the
# images whose indexes come first in that entry.
for i, indexes in enumerate(sorted_indexes[:300]):
    plt.close('all')

    # subplots() takes (nrows, ncols); squeeze=False keeps `ax` two-dimensional
    # even when the grid has a single row or column
    fig, ax = plt.subplots(ydim, xdim, squeeze=False)
    fig.set_size_inches(10, 10)
    title_string = "top activations | feature {:04d}".format(i)
    fig.suptitle(title_string, y=1.05)

    for y in range(ydim):
        for x in range(xdim):
            # row-major position in the ranking: every grid row holds xdim images
            index = indexes[y * xdim + x]
            current_image = mpimg.imread(images[index])
            ax[y, x].imshow(current_image, cmap='Greys_r')
            ax[y, x].axis('off')

    plt.tight_layout(pad=0.5)
    savename = "{}/{:03d}_activation_100k.jpg".format(target, i)
    print(savename)
    plt.savefig(savename, dpi=150, bbox_inches='tight')

# CUT

In [None]:
# Load every similarity result and shrink it to a fixed height,
# scaling the width by the same factor
thumb_height = 100
thumbs = []
for idx in idx_closest:
    img = image.load_img(images[idx])
    thumb_width = int(img.width * thumb_height / img.height)
    img = img.resize((thumb_width, thumb_height))
    thumbs.append(img)

# Join the thumbnails side by side into a single image
concat_image = np.concatenate(list(map(np.asarray, thumbs)), axis=1)

# Show the strip and write it to disk
plt.figure(figsize = (16,12))
plt.imshow(concat_image)
plt.savefig("closest_test", dpi=150, bbox_inches='tight')

In [None]:
# Cosine distance from the query image to every row of the PCA features
query_vector = pca_features[query_image_idx]
similar_idx = [distance.cosine(query_vector, feat) for feat in pca_features]
# Rank all rows by distance and keep ranks 1-5 (rank 0 is skipped)
idx_closest = sorted(range(len(similar_idx)), key=similar_idx.__getitem__)[1:6]