In [None]:
!pip install --upgrade diffusers transformers scipy
!pip install accelerate
!pip install pillow
!pip install sentence_transformers

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/tokens .
    
Token: 
Add token as git credential? (Y/n) n
Token is valid.
Your token has been saved to /root/.huggingface/token
Login successful


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
from torch import autocast
from diffusers import StableDiffusionPipeline
import tensorflow as tf
from PIL import Image
import numpy as np

In [None]:
VGG16 = tf.keras.applications.VGG16(include_top=True, weights='imagenet')
x = VGG16.layers[-2].output
VGG16_MODEL = tf.keras.Model(inputs = VGG16.input, outputs = x)
VGG16_MODEL.trainable = False
VGG16_MODEL.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     14758

# Load the images

In [47]:
from PIL import Image
import glob
same_filenames = []
same_images_list = []
for filename in glob.glob("/content/drive/MyDrive/523/dalle2/same/*.png"):
  same_filenames.append(filename)
same_filenames = sorted(same_filenames)
for filename in same_filenames:
  im=Image.open(filename)
  same_images_list.append(im)

mod_filenames = []
mod_images_list = []
for filename in glob.glob("/content/drive/MyDrive/523/dalle2/modified/*.png"):
  mod_filenames.append(filename)
mod_filenames = sorted(mod_filenames)
for filename in mod_filenames:
  im=Image.open(filename)
  mod_images_list.append(im)
  
import copy, random
diff_images_list = copy.deepcopy(mod_images_list)
random.shuffle(diff_images_list)

In [48]:
same_images_list = same_images_list[:200]
mod_images_list = mod_images_list[:200]
diff_images_list = diff_images_list[:200]

In [49]:
len(mod_images_list)

200

# Similarity metrics

## L2 Norm

In [51]:
# Helper functions
def image_to_feature_vector(img):
    return np.array(tf.squeeze(VGG16_MODEL(tf.expand_dims(img, 0))))

from numpy.linalg import norm
def similarity(v1, v2):
    v1 /= norm(v1)
    v2 /= norm(v2)
    return 1. - norm(v2 - v1)

In [None]:
def get_similarity(img1, img2):
  mean = np.array([0.485, 0.456, 0.406])
  std = np.array([0.229, 0.224, 0.225])
  try:
    image1 = tf.image.resize_with_pad(img1, 224, 224)
    # Normalize
    image1 = tf.math.divide(image1, 255.)
    image1 = tf.math.subtract(image1, mean)
    image1 = tf.math.divide(image1, std)
    image1 = image_to_feature_vector(image1)

    image2 = tf.image.resize_with_pad(img2, 224, 224)
    # Normalize
    image2 = tf.math.divide(image2, 255.)
    image2 = tf.math.subtract(image2, mean)
    image2 = tf.math.divide(image2, std)
    image2 = image_to_feature_vector(image2)

    sim = similarity(image1, image2)
  except:
    return 0
  return sim

## Cosine similarity

In [None]:
from scipy.spatial.distance import cosine
def get_cosine_sim(img1, img2):
  try:
    img1 = image_to_feature_vector(img1)
    img2 = image_to_feature_vector(img2)
    cos_sim = 1 - cosine(img1, img2)
  except:
    return 0
  return cos_sim

## Inception score

In [53]:
# from tf.keras.applications.inception_v3 import InceptionV3
# from tf.keras.applications.inception_v3 import preprocess_input
from math import floor
from numpy import ones
from numpy import expand_dims
from numpy import log
from numpy import mean
from numpy import std
from numpy import exp
# load inception v3 model
inception_model = tf.keras.applications.inception_v3.InceptionV3()
def calculate_inception_score(images, eps=1E-16):
  try:
    images = np.array(images)
    processed = images.astype('float32')
    # pre-process raw images for inception v3 model
    processed = tf.keras.applications.inception_v3.preprocess_input(processed)
    # predict class probabilities for images
    yhat = inception_model.predict(processed, verbose=False)
    # enumerate splits of images/predictions
    p_yx = yhat
    # calculate p(y)
    p_y = expand_dims(p_yx.mean(axis=0), 0)
    # calculate KL divergence using log probabilities
    kl_d = p_yx * (log(p_yx + eps) - log(p_y + eps))
    # sum over classes
    sum_kl_d = kl_d.sum(axis=1)
    # average over images
    avg_kl_d = mean(sum_kl_d)
    # undo the log
    is_score = exp(avg_kl_d)
  except:
    return 0
  return is_score

In [54]:
from tensorflow.keras.utils import img_to_array
def get_average_is(images):
  inception_scores = []
  for i in range(0, len(images), 2):
    img1 = copy.deepcopy(images[i])
    img2 = copy.deepcopy(images[i+1])
    size = 299, 299
    img1.thumbnail(size)
    img2.thumbnail(size)
    np_img1 = img_to_array(img1)
    np_img2 = img_to_array(img2)
    is_score = calculate_inception_score([np_img1, np_img2])
    if is_score == 0:
      continue
    inception_scores.append(is_score)
  return np.mean(np.array(inception_scores)), np.var(np.array(inception_scores))

In [55]:
is_same_mean, is_same_var = get_average_is(same_images_list)

In [56]:
print("Mean inception score for same prompt pairs:", is_same_mean)
print("Variance of the inception scores for same prompt pairs:", is_same_var)

Mean inception score for same prompt pairs: 1.4723375
Variance of the inception scores for same prompt pairs: 0.05360793


In [57]:
is_mod_mean, is_mod_var = get_average_is(mod_images_list)

  images = np.array(images)


In [58]:
print("Mean inception score for prompt pairs with small modifications:", is_mod_mean)
print("Variance of the inception scores for prompt pairs with small modifications:", is_mod_var)

Mean inception score for prompt pairs with small modifications: 1.4700022
Variance of the inception scores for prompt pairs with small modifications: 0.053097744


In [59]:
is_diff_mean, is_diff_var= get_average_is(diff_images_list)

  images = np.array(images)


In [60]:
print("Mean inception score for different prompt pairs:", is_diff_mean)
print("Variance of the inception scores for different prompt pairs:", is_diff_var)

Mean inception score for different prompt pairs: 1.6778843
Variance of the inception scores for different prompt pairs: 0.03162907


# VGG16 Feature vector

In [61]:
import copy
from tqdm import tqdm
def vgg16_similarity(images):
  l2_sim_scores = []
  cosine_sim_scores = []
  fad_scores = []
  for i in tqdm(range(0, len(images), 2)):
    img1 = copy.deepcopy(images[i])
    img2 = copy.deepcopy(images[i+1])
    size = 224, 224
    img1.thumbnail(size)
    img2.thumbnail(size)
    np_img1 = img_to_array(img1)
    np_img2 = img_to_array(img2)
    # L2 Norm
    l2_sim = get_similarity(np_img1, np_img2)
    # Cosine similarity
    cosine_sim = get_cosine_sim(np_img1, np_img2)
    if l2_sim != 0 and cosine_sim != 0:
      l2_sim_scores.append(l2_sim)
      cosine_sim_scores.append(cosine_sim)

  return np.mean(np.array(l2_sim_scores)), np.var(np.array(l2_sim_scores)), np.mean(np.array(cosine_sim_scores)), np.var(np.array(cosine_sim_scores))

In [62]:
vgg_l2_same_mean, vgg_l2_same_var, vgg_cosine_same_mean, vgg_cosine_same_var = vgg16_similarity(same_images_list)

100%|██████████| 100/100 [00:07<00:00, 13.08it/s]


In [63]:
print("Mean L2 Norm similarity score for same prompt pairs:", vgg_l2_same_mean)
print("Variance of the L2 Norm similarity scores for same prompt pairs:", vgg_l2_same_var)
print("Mean cosine similarity score for same prompt pairs:", vgg_cosine_same_mean)
print("Variance of the cosine similarity scores for same prompt pairs:", vgg_cosine_same_var)

Mean L2 Norm similarity score for same prompt pairs: 0.6719781000912189
Variance of the L2 Norm similarity scores for same prompt pairs: 0.004714547806344767
Mean cosine similarity score for same prompt pairs: 0.4714433901011944
Variance of the cosine similarity scores for same prompt pairs: 0.019396120419796544


In [64]:
vgg_l2_mod_mean, vgg_l2_mod_var, vgg_cosine_mod_mean, vgg_cosine_mod_var = vgg16_similarity(mod_images_list)

100%|██████████| 100/100 [00:06<00:00, 14.45it/s]


In [65]:
print("Mean L2 Norm similarity score for prompt pairs with small modifications:", vgg_l2_mod_mean)
print("Variance of the L2 Norm similarity scores for prompt pairs with small modifications:", vgg_l2_mod_var)
print("Mean cosine similarity score for prompt pairs with small modifications:", vgg_cosine_mod_mean)
print("Variance of the cosine similarity scores for prompt pairs with small modifications:", vgg_cosine_mod_var)

Mean L2 Norm similarity score for prompt pairs with small modifications: 0.6779735636229467
Variance of the L2 Norm similarity scores for prompt pairs with small modifications: 0.0022308997715458478
Mean cosine similarity score for prompt pairs with small modifications: 0.48161510701733407
Variance of the cosine similarity scores for prompt pairs with small modifications: 0.009824593542931425


In [66]:
vgg_l2_diff_mean, vgg_l2_diff_var, vgg_cosine_diff_mean, vgg_cosine_diff_var = vgg16_similarity(diff_images_list)

100%|██████████| 100/100 [00:06<00:00, 14.45it/s]


In [67]:
print("Mean L2 Norm similarity score for different prompt pairs:", vgg_l2_diff_mean)
print("Variance of the L2 Norm similarity scores for different prompt pairss:", vgg_l2_diff_var)
print("Mean cosine similarity score for different prompt pairs:", vgg_cosine_diff_mean)
print("Variance of the cosine similarity scores for different prompt pairs:", vgg_cosine_diff_var)

Mean L2 Norm similarity score for different prompt pairs: 0.5751468084677301
Variance of the L2 Norm similarity scores for different prompt pairss: 0.003407199132593238
Mean cosine similarity score for different prompt pairs: 0.2900132770189131
Variance of the cosine similarity scores for different prompt pairs: 0.0088884593958427


# Using CLIP Model's dense representations 

In [None]:
from sentence_transformers import SentenceTransformer, util
from PIL import Image
import glob
import os

# Load the OpenAI CLIP Model
print('Loading CLIP Model...')
model = SentenceTransformer('clip-ViT-B-32')

Loading CLIP Model...


Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/605M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/604 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/961k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

In [None]:
def get_clip_encoding(img1, img2):
  encoded_image = model.encode([img1, img2], batch_size=2, convert_to_tensor=True)
  return encoded_image

In [68]:
import copy
from tqdm import tqdm
from scipy.spatial import distance
def clip_similarity(images):
  l2_sim_scores = []
  cosine_sim_scores = []
  inception_scores = []
  fad_scores = []
  for i in tqdm(range(0, len(images), 2)):
    img1 = copy.deepcopy(images[i])
    img2 = copy.deepcopy(images[i+1])
    encoded_image = get_clip_encoding(img1, img2)
    l2_sim = similarity(encoded_image[0].cpu(), encoded_image[1].cpu())
    cosine_sim = 1 - cosine(encoded_image[0].cpu(), encoded_image[1].cpu())
    # print(l2_sim, cosine_sim)
    if l2_sim != 0 and cosine_sim != 0:
      l2_sim_scores.append(l2_sim)
      cosine_sim_scores.append(cosine_sim)
  return np.mean(np.array(l2_sim_scores)), np.var(np.array(l2_sim_scores)), np.mean(np.array(cosine_sim_scores)), np.var(np.array(cosine_sim_scores))

In [69]:
clip_same_mean, clip_same_var, clip_cosine_same_mean, clip_cosine_same_var = clip_similarity(same_images_list)

100%|██████████| 100/100 [00:05<00:00, 17.35it/s]


In [70]:
print("Mean L2 Norm similarity score for same prompt pairs:", clip_same_mean)
print("Variance of the L2 Norm similarity scores for same prompt pairs:", clip_same_var)
print("Mean cosine similarity score for same prompt pairs:", clip_cosine_same_mean)
print("Variance of the cosine similarity scores for same prompt pairs:", clip_cosine_same_var)

Mean L2 Norm similarity score for same prompt pairs: 0.3510187005996704
Variance of the L2 Norm similarity scores for same prompt pairs: 0.03017722534591839
Mean cosine similarity score for same prompt pairs: 0.7743230333924294
Variance of the cosine similarity scores for same prompt pairs: 0.014718721778409988


In [71]:
clip_mod_mean, clip_mod_var, clip_cosine_mod_mean, clip_cosine_mod_var = clip_similarity(mod_images_list)

100%|██████████| 100/100 [00:05<00:00, 17.57it/s]


In [72]:
print("Mean L2 Norm similarity score for prompt pairs with small modifications:", clip_mod_mean)
print("Variance of the L2 Norm similarity scores for prompt pairs with small modifications:", clip_mod_var)
print("Mean cosine similarity score for prompt pairs with small modifications:", clip_cosine_mod_mean)
print("Variance of the cosine similarity scores for prompt pairs with small modifications:", clip_cosine_mod_var)

Mean L2 Norm similarity score for prompt pairs with small modifications: 0.4015511813759804
Variance of the L2 Norm similarity scores for prompt pairs with small modifications: 0.013363447495277198
Mean cosine similarity score for prompt pairs with small modifications: 0.8142477709054947
Variance of the cosine similarity scores for prompt pairs with small modifications: 0.00544237714343878


In [73]:
clip_diff_mean, clip_diff_var, clip_cosine_diff_mean, clip_cosine_diff_var = clip_similarity(diff_images_list)

100%|██████████| 100/100 [00:05<00:00, 17.38it/s]


In [74]:
print("Mean L2 Norm similarity score for different prompt pairs:", clip_diff_mean)
print("Variance of the L2 Norm similarity scores for different prompt pairss:", clip_diff_var)
print("Mean cosine similarity score for different prompt pairs:", clip_cosine_diff_mean)
print("Variance of the cosine similarity scores for different prompt pairs:", clip_cosine_diff_var)

Mean L2 Norm similarity score for different prompt pairs: 0.08022803246974945
Variance of the L2 Norm similarity scores for different prompt pairss: 0.007533375616255673
Mean cosine similarity score for different prompt pairs: 0.5732430723309517
Variance of the cosine similarity scores for different prompt pairs: 0.006119840114109992
