In [1]:
import torch
import clip
from PIL import Image
import pandas as pd

In [2]:
# Evaluation prompts: 10 categories x 5 prompts = 50 entries. The position of a
# prompt in this list is the index used in the image file names (img<i>.png).
prompts = [
    # Fantasy
    "A majestic griffin soaring through a vibrant nebula, its feathers shimmering with starlight.",
    "A hidden dwarven city carved into a colossal mountain, bathed in the warm glow of lava.",
    "A fierce battle between a band of orcs and a group of valiant elves in an enchanted forest.",
    "A lone mermaid swimming through a bioluminescent coral reef, surrounded by colorful fish.",
    "A towering wizard casting a powerful spell, runes swirling around them in a mystical chamber.",

    # Sci-Fi
    "A sleek spaceship exploring a ringed gas giant, with a breathtaking view of the aurora borealis.",
    "A team of astronauts on a desolate alien planet, discovering a cryptic message carved into a rocky surface.",
    "A bustling cyberpunk marketplace filled with neon signs, holograms, and robotic vendors.",
    "A lone cyborg warrior standing amidst the ruins of a futuristic city, bathed in moonlight.",
    "A breathtaking view of a space colony orbiting a lush green planet, with a shimmering space station in the foreground.",

    # Nature
    "A majestic waterfall cascading down a mossy cliff face, surrounded by lush green ferns and vibrant wildflowers.",
    "A majestic lion stalking its prey through a golden savanna at sunset, with a vibrant orange sky.",
    "A lone wolf howling at the full moon, silhouetted against a snow-capped mountain range.",
    "A vibrant coral reef teeming with colorful fish, sunlight filtering through the crystal-clear water.",
    "A breathtaking panorama of a misty mountain valley with a winding river and a charming wooden bridge.",

    # Mythology
    "A fierce battle between Zeus and Poseidon, wielding lightning and a trident amidst swirling clouds.",
    "A beautiful phoenix rising from the ashes, its wings spread wide and bathed in fiery light.",
    "A wise centaur teaching a young human warrior the art of archery in a sun-dappled forest.",
    "A mischievous group of satyrs dancing with playful nymphs in a moonlit forest, surrounded by glowing mushrooms.",
    "A majestic griffin pulling the chariot of the Greek goddess Athena, soaring across a clear blue sky.",

    # History
    "A detailed portrait of a young Cleopatra adorned with gold jewelry, gazing confidently out at the viewer.",
    "A bustling marketplace in medieval Europe, with merchants selling various goods and people dressed in period clothing.",
    "A dramatic scene of the signing of the Declaration of Independence, with historical figures like John Adams and Benjamin Franklin.",
    "A pharaoh's tomb filled with golden treasures and hieroglyphics, illuminated by flickering torches.",
    "A fierce battle scene between Roman legionaries and barbarian warriors, with swords clashing and smoke filling the air.",

    # Portrait
    "A close-up portrait of an elderly Asian woman with kind eyes and a gentle smile, wrinkles etched onto her face.",
    "A young African-American man with a vibrant afro hairstyle, wearing a stylish outfit and exuding confidence.",
    "A woman with flowing red hair and freckles, wearing a flowing green dress and gazing wistfully into the distance.",
    "A man with a long, braided beard and piercing blue eyes, wearing a Viking helmet and leather armor.",
    "A child with bright green eyes and a mischievous grin, playfully peeking out from behind a colorful curtain.",

    # Surreal
    "A melting clock dripping down a cobblestone street in a dreamlike cityscape with impossible architecture.",
    "A chessboard where the pieces are giant desserts like cupcakes and castles made of ice cream.",
    "A woman with a book for a head, leaves turning its pages as she walks through a magical forest.",
    "A staircase that spirals endlessly upwards into a swirling vortex of clouds at the top.",
    "A room filled with doors leading to unexpected locations, like a bustling underwater market or a field of giant flowers.",

    # Horror
    "A dark and decaying mansion shrouded in mist, with a single glowing window revealing a shadowy figure.",
    "A group of terrified teenagers running through a dark forest, chased by a monstrous creature with glowing eyes.",
    "A close-up of a decaying zombie face, with rotting flesh and bloodshot eyes staring out.",
    "A haunted graveyard at night, with tombstones casting long shadows and a ghostly figure floating through the air.",
    "A creepy children's nursery with broken toys, rocking chairs swaying on their own, and an unsettling silence.",

    # Food
    "A slice of a freshly baked chocolate cake with rich frosting and colorful sprinkles, drizzled with melted chocolate.",
    "A steaming bowl of Japanese ramen with perfectly cooked noodles, meat slices, a soft-boiled egg, and a flavorful broth.",
    # NOTE(review): this entry previously ended at "golden french" with no
    # trailing comma, which silently merged it with the next prompt (49 items
    # instead of 50). The ending "fries." is a reconstruction - confirm it
    # matches the prompt that was actually used to generate img42.
    "A juicy burger with melted cheese, crisp lettuce, and a perfectly toasted bun, served with golden french fries.",
    "A rustic wooden table overflowing with a colorful charcuterie board, featuring cured meats, cheeses, fresh grapes, and crusty bread.",
    "A steaming cup of creamy latte art, with intricate designs like a swan or a heart floating on top.",

    # Abstract Art
    "A vibrant composition of geometric shapes in contrasting colors, creating a sense of movement and dynamism.",
    "A swirling mass of textured brushstrokes in calming earth tones, evoking a feeling of serenity and peace.",
    "A canvas filled with splatters and drips of metallic paint, capturing the raw energy and chaos of the creative process.",
    "A minimalist composition featuring a single line or dot against a vast expanse of white space, exploring the beauty of simplicity.",
    "A gradient blend of vibrant colors, transitioning seamlessly from one hue to another, creating a sense of depth and mystery.",
]

# Forms vote information

In [3]:
# Integer indices 0..49, one per prompt/image.
l = list(range(50))

# CLIP Scores

In [4]:
# Loaded CLIP models keyed by (model name, device), so the weights are read
# once instead of on every get_clip_score() call.
_CLIP_MODEL_CACHE = {}


def _load_clip(model_name, device):
    """Return the ``(model, preprocess)`` pair for ``model_name``, loading it on first use."""
    key = (model_name, device)
    if key not in _CLIP_MODEL_CACHE:
        _CLIP_MODEL_CACHE[key] = clip.load(model_name, device=device)
    return _CLIP_MODEL_CACHE[key]


def get_clip_score(image_path, text):
    """Score how well an image matches a text prompt using CLIP (ViT-B/32).

    Args:
        image_path: Path of the image file to score.
        text: Prompt to compare the image against.

    Returns:
        float: Cosine similarity between the CLIP image embedding and the
        CLIP text embedding (both L2-normalised before the dot product).
    """
    # Run on the GPU when one is available, otherwise on the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = _load_clip('ViT-B/32', device)

    # Preprocess the image (closing the file afterwards) and tokenize the text.
    with Image.open(image_path) as image:
        image_input = preprocess(image).unsqueeze(0).to(device)
    text_input = clip.tokenize([text]).to(device)

    with torch.no_grad():
        # Generate embeddings for the image and text.
        image_features = model.encode_image(image_input)
        text_features = model.encode_text(text_input)

        # Normalise so that the dot product below is a cosine similarity.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        clip_score = torch.matmul(image_features, text_features.T).item()

    return clip_score

In [5]:
def _clip_scores_for(folder):
    """Return the CLIP score of ``<folder>/img<i>.png`` against ``prompts[i]`` for every prompt."""
    # Forward slashes work on every OS; the previous "\i" was an invalid
    # escape sequence and only resolved to a path on Windows.
    return [
        get_clip_score(folder + "/img" + str(i) + ".png", prompts[i])
        for i in range(len(prompts))
    ]


df = pd.DataFrame({"prompts": prompts})

dpo_scores = _clip_scores_for("DPO_imgs")
df["clip_dpo"] = dpo_scores

rlhf_scores = _clip_scores_for("RLHF_imgs")
df["clip_rlhf"] = rlhf_scores

# True where the DPO image scored strictly higher than the RLHF image.
df["dpo_better"] = df["clip_dpo"] > df["clip_rlhf"]

In [6]:
# According to CLIP, DPO scored higher than RLHF on 37 of the 50 prompts.
clip_total = df.shape[0]
clip_dpo_wins = int(df["dpo_better"].sum())
clip_rlhf_wins = int((~df["dpo_better"]).sum())
print(
    "DPO was better : ",
    (clip_dpo_wins / clip_total) * 100,
    "%\nRLHF was better : ",
    (clip_rlhf_wins / clip_total) * 100,
    "%",
)

DPO was better :  74.0 %
RLHF was better :  26.0 %


Computing the score with a cosine-similarity function instead of PyTorch's `matmul` on the normalized features gave the same result.

# Frechet Inception Distance (FID)

In [7]:
import numpy as np
from numpy import cov
from numpy import trace
from numpy import iscomplexobj
from numpy import asarray
from numpy.random import shuffle
from scipy.linalg import sqrtm
from keras.applications.inception_v3 import InceptionV3
from keras.applications.inception_v3 import preprocess_input
from skimage.transform import resize

In [8]:
def calculate_fid(model, images1, images2):
    """Compute the Frechet Inception Distance between two image batches.

    Args:
        model: Object whose ``predict`` method maps a batch of images to a
            2-D array of activations (one row per image).
        images1: First batch of images, passed to ``model.predict``.
        images2: Second batch of images, passed to ``model.predict``.

    Returns:
        ``||mu1 - mu2||^2 + Tr(sigma1 + sigma2 - 2 * sqrt(sigma1 @ sigma2))``
        where ``mu``/``sigma`` are the mean and covariance of the activations.
    """

    def _moments(batch):
        # Mean vector and feature covariance (columns are variables).
        activations = model.predict(batch)
        return activations.mean(axis=0), cov(activations, rowvar=False)

    mu1, sigma1 = _moments(images1)
    mu2, sigma2 = _moments(images2)

    # Squared distance between the two mean vectors.
    mean_gap = mu1 - mu2
    ssdiff = np.sum(mean_gap ** 2.0)

    # Matrix square root of the covariance product; numerical error can leave
    # an imaginary component, in which case only the real part is kept.
    covmean = sqrtm(sigma1.dot(sigma2))
    covmean = covmean.real if iscomplexobj(covmean) else covmean

    return ssdiff + trace(sigma1 + sigma2 - 2.0 * covmean)

In [9]:
def scale_images(images, new_shape):
    """Resize every image in ``images`` to ``new_shape`` and return them as one array.

    Each image is passed to ``resize`` with ``0`` as the third positional
    argument (the interpolation order in scikit-image).
    """
    return asarray([resize(picture, new_shape, 0) for picture in images])

In [10]:
# InceptionV3 feature extractor for FID: no classification head
# (include_top=False), average pooling on the output, 299x299 RGB inputs.
model = InceptionV3(
    input_shape=(299, 299, 3),
    include_top=False,
    pooling='avg',
)

In [11]:
def _load_image_stack(folder):
    """Load ``<folder>/img<i>.png`` for every prompt index into one float32 array."""
    stack = []
    for i in range(len(prompts)):
        # Forward slashes work on every OS; the previous "\i" was an invalid
        # escape sequence and only resolved to a path on Windows.
        with Image.open(folder + "/img" + str(i) + ".png") as img:
            stack.append(np.array(img))
    return np.asarray(stack).astype('float32')


images1 = _load_image_stack("DPO_imgs")    # DPO outputs
images2 = _load_image_stack("RLHF_imgs")   # RLHF outputs
images3 = _load_image_stack("SDXL_imgs")   # SDXL reference outputs

print('Loaded', images1.shape, images2.shape, images3.shape)


Loaded (50, 512, 512, 3) (50, 512, 512, 3) (50, 1024, 1024, 3)


In [12]:
# Bring all three image sets to the 299x299x3 input size of the Inception model.
images1, images2, images3 = [
    scale_images(batch, (299,299,3)) for batch in (images1, images2, images3)
]
print('Scaled', images1.shape, images2.shape,images3.shape)

Scaled (50, 299, 299, 3) (50, 299, 299, 3) (50, 299, 299, 3)


In [13]:
# Apply the InceptionV3 input preprocessing to each image set.
images1, images2, images3 = [
    preprocess_input(batch) for batch in (images1, images2, images3)
]

# FID of each fine-tuned model's images against the SDXL images.
fid = calculate_fid(model, images1, images3)
print('FID DPO vs SDXL: %.3f' % fid)

fid = calculate_fid(model, images2, images3)
print('FID RLHF vs SDXL: %.3f' % fid)

FID DPO vs SDXL: 215.046
FID RLHF vs SDXL: 246.931


RLHF has the higher FID against SDXL (246.9 vs. 215.0), which shows that the RLHF image distribution is further from SDXL's than the DPO distribution is, **meaning DPO is more like SDXL than RLHF is**.

# Structural Similarity Index Measure (SSIM)

In [14]:
import torch  
import torch.nn.functional as F 
import numpy as np
import math
from PIL import Image
import cv2

In [15]:
def gaussian(window_size, sigma):
    """Return a 1-D Gaussian kernel of length ``window_size`` that sums to 1.

    The kernel is centred on index ``window_size // 2`` and has standard
    deviation ``sigma``.
    """
    center = window_size // 2
    weights = []
    for x in range(window_size):
        weights.append(math.exp(-(x - center)**2/float(2*sigma**2)))
    kernel = torch.Tensor(weights)
    # Normalise so the weights add up to one.
    return kernel/kernel.sum()

def create_window(window_size, channel=1):
    """Build a 2-D Gaussian window (sigma=1.5) replicated once per channel.

    Returns a tensor of shape ``(channel, 1, window_size, window_size)``.
    """
    # Column vector holding the 1-D Gaussian weights.
    column = gaussian(window_size=window_size, sigma=1.5).unsqueeze(1)
    # The outer product gives the 2-D kernel; add two leading singleton dims.
    kernel_2d = torch.mm(column, column.t()).float()
    kernel_2d = kernel_2d.unsqueeze(0).unsqueeze(0)
    # One copy of the kernel per channel.
    per_channel = kernel_2d.expand(channel, 1, window_size, window_size).contiguous()
    return torch.Tensor(per_channel)

def ssim(img1, img2, val_range, window_size=11, window=None, size_average=True, full=False):
    """Compute the structural similarity index (SSIM) between two image tensors.

    Args:
        img1: Image tensor with dimensions (batch, channels, height, width),
            or (channels, height, width), which is treated as a batch of one.
        img2: Image tensor compared against ``img1``; same layout rules.
        val_range: Accepted for interface compatibility but NOT used. The
            stability constants are fixed at ``0.01 ** 2`` and ``0.03 ** 2``.
            NOTE(review): that corresponds to a dynamic range of 1, i.e.
            inputs scaled to [0, 1] - confirm that callers scale accordingly.
        window_size: Side length of the Gaussian window built when ``window``
            is None; it is clipped to the image height and width.
        window: Optional filter of shape (channels, 1, k, k) that is used in
            place of the Gaussian window.
        size_average: If True, return the mean over every element; otherwise
            return one mean per batch entry.
        full: If True, also return the mean contrast term.

    Returns:
        The SSIM value (scalar tensor, or one value per batch entry when
        ``size_average`` is False); a ``(ssim, contrast)`` tuple if ``full``.

    Raises:
        ValueError: If ``img1`` is neither 3-D nor 4-D.
    """
    # Promote unbatched (C, H, W) inputs to a batch of one so that the
    # per-sample reduction below also works for them.
    if img1.dim() == 3:
        img1 = img1.unsqueeze(0)
    if img2.dim() == 3:
        img2 = img2.unsqueeze(0)
    if img1.dim() != 4:
        raise ValueError(
            "ssim expects 3-D or 4-D image tensors, got %d dimensions" % img1.dim()
        )
    _, channels, height, width = img1.size()

    # If no window is provided, build one that is no larger than the image.
    if window is None:
        real_size = min(window_size, height, width)
        window = create_window(real_size, channel=channels).to(img1.device)

    # Pad according to the window actually used (it may have been clipped or
    # supplied by the caller), not the nominal window_size.
    pad = window.shape[-1] // 2

    # Local means (luminance component) via the window filter.
    mu1 = F.conv2d(img1, window, padding=pad, groups=channels)
    mu2 = F.conv2d(img2, window, padding=pad, groups=channels)
    mu1_sq = mu1 ** 2
    mu2_sq = mu2 ** 2
    mu12 = mu1 * mu2

    # Local variances and covariance (contrast / structure components).
    sigma1_sq = F.conv2d(img1 * img1, window, padding=pad, groups=channels) - mu1_sq
    sigma2_sq = F.conv2d(img2 * img2, window, padding=pad, groups=channels) - mu2_sq
    sigma12 = F.conv2d(img1 * img2, window, padding=pad, groups=channels) - mu12

    # Constants for numerical stability (deliberately not scaled by val_range).
    C1 = (0.01) ** 2
    C2 = (0.03) ** 2

    numerator1 = 2 * mu12 + C1
    numerator2 = 2.0 * sigma12 + C2
    denominator1 = mu1_sq + mu2_sq + C1
    denominator2 = sigma1_sq + sigma2_sq + C2
    ssim_score = (numerator1 * numerator2) / (denominator1 * denominator2)

    if size_average:
        ret = ssim_score.mean()
    else:
        ret = ssim_score.mean(1).mean(1).mean(1)

    if full:
        contrast_metric = torch.mean(numerator2 / denominator2)
        return ret, contrast_metric
    return ret

In [16]:
def load_images(x):
    """Open the image at path ``x``, resize it to ``(480, 640)`` and return it as a NumPy array.

    Pillow's ``resize`` takes ``(width, height)``. The file is closed before
    returning, and the returned array is a writable copy.
    """
    with Image.open(x) as img:
        return np.array(img.resize((480, 640)))


def tensorify(x):
    """Convert an (H, W, C) image array into a (1, C, H, W) float tensor scaled by 1/255."""
    # Copy into a writable, C-contiguous (C, H, W) array first: wrapping a
    # read-only NumPy array in a tensor triggers PyTorch's
    # "given NumPy array is not writable" warning.
    chw = np.array(x.transpose((2, 0, 1)), order="C")
    return torch.from_numpy(chw).unsqueeze(0).float().div(255.0)

In [17]:
# SSIM of every RLHF and DPO image against the SDXL image for the same prompt.
ssim_rlhf = []
ssim_dpo = []
for idx in range(len(prompts)):
    # Load the three images for this prompt and convert them to tensors.
    rlhf_tensor = tensorify(load_images("RLHF_imgs/img"+str(idx)+".png"))
    dpo_tensor = tensorify(load_images("DPO_imgs/img"+str(idx)+".png"))
    sdxl_tensor = tensorify(load_images("SDXL_imgs/img"+str(idx)+".png"))

    ssim_rlhf.append(float(ssim(rlhf_tensor, sdxl_tensor, val_range=255)))
    ssim_dpo.append(float(ssim(dpo_tensor, sdxl_tensor, val_range=255)))

df["SSIM_rlhf"] = ssim_rlhf
df["SSIM_dpo"] = ssim_dpo

  tensorify = lambda x: torch.Tensor(x.transpose((2, 0, 1))).unsqueeze(0).float().div(255.0)


In [18]:
# Ties count in favour of DPO (>=).
df["dpo_better_ssim"] = df["SSIM_dpo"].ge(df["SSIM_rlhf"])

ssim_total = df.shape[0]
ssim_dpo_wins = int(df["dpo_better_ssim"].sum())
ssim_rlhf_wins = int((~df["dpo_better_ssim"]).sum())
print(
    "DPO was more structurally similar to  SDXL: ",
    (ssim_dpo_wins / ssim_total) * 100,
    "%\nRLHF was more structurally similar to SDXL : ",
    (ssim_rlhf_wins / ssim_total) * 100,
    "%",
)

DPO was more structurally similar to  SDXL:  98.0 %
RLHF was more structurally similar to SDXL :  2.0 %


In [19]:
# Write the per-prompt results table to an Excel workbook.
df.to_excel(excel_writer="Report 1.xlsx")

In [52]:
# Spot-check: display the prompt stored at index 7.
prompts[7]

'A bustling cyberpunk marketplace filled with neon signs, holograms, and robotic vendors.'

In [1]:
# 1 * prompt adherence
# 2 * comparative analysis with sdxl
# 1 * prompt adherence from vote
# 1 * favourability index from vote

SyntaxError: invalid syntax (418614619.py, line 1)