# Evaluate Stable Diffusion Images

## Imports and load data

In [1]:
import os
import pandas as pd
import relational_image_generation_evaluation as rige
ROOT_PATH = os.path.join(os.getcwd(), '..')
EVALUATION_PATH = os.path.join(ROOT_PATH, 'datasets', 'evaluations')
TO_EVALUATE = ['fa_maps-l', 'base-l', 'fa-acc_maps-l']



In [2]:
import relational_image_generation_evaluation as rige
adv_dataset = rige.get_adversarial_attribute_dataset()
orig_prompts, adv_prompts = rige.get_adv_prompt_list('attributes')
for prompt, graph_dict in zip(orig_prompts, adv_dataset):
    assert prompt.split(' ')[0] in graph_dict['original_graph'].caption
index_to_graph_dict = {i: graph_dict for i, graph_dict in enumerate(adv_dataset)}

In [3]:
def get_stablediffusion_images(image_folder_path, modelname):
    images = []
    for image_path in os.listdir(image_folder_path):
        if not image_path.endswith('.png'):
            continue
        # the image name has the form: index-og/adv_seed_I/II.png. We extract the index, og/adv, seed and I/II
        index = int(image_path.split('-')[0])
        og_adv = image_path.split('-')[1].split('_')[0]
        seed = image_path.split('_')[1]
        resolution = image_path.split('_')[2].split('.')[0]
        image_path = os.path.join(image_folder_path, image_path)
        # add the image to the dataframe
        images.append({
            'index': index, 
            'og_adv': og_adv, 
            'seed': seed, 
            'resolution': resolution, 
            'modelname': modelname, 
            'image_path': image_path,
            'original_graph': index_to_graph_dict[(index)]['original_graph'],
            'adv_graph': index_to_graph_dict[(index)]['adv_graph'],
        })
    print(len(images))
    return images
all_images = []
for foldername in TO_EVALUATE:
    path = os.path.join(EVALUATION_PATH, foldername)
    dictlist = get_stablediffusion_images(path, foldername)
    all_images += dictlist
# convert the dictlist to a dataframe
all_images = pd.DataFrame(all_images)
display(all_images)
print(all_images.shape)
# print the type of the original graph in the first row
print(type(all_images['original_graph'][0]))

1780
1780
1780


Unnamed: 0,index,og_adv,seed,resolution,modelname,image_path,original_graph,adv_graph
0,85,adv,81634,II,fa_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(1063794, 1063798)","(1063794, 1063798)"
1,12,og,9904,I,fa_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(1075423, 1075453)","(1075423, 1075453)"
2,34,adv,69919,II,fa_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(1082031, 1082037)","(1082031, 1082037)"
3,21,adv,20138,II,fa_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(4460919, 4460945)","(4460919, 4460945)"
4,87,adv,16331,II,fa_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(389390, 389404)","(389390, 389404)"
...,...,...,...,...,...,...,...,...
5335,73,adv,40573,I,fa-acc_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(1062580, 1062590)","(1062580, 1062590)"
5336,36,adv,24219,II,fa-acc_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(1084277, 1084278)","(1084277, 1084278)"
5337,54,adv,3324,II,fa-acc_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(250433, 250441)","(250433, 250441)"
5338,89,adv,34489,I,fa-acc_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(393586, 393591)","(393586, 393591)"


(5340, 8)
<class 'networkx.classes.digraph.DiGraph'>


## Evaluation Method
We evaluate with the following method: We take the original graph and check if the original prompt image resembles it better than the adversarial prompt image.

That means, for each original prompt (i.e. for each index), we check the two images. We do this first only for low resolution and for each modelname

In [4]:
# get all indexes in the dataframe
indexes = all_images['index'].unique()
print('Evaluation on {} indexes'.format(len(indexes)))
print("max index: {}".format(max(indexes)))
print("min index: {}".format(min(indexes)))
print("missing indexes from 0 to inclusive 100: {}".format(set(range(101)) - set(indexes)))

Evaluation on 89 indexes
max index: 99
min index: 0
missing indexes from 0 to inclusive 100: {32, 100, 77, 79, 19, 23, 24, 90, 91, 61, 94, 95}


In [5]:
from PIL import Image
import numpy as np
def get_image_score(evaluator, graph, image_paths):
    # load the image
    images = []
    for image_path in image_paths:
        image = Image.open(image_path)
        images.append(image)
    # evaluate the image
    scores = evaluator(images, [graph for _ in range(len(images))])['attr_scores']
    # get the mean score
    # score = np.mean(scores)
    return scores

def get_scores(evaluator, modelname, resolution):
    # get all images with the given modelname and resolution
    images = all_images[(all_images['modelname'] == modelname) & (all_images['resolution'] == resolution)]
    # get all indexes
    indexes = images['index'].unique()
    # get the accuracy for each index
    scores = []
    for index in indexes:
        # get the original graph
        original_graph = images[images['index'] == index]['original_graph']
        assert len(set(g.caption for g in original_graph)) == 1
        original_graph = original_graph.iloc[0]
        adv_graph = images[images['index'] == index]['adv_graph'].iloc[0]
        # get the image paths, with different seeds
        og_image_paths = images[(images['index'] == index) & (images['og_adv'] == 'og')]['image_path'].values
        adv_image_paths = images[(images['index'] == index) & (images['og_adv'] == 'adv')]['image_path'].values
        # get the scores
        orig_graph_orig_img = get_image_score(evaluator, original_graph, og_image_paths)
        adv_graph_orig_img = get_image_score(evaluator, adv_graph, og_image_paths)
        orig_graph_adv_img = get_image_score(evaluator, original_graph, adv_image_paths)
        adv_graph_adv_img = get_image_score(evaluator, adv_graph, adv_image_paths)
        # get the accuracy
        scores.append({
            'index': index,
            'modelname': modelname,
            'resolution': resolution,
            'orig_graph_orig_img': orig_graph_orig_img,
            'orig_graph_adv_img': orig_graph_adv_img,
            'adv_graph_orig_img': adv_graph_orig_img,
            'adv_graph_adv_img': adv_graph_adv_img,
        })
    return scores

In [6]:
evaluator = rige.Evaluator('ViT-L/14')

Using device cuda:0 for evaluation.


Using text embeddings as input to the model.


In [7]:
score_dict = []
for modelname in TO_EVALUATE:
    for resolution in ['I', 'II']:
        scores = get_scores(evaluator, modelname, resolution)
        score_dict += scores
        print('Done with {} {}'.format(modelname, resolution))
# convert the dict to a dataframe
score_df = pd.DataFrame(score_dict)

Done with fa_maps-l I
Done with fa_maps-l II
Done with base-l I
Done with base-l II
Done with fa-acc_maps-l I
Done with fa-acc_maps-l II


In [8]:
# save the dataframe
display(score_df)
score_df.to_csv(os.path.join(EVALUATION_PATH, 'stablediffusion_scores.csv'), index=False)

Unnamed: 0,index,modelname,resolution,orig_graph_orig_img,orig_graph_adv_img,adv_graph_orig_img,adv_graph_adv_img
0,12,fa_maps-l,I,"[0.9110567569732666, 0.2085813581943512, 0.846...","[0.41327694058418274, 0.3210293650627136, 0.05...","[0.1517191380262375, 0.9044969081878662, 0.212...","[0.41431713104248047, 0.7648022174835205, 0.87..."
1,35,fa_maps-l,I,"[0.6436792016029358, 0.3527820408344269, 0.318...","[0.045505091547966, 0.00927270669490099, 0.008...","[0.026472220197319984, 0.012272626161575317, 0...","[0.5355663895606995, 0.4186086654663086, 0.314..."
2,73,fa_maps-l,I,"[0.22231510281562805, 0.1891142725944519, 0.24...","[0.02679286152124405, 0.06348323076963425, 0.0...","[0.057981546968221664, 0.05751557648181915, 0....","[0.20694607496261597, 0.31186601519584656, 0.3..."
3,84,fa_maps-l,I,"[0.5530276298522949, 0.010326573625206947, 0.8...","[0.25072401762008667, 0.006587476469576359, 0....","[0.3398905098438263, 0.4908527731895447, 0.043...","[0.8446298837661743, 0.4017374813556671, 0.253..."
4,38,fa_maps-l,I,"[0.03150453418493271, 0.03358066454529762, 0.0...","[0.041861191391944885, 0.053201813250780106, 0...","[0.054191794246435165, 0.040819115936756134, 0...","[0.0922757238149643, 0.05250931158661842, 0.06..."
...,...,...,...,...,...,...,...
529,13,fa-acc_maps-l,II,"[0.5499901175498962, 0.6889024376869202, 0.513...","[0.04960433021187782, 0.11156313121318817, 0.1...","[0.03618011251091957, 0.0064824954606592655, 0...","[0.5016810297966003, 0.47700437903404236, 0.15..."
530,63,fa-acc_maps-l,II,"[0.7237799763679504, 0.3732290267944336, 0.413...","[0.023193495348095894, 0.0782419815659523, 0.0...","[0.030534837394952774, 0.13410690426826477, 0....","[0.5199568867683411, 0.47709494829177856, 0.61..."
531,11,fa-acc_maps-l,II,"[0.2699306309223175, 0.6769900321960449, 0.661...","[0.1028711199760437, 0.5019867420196533, 0.137...","[0.013329126872122288, 0.03107333928346634, 0....","[0.16706672310829163, 0.4529421329498291, 0.04..."
532,40,fa-acc_maps-l,II,"[0.525382399559021, 0.8547845482826233, 0.8436...","[0.16875597834587097, 0.5971121788024902, 0.36...","[0.14129538834095, 0.022355539724230766, 0.373...","[0.43036219477653503, 0.03346829488873482, 0.1..."


**We calculate**
 * are images getting closer to the prompt attributes they were generated with? (both original, adversarial). Avg score and accuracy (between models)
 * are images closer to the prompt attributes they were generated with compared to the wrong attributes? Avg difference and accuracy

In [9]:
scores = score_df[(score_df['modelname'] == TO_EVALUATE[0]) & (score_df['resolution'] == 'I')]
display(scores['orig_graph_orig_img'])
display(scores['orig_graph_adv_img'])

0     [0.9110567569732666, 0.2085813581943512, 0.846...
1     [0.6436792016029358, 0.3527820408344269, 0.318...
2     [0.22231510281562805, 0.1891142725944519, 0.24...
3     [0.5530276298522949, 0.010326573625206947, 0.8...
4     [0.03150453418493271, 0.03358066454529762, 0.0...
                            ...                        
84    [0.2383803129196167, 0.20662599802017212, 0.21...
85    [0.24958015978336334, 0.11756513267755508, 0.4...
86    [0.16981592774391174, 0.3523048162460327, 0.21...
87    [0.7564706802368164, 0.7364174127578735, 0.852...
88    [0.3262401223182678, 0.2709515690803528, 0.351...
Name: orig_graph_orig_img, Length: 89, dtype: object

0     [0.41327694058418274, 0.3210293650627136, 0.05...
1     [0.045505091547966, 0.00927270669490099, 0.008...
2     [0.02679286152124405, 0.06348323076963425, 0.0...
3     [0.25072401762008667, 0.006587476469576359, 0....
4     [0.041861191391944885, 0.053201813250780106, 0...
                            ...                        
84    [0.0036239782348275185, 0.004145706072449684, ...
85    [0.28286927938461304, 0.2240184098482132, 0.02...
86    [0.050397131592035294, 0.1707591414451599, 0.1...
87    [0.7353438138961792, 0.277669221162796, 0.1910...
88    [0.2597760856151581, 0.39673274755477905, 0.35...
Name: orig_graph_adv_img, Length: 89, dtype: object

In [24]:
# using the scoredf, we can now plot the results
evaluation_results = {}

def flatten(name, substitute_scores=None):
    if substitute_scores is not None:
        return np.array(list(substitute_scores[name].values)).reshape(-1)
    return np.array(list(scores[name].values)).reshape(-1)

for modelname in TO_EVALUATE:
    for resolution in ['I', 'II']:
        scores =          score_df[(score_df['modelname'] == modelname) & (score_df['resolution'] == resolution)]
        scores_baseline = score_df[(score_df['modelname'] == 'base-l') & (score_df['resolution'] == resolution)]
        print('Modelname: {}, resolution: {}'.format(modelname, resolution))
        avg_score_origimg_to_origprompt = flatten('orig_graph_orig_img').mean()
        
        avg_score_advimg_to_advprompt = flatten('adv_graph_adv_img').mean()

        # for the origimage: for each image, subtract score origprompt - score advprompt. Then take the mean. Take care to always subtract the same index

        
        avg_promptdiff_origimg = flatten('orig_graph_orig_img') - flatten('adv_graph_orig_img')
        acc_promptdiff_origimg = (avg_promptdiff_origimg > 0).mean()
        avg_promptdiff_origimg = avg_promptdiff_origimg.mean()

        avg_promptdiff_advimg = flatten('adv_graph_adv_img') - flatten('orig_graph_adv_img')
        acc_promptdiff_advimg = (avg_promptdiff_advimg > 0).mean()
        avg_promptdiff_advimg = avg_promptdiff_advimg.mean()

        acc_origprompt_rightimgbetter = (flatten('orig_graph_orig_img')>flatten('orig_graph_adv_img')).mean()
        acc_advprompt_rightimgbetter = (flatten('adv_graph_adv_img')>flatten('adv_graph_orig_img')).mean()

        acc_to_baseline_origimg = (flatten('orig_graph_orig_img')>flatten('orig_graph_orig_img', scores_baseline)).mean()
        acc_to_baseline_advimg = (flatten('adv_graph_adv_img')>flatten('adv_graph_adv_img', scores_baseline)).mean()

        evaluation_results[(modelname, resolution)] = {
            'avg_score_origimg_to_origprompt': avg_score_origimg_to_origprompt,
            'avg_score_advimg_to_advprompt': avg_score_advimg_to_advprompt,
            'avg_prompt_diff_origimg': avg_promptdiff_origimg,
            'avg_prompt_diff_advimg': avg_promptdiff_advimg,
            'acc_rightpromptbetter_origimg': acc_promptdiff_origimg,
            'acc_rightpromptbetter_advimg': acc_promptdiff_advimg,
            'acc_origprompt_rightimgbetter': acc_origprompt_rightimgbetter,
            'acc_advprompt_rightimgbetter': acc_advprompt_rightimgbetter,
            'acc_betterthan_baseline_origimg': acc_to_baseline_origimg,
            'acc_betterthan_baseline_advimg': acc_to_baseline_advimg,
        }
results = pd.DataFrame(evaluation_results).T

Modelname: fa_maps-l, resolution: I
Modelname: fa_maps-l, resolution: II
Modelname: base-l, resolution: I
Modelname: base-l, resolution: II
Modelname: fa-acc_maps-l, resolution: I
Modelname: fa-acc_maps-l, resolution: II


In [26]:
display(results)
# save the results
results.to_csv(os.path.join(EVALUATION_PATH, 'stablediffusion_results.csv'))
print('Saved results to {}'.format(os.path.join(EVALUATION_PATH, 'stablediffusion_results.csv')))


Unnamed: 0,Unnamed: 1,avg_score_origimg_to_origprompt,avg_score_advimg_to_advprompt,avg_prompt_diff_origimg,avg_prompt_diff_advimg,acc_rightpromptbetter_origimg,acc_rightpromptbetter_advimg,acc_origprompt_rightimgbetter,acc_advprompt_rightimgbetter,acc_betterthan_baseline_origimg,acc_betterthan_baseline_advimg
fa_maps-l,I,0.417946,0.365815,0.23328,0.134635,0.795506,0.624719,0.759551,0.777528,0.521348,0.541573
fa_maps-l,II,0.417944,0.367578,0.237425,0.134765,0.8,0.635955,0.759551,0.764045,0.534831,0.568539
base-l,I,0.408328,0.341877,0.206241,0.085719,0.766292,0.559551,0.741573,0.725843,0.0,0.0
base-l,II,0.405703,0.342797,0.202878,0.083703,0.741573,0.579775,0.737079,0.692135,0.0,0.0
fa-acc_maps-l,I,0.408155,0.357213,0.227327,0.116546,0.773034,0.6,0.757303,0.773034,0.489888,0.521348
fa-acc_maps-l,II,0.409692,0.356338,0.227978,0.12034,0.766292,0.613483,0.716854,0.761798,0.541573,0.550562


Saved results to /local/home/jthomm/GraphCLIP/notebooks/../datasets/evaluations/stablediffusion_results.csv
