# Evaluate Stable Diffusion Images

## Imports and load data

In [1]:
import os
import pandas as pd
import relational_image_generation_evaluation as rige
ROOT_PATH = os.path.join(os.getcwd(), '..')
EVALUATION_PATH = os.path.join(ROOT_PATH, 'datasets', 'evaluations', 'CC-500')
TO_EVALUATE = ['fa_maps-l', 'base-l', 'fa-acc_maps-l']
CC500 = True

In [14]:
import relational_image_generation_evaluation as rige
if not CC500:
    adv_dataset = rige.get_adversarial_attribute_dataset()
    orig_prompts, adv_prompts = rige.get_adv_prompt_list('attributes')
    for prompt, graph_dict in zip(orig_prompts, adv_dataset):
        assert prompt.split(' ')[0] in graph_dict['original_graph'].caption
    index_to_graph_dict = {i: graph_dict for i, graph_dict in enumerate(adv_dataset)}
else:
    adv_dataloader = rige.get_cc500_graph_dataloader()
    adv_dataset = adv_dataloader.dataset
    index_to_graph_dict = {i: graph_dict for i, graph_dict in enumerate(adv_dataset)}
    # filter out graphs with names or predicates not in rige.FILTERED_OBJECTS, rige.FILTERED_ATTRIBUTES, rige.FILTERED_RELATIONSHIPS
    for i in list(index_to_graph_dict.keys()):
        graph = index_to_graph_dict[i]
        if not all([ graph.nodes[n]['name'] in rige.FILTERED_OBJECTS for n in graph.nodes]):
            del index_to_graph_dict[i]
            continue
        if not all([ graph.edges[e]['predicate'] in rige.FILTERED_RELATIONSHIPS for e in graph.edges]):
            del index_to_graph_dict[i]
            continue
        if not all([ graph.nodes[n]['attributes'][i] in rige.FILTERED_ATTRIBUTES for n in graph.nodes for i in range(len(graph.nodes[n]['attributes']))]):
            del index_to_graph_dict[i]
            continue

    
    print(f'Filtered out {len(adv_dataset) - len(index_to_graph_dict)} graphs')
    print(f'Number of graphs: {len(index_to_graph_dict)}')
    first_graph = index_to_graph_dict[0]
    fg_node_names = [first_graph.nodes[n]['name'] for n in first_graph.nodes]
    fg_node_attributes = [first_graph.nodes[n]['attributes'] for n in first_graph.nodes]
    fg_edge_predicates = [first_graph.edges[e]['predicate'] for e in first_graph.edges]
    print(f'The first graph has nodes: {fg_node_names}, node attributes: {fg_node_attributes}, and edge predicates: {fg_edge_predicates}')

Using cached filtered graphs
Filtered out 254 graphs
Number of graphs: 349
The first graph has nodes: ['bench', 'car'], node attributes: [['green'], ['red']], and edge predicates: ['and', 'and']


In [16]:
def get_stablediffusion_images(image_folder_path, modelname):
    images = []
    for image_path in os.listdir(image_folder_path):
        if not image_path.endswith('.png'):
            continue
        # the image name has the form: index-og/adv_seed_I/II.png. We extract the index, og/adv, seed and I/II
        index = int(image_path.split('-')[0])
        og_adv = image_path.split('-')[1].split('_')[0]
        seed = image_path.split('_')[1]
        resolution = image_path.split('_')[2].split('.')[0]
        image_path = os.path.join(image_folder_path, image_path)
        # add the image to the dataframe
        images.append({
            'index': index, 
            'og_adv': og_adv, 
            'seed': seed, 
            'resolution': resolution, 
            'modelname': modelname, 
            'image_path': image_path,
            'original_graph': index_to_graph_dict[(index)]['original_graph'],
            'adv_graph': index_to_graph_dict[(index)]['adv_graph'],
        })
    print(len(images))
    return images
def get_stablediffusion_images_cc500(image_folder_path, modelname):
    images = []
    for image_path in os.listdir(image_folder_path):
        if not image_path.endswith('.png'):
            continue
        # the image name has the form: index_seed_I/II.png. We extract the index, seed and I/II
        index = int(image_path.split('_')[0])
        if index not in index_to_graph_dict:
            continue
        seed = image_path.split('_')[1]
        resolution = image_path.split('_')[2].split('.')[0]
        image_path = os.path.join(image_folder_path, image_path)
        # add the image to the dataframe
        images.append({
            'index': index, 
            'seed': seed, 
            'resolution': resolution, 
            'modelname': modelname, 
            'image_path': image_path,
            'graph': index_to_graph_dict[(index)],
        })
    print(len(images))
    return images
all_images = []
for foldername in TO_EVALUATE:
    path = os.path.join(EVALUATION_PATH, foldername)
    if CC500:
        dictlist = get_stablediffusion_images_cc500(path, foldername)
    else:
        dictlist = get_stablediffusion_images(path, foldername)
    all_images += dictlist
# convert the dictlist to a dataframe
all_images = pd.DataFrame(all_images)
display(all_images)
print(all_images.shape)
# print the type of the original graph in the first row
if not CC500:
    print(type(all_images['original_graph'][0]))
else:
    print(type(all_images['graph'][0]))

3110
3110
3110


Unnamed: 0,index,seed,resolution,modelname,image_path,graph
0,202,7122,II,fa_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(1, 2)"
1,28,48907,II,fa_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(1, 2)"
2,90,68319,II,fa_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(1, 2)"
3,89,8255,II,fa_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(1, 2)"
4,246,68245,II,fa_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(1, 2)"
...,...,...,...,...,...,...
9325,51,54235,II,fa-acc_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(1, 2)"
9326,350,47153,I,fa-acc_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(1, 2)"
9327,431,52383,I,fa-acc_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(1, 2)"
9328,52,14592,I,fa-acc_maps-l,/local/home/jthomm/GraphCLIP/notebooks/../data...,"(1, 2)"


(9330, 6)
<class 'networkx.classes.digraph.DiGraph'>


## Evaluation Method
We evaluate with the following method: We take the original graph and check if the original prompt image resembles it better than the adversarial prompt image.

That means, for each original prompt (i.e. for each index), we check the two images. We do this first only for low resolution and for each modelname

In [17]:
# get all indexes in the dataframe
indexes = all_images['index'].unique()
print('Evaluation on {} indexes'.format(len(indexes)))
print("max index: {}".format(max(indexes)))
print("min index: {}".format(min(indexes)))
print("missing indexes from 0 to inclusive 100: {}".format(set(range(101)) - set(indexes)))

Evaluation on 311 indexes
max index: 536
min index: 0
missing indexes from 0 to inclusive 100: {40, 41}


In [18]:
from PIL import Image
import numpy as np
def get_image_score(evaluator, graph, image_paths):
    # load the image
    images = []
    for image_path in image_paths:
        image = Image.open(image_path)
        images.append(image)
    # evaluate the image
    scores = evaluator(images, [graph for _ in range(len(images))])['attr_scores']
    # get the mean score
    # score = np.mean(scores)
    return scores

def get_scores(evaluator, modelname, resolution):
    # get all images with the given modelname and resolution
    images = all_images[(all_images['modelname'] == modelname) & (all_images['resolution'] == resolution)]
    # get all indexes
    indexes = images['index'].unique()
    # get the accuracy for each index
    scores = []
    for index in indexes:
        # get the original graph
        if not CC500:
            original_graph = images[images['index'] == index]['original_graph']
            assert len(set(g.caption for g in original_graph)) == 1
            original_graph = original_graph.iloc[0]
            adv_graph = images[images['index'] == index]['adv_graph'].iloc[0]
            # get the image paths, with different seeds
            og_image_paths = images[(images['index'] == index) & (images['og_adv'] == 'og')]['image_path'].values
            adv_image_paths = images[(images['index'] == index) & (images['og_adv'] == 'adv')]['image_path'].values
            # get the scores
            orig_graph_orig_img = get_image_score(evaluator, original_graph, og_image_paths)
            adv_graph_orig_img = get_image_score(evaluator, adv_graph, og_image_paths)
            orig_graph_adv_img = get_image_score(evaluator, original_graph, adv_image_paths)
            adv_graph_adv_img = get_image_score(evaluator, adv_graph, adv_image_paths)
            # get the accuracy
            scores.append({
                'index': index,
                'modelname': modelname,
                'resolution': resolution,
                'orig_graph_orig_img': orig_graph_orig_img,
                'orig_graph_adv_img': orig_graph_adv_img,
                'adv_graph_orig_img': adv_graph_orig_img,
                'adv_graph_adv_img': adv_graph_adv_img,
            })
        else:
            graph = images[images['index'] == index]['graph'].iloc[0]
            # get the image paths, with different seeds
            image_paths = images[(images['index'] == index)]['image_path'].values
            # get the scores
            graph_orig_img = get_image_score(evaluator, graph, image_paths)
            # get the accuracy
            scores.append({
                'index': index,
                'modelname': modelname,
                'resolution': resolution,
                'graph_orig_img': graph_orig_img,
            })
    return scores

In [19]:
local_weights_path = '/local/home/jthomm/GraphCLIP/experiments/2023-06-24/vision_transformer_8/model_epoch-v9.ckpt'
evaluator = rige.Evaluator('ViT-L/14', model_weights_path=local_weights_path)

Using device cuda:2 for evaluation.
Using text embeddings as input to the model.


In [20]:
score_dict = []
for modelname in TO_EVALUATE:
    for resolution in ['I', 'II']:
        scores = get_scores(evaluator, modelname, resolution)
        score_dict += scores
        print('Done with {} {}'.format(modelname, resolution))
# convert the dict to a dataframe
score_df = pd.DataFrame(score_dict)

Done with fa_maps-l I
Done with fa_maps-l II
Done with base-l I
Done with base-l II
Done with fa-acc_maps-l I
Done with fa-acc_maps-l II


In [21]:
# save the dataframe
NAME = 'stablediffusion_scores_cc500_datacomp'
display(score_df)
score_df.to_csv(os.path.join(EVALUATION_PATH, f'{NAME}.csv'), index=False)

Unnamed: 0,index,modelname,resolution,graph_orig_img
0,166,fa_maps-l,I,"[0.5008031725883484, 0.682695209980011, 0.5265..."
1,185,fa_maps-l,I,"[0.41138994693756104, 0.09675640612840652, 0.0..."
2,33,fa_maps-l,I,"[0.14894235134124756, 0.5421763062477112, 0.14..."
3,145,fa_maps-l,I,"[0.22690221667289734, 0.1112748384475708, 0.08..."
4,404,fa_maps-l,I,"[0.897138237953186, 0.06140827387571335, 0.516..."
...,...,...,...,...
1861,411,fa-acc_maps-l,II,"[0.3230156898498535, 0.3562612533569336, 0.448..."
1862,63,fa-acc_maps-l,II,"[0.2874911427497864, 0.388953298330307, 0.4739..."
1863,70,fa-acc_maps-l,II,"[0.4611344635486603, 0.446847140789032, 0.4353..."
1864,72,fa-acc_maps-l,II,"[0.46789029240608215, 0.4461095631122589, 0.38..."


**We calculate**
 * are images getting closer to the prompt attributes they were generated with? (both original, adversarial). Avg score and accuracy (between models)
 * are images closer to the prompt attributes they were generated with compared to the wrong attributes? Avg difference and accuracy

In [23]:
scores = score_df[(score_df['modelname'] == TO_EVALUATE[0]) & (score_df['resolution'] == 'I')]
if not CC500:
    display(scores['orig_graph_orig_img'])
    display(scores['orig_graph_adv_img'])
else:
    display(scores['graph_orig_img'])

0      [0.5008031725883484, 0.682695209980011, 0.5265...
1      [0.41138994693756104, 0.09675640612840652, 0.0...
2      [0.14894235134124756, 0.5421763062477112, 0.14...
3      [0.22690221667289734, 0.1112748384475708, 0.08...
4      [0.897138237953186, 0.06140827387571335, 0.516...
                             ...                        
306    [0.35735902190208435, 0.3852696716785431, 0.45...
307    [0.8107014298439026, 0.4438059329986572, 0.111...
308    [0.47578561305999756, 0.4866176247596741, 0.44...
309    [0.13153913617134094, 0.4637344479560852, 0.02...
310    [0.6270792484283447, 0.8730216026306152, 0.495...
Name: graph_orig_img, Length: 311, dtype: object

In [24]:
# using the scoredf, we can now plot the results
evaluation_results = {}

def flatten(name, substitute_scores=None):
    if substitute_scores is not None:
        return np.array(list(substitute_scores[name].values)).reshape(-1)
    return np.array(list(scores[name].values)).reshape(-1)

for modelname in TO_EVALUATE:
    for resolution in ['I', 'II']:
        if not CC500:
            scores =          score_df[(score_df['modelname'] == modelname) & (score_df['resolution'] == resolution)]
            scores_baseline = score_df[(score_df['modelname'] == 'base-l') & (score_df['resolution'] == resolution)]
            print('Modelname: {}, resolution: {}'.format(modelname, resolution))
            avg_score_origimg_to_origprompt = flatten('orig_graph_orig_img').mean()
            
            avg_score_advimg_to_advprompt = flatten('adv_graph_adv_img').mean()

            # for the origimage: for each image, subtract score origprompt - score advprompt. Then take the mean. Take care to always subtract the same index

            
            avg_promptdiff_origimg = flatten('orig_graph_orig_img') - flatten('adv_graph_orig_img')
            acc_promptdiff_origimg = (avg_promptdiff_origimg > 0).mean()
            avg_promptdiff_origimg = avg_promptdiff_origimg.mean()

            avg_promptdiff_advimg = flatten('adv_graph_adv_img') - flatten('orig_graph_adv_img')
            acc_promptdiff_advimg = (avg_promptdiff_advimg > 0).mean()
            avg_promptdiff_advimg = avg_promptdiff_advimg.mean()

            acc_origprompt_rightimgbetter = (flatten('orig_graph_orig_img')>flatten('orig_graph_adv_img')).mean()
            acc_advprompt_rightimgbetter = (flatten('adv_graph_adv_img')>flatten('adv_graph_orig_img')).mean()

            acc_to_baseline_origimg = (flatten('orig_graph_orig_img')>flatten('orig_graph_orig_img', scores_baseline)).mean()
            acc_to_baseline_advimg = (flatten('adv_graph_adv_img')>flatten('adv_graph_adv_img', scores_baseline)).mean()

            evaluation_results[(modelname, resolution)] = {
                'avg_score_origimg_to_origprompt': avg_score_origimg_to_origprompt,
                'avg_score_advimg_to_advprompt': avg_score_advimg_to_advprompt,
                'avg_prompt_diff_origimg': avg_promptdiff_origimg,
                'avg_prompt_diff_advimg': avg_promptdiff_advimg,
                'acc_rightpromptbetter_origimg': acc_promptdiff_origimg,
                'acc_rightpromptbetter_advimg': acc_promptdiff_advimg,
                'acc_origprompt_rightimgbetter': acc_origprompt_rightimgbetter,
                'acc_advprompt_rightimgbetter': acc_advprompt_rightimgbetter,
                'acc_betterthan_baseline_origimg': acc_to_baseline_origimg,
                'acc_betterthan_baseline_advimg': acc_to_baseline_advimg,
            }
        else:
            scores =          score_df[(score_df['modelname'] == modelname) & (score_df['resolution'] == resolution)]
            scores_baseline = score_df[(score_df['modelname'] == 'base-l') & (score_df['resolution'] == resolution)]
            print('Modelname: {}, resolution: {}'.format(modelname, resolution))
            avg_score_img_to_prompt = flatten('graph_orig_img').mean()

            # for the origimage: for each image, subtract score origprompt - score advprompt. Then take the mean. Take care to always subtract the same index

            acc_to_baseline_origimg = (flatten('graph_orig_img')>flatten('graph_orig_img', scores_baseline)).mean()

            evaluation_results[(modelname, resolution)] = {
                'avg_score_img_to_prompt': avg_score_img_to_prompt,
                'acc_betterthan_baseline_origimg': acc_to_baseline_origimg,
            }
results = pd.DataFrame(evaluation_results).T

Modelname: fa_maps-l, resolution: I
Modelname: fa_maps-l, resolution: II
Modelname: base-l, resolution: I
Modelname: base-l, resolution: II
Modelname: fa-acc_maps-l, resolution: I
Modelname: fa-acc_maps-l, resolution: II


In [25]:
display(results)
# save the results
results.round(2).to_csv(os.path.join(EVALUATION_PATH, f'{NAME}.csv'))
print('Saved results to {}'.format(os.path.join(EVALUATION_PATH, f'{NAME}.csv')))


Unnamed: 0,Unnamed: 1,avg_score_img_to_prompt,acc_betterthan_baseline_origimg
fa_maps-l,I,0.451312,0.572347
fa_maps-l,II,0.437969,0.565273
base-l,I,0.408596,0.0
base-l,II,0.399803,0.0
fa-acc_maps-l,I,0.42829,0.571704
fa-acc_maps-l,II,0.415928,0.552412


Saved results to /local/home/jthomm/GraphCLIP/notebooks/../datasets/evaluations/CC-500/stablediffusion_scores_cc500_datacomp.csv


In [12]:
# load the results
results = pd.read_csv(os.path.join(EVALUATION_PATH, f'stablediffusion_results_adv_attr.csv'), index_col=0)
display(results)

Unnamed: 0,Unnamed: 1,avg_score_origimg_to_origprompt,avg_score_advimg_to_advprompt,avg_prompt_diff_origimg,avg_prompt_diff_advimg,acc_rightpromptbetter_origimg,acc_rightpromptbetter_advimg,acc_origprompt_rightimgbetter,acc_advprompt_rightimgbetter,acc_betterthan_baseline_origimg,acc_betterthan_baseline_advimg
fa_maps-l,I,0.417946,0.365815,0.23328,0.134635,0.795506,0.624719,0.759551,0.777528,0.521348,0.541573
fa_maps-l,II,0.417944,0.367578,0.237425,0.134765,0.8,0.635955,0.759551,0.764045,0.534831,0.568539
base-l,I,0.408328,0.341877,0.206241,0.085719,0.766292,0.559551,0.741573,0.725843,0.0,0.0
base-l,II,0.405703,0.342797,0.202878,0.083703,0.741573,0.579775,0.737079,0.692135,0.0,0.0
fa-acc_maps-l,I,0.408155,0.357213,0.227327,0.116546,0.773034,0.6,0.757303,0.773034,0.489888,0.521348
fa-acc_maps-l,II,0.409692,0.356338,0.227978,0.12034,0.766292,0.613483,0.716854,0.761798,0.541573,0.550562
