In [None]:
import diffusers
from diffusers import AutoencoderKL, DPMSolverMultistepScheduler, StableDiffusionPipeline, UNet2DConditionModel
from diffusers.optimization import get_scheduler
from diffusers.training_utils import EMAModel, compute_dream_and_update_latents, compute_snr
from diffusers.utils import check_min_version, deprecate, is_wandb_available, make_image_grid
from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.torch_utils import is_compiled_module
import torch

In [None]:
# Hugging Face Hub id of the text-to-image checkpoint used for generation.
model_id = "stabilityai/stable-diffusion-2-1"

# Load the pipeline weights in half precision.
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
)
# Replace the default scheduler with DPM-Solver multistep, built from the
# configuration of the scheduler the pipeline was loaded with.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
# Run generation on the GPU.
pipe = pipe.to("cuda")


In [None]:
# prompts = ["gift", "horse", "jeep", "shaking hands", "oxygen mask", "pay phone"]

# for prompt in prompts:
#     image = pipe(prompt).images[0]
#     image.show()

In [None]:
from urllib.request import urlopen
from PIL import Image
import timm

# Fetch the sample image. The response is used as a context manager so the
# connection is closed deterministically; load() forces the pixel data to be
# read while the response is still open, since Image.open() may defer decoding.
with urlopen(
    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
) as response:
    img = Image.open(response)
    img.load()

# Pretrained image classifier, switched to eval mode for inference.
model = timm.create_model('vit_so150m2_patch16_reg1_gap_256.sbb_e200_in12k_ft_in1k', pretrained=True)
model = model.eval()

# get model specific transforms (normalization, resize)
data_config = timm.data.resolve_model_data_config(model)
transforms = timm.data.create_transform(**data_config, is_training=False)

# Inference only: disable autograd so no graph is built for the forward pass.
with torch.no_grad():
    output = model(transforms(img).unsqueeze(0))  # unsqueeze single image into batch of 1

# Top-5 class scores as percentages, with their class indices.
top5_probabilities, top5_class_indices = torch.topk(output.softmax(dim=1) * 100, k=5)


In [None]:
from collections import defaultdict

In [None]:
# Text prompts to generate from, and the classifier index expected for each.
# NOTE(review): the indices are presumably ImageNet-1k class ids matching the
# `in1k` classifier loaded above — confirm against the label map.
prompts = ["goldfish", "jeep", "oxygen mask", "pay phone"]
true_idxs = [1, 609, 691, 707]
# zip() silently truncates to the shorter list, so fail loudly on a mismatch.
assert len(prompts) == len(true_idxs), "each prompt needs exactly one target class index"

# prompt -> list of (image, top5_probabilities, top5_class_indices), one per trial
images_and_results = defaultdict(list)
# prompt -> list of booleans, True when the top-1 prediction equals the target
accuracies = defaultdict(list)
num_trials = 10
pipe.set_progress_bar_config(disable=True)

# Seeded generator so the sampled images (and therefore the accuracies) are
# repeatable across runs. It is shared by all calls, so every trial still draws
# different noise.
generator = torch.Generator(device="cpu").manual_seed(0)

for prompt, true_idx in zip(prompts, true_idxs):
    for trial in range(num_trials):
        image = pipe(prompt, generator=generator).images[0]
        # image.show()

        # Classify without autograd: the top-k tensors are stored below, and with
        # gradients enabled each one would keep its whole forward graph alive.
        with torch.no_grad():
            model_output = model(transforms(image).unsqueeze(0))
            top5_probabilities, top5_class_indices = torch.topk(model_output.softmax(dim=1) * 100, k=5)

        # Batch of one image: row 0, column 0 is the highest-scoring class.
        top_pred = top5_class_indices[0, 0].item()
        accuracies[prompt].append(true_idx == top_pred)
        images_and_results[prompt].append((image, top5_probabilities, top5_class_indices))

# Fraction of trials per prompt whose top-1 prediction matched the target.
for c in accuracies:
    print(f"Accuracy for class {c} = {sum(accuracies[c]) / len(accuracies[c])}")

In [None]:
# Report, per prompt, the share of trials whose top-1 prediction was correct.
for c, outcomes in accuracies.items():
    hit_rate = sum(outcomes) / len(outcomes)
    print(f"Accuracy for class {c} = {hit_rate}")

In [None]:
# Show every generated image, grouped under the prompt that produced it.
for p, results in images_and_results.items():
    # Label the group with the key being iterated (`p`), not the global `prompt`,
    # which would print the same text for every group.
    print(f"Images for {p}")

    # Each entry is (image, top5_probabilities, top5_class_indices); only the
    # image is displayed here.
    for result in results:
        result[0].show()

In [None]:
import matplotlib.pyplot as plt 

# Hard-coded per-class data, aligned by position with `labels`.
# NOTE(review): the source of the n-gram frequencies is not shown here — confirm.
# NOTE(review): 0.875 and 0.525 are not multiples of 1/10, so these accuracies
# cannot come from a run with num_trials = 10 — confirm which run produced them.
frequencies = [145486, 3228439, 10616, 7136]
accs = [1.0, 1.0, 0.875, 0.525]
labels = ['goldfish', 'jeep', 'oxygen mask', 'pay phone']

# Order the points by frequency (high -> low, matching the inverted x-axis) so
# the connecting line runs monotonically instead of doubling back on itself.
points = sorted(zip(frequencies, accs, labels), key=lambda point: point[0], reverse=True)
sorted_frequencies = [point[0] for point in points]
sorted_accs = [point[1] for point in points]

fig, ax = plt.subplots()
ax.plot(sorted_frequencies, sorted_accs, marker='o', linewidth=2)  # numeric x- and y-axes implied
ax.set_title('Classifier Accuracy vs. n-Gram Frequency')
ax.set_xlabel('n-Gram Frequency')
ax.set_ylabel('Classifier Accuracy')
ax.set_xscale('log')
ax.invert_xaxis()  # most frequent class on the left, rarest on the right
for xi, yi, lab in points:
    ax.annotate(lab, (xi, yi), xytext=(5, 5), textcoords='offset points')

plt.tight_layout()
plt.show()
