# Stable Diffusion v1 tokenizer and embedding

This notebook examines tokens and embeddings used in Stable Diffusion v1.

Tutorials, prompts and resources at https://stable-diffusion-art.com

Modified from the [Interacting with CLIP notebook](https://colab.research.google.com/github/openai/clip/blob/master/notebooks/Interacting_with_CLIP.ipynb).

# Setup


In [None]:
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

import numpy as np
import torch
from pkg_resources import packaging
print("Torch version:", torch.__version__)

import clip
print('Available models:')
print(clip.available_models())

model, preprocess = clip.load("ViT-L/14") # used by stable diffusion v1
model.cuda().eval()
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)


Collecting ftfy
  Downloading ftfy-6.1.3-py3-none-any.whl (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m51.2/53.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting wcwidth<0.3.0,>=0.2.12 (from ftfy)
  Downloading wcwidth-0.2.12-py2.py3-none-any.whl (34 kB)
Installing collected packages: wcwidth, ftfy
  Attempting uninstall: wcwidth
    Found existing installation: wcwidth 0.2.10
    Uninstalling wcwidth-0.2.10:
      Successfully uninstalled wcwidth-0.2.10
Successfully installed ftfy-6.1.3 wcwidth-0.2.12


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-v5rzg3fj
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-v5rzg3fj
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369500 sha256=154c892adef7967ec9cdece9f8650d9e8f40286102e9a87e138fba37457581df
  Stored in directory: /tmp/pip-ephem-wheel-cache-p9jy548o/wheels/da/2b/4c/d6691fa9597aac8bb85d2ac13b112deb897d5b50f5ad9a37e4
Successfully built clip
Installing collected packages: clip
Successfully installed clip-1.0
Torch version: 2.1.0+cu118
Available models:
['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/1

100%|███████████████████████████████████████| 890M/890M [00:12<00:00, 76.7MiB/s]


Model parameters: 427,616,513
Input resolution: 224
Context length: 77
Vocab size: 49408


# Token and embedding
Modify the prompt in the cell below to see its tokens and embeddings.

In [None]:
# modify prompt to check tokens
prompt = "Photo of a cat"

# Turn the prompt into a tensor of token ids.
tokens = clip.tokenize(prompt)

# Send the token ids to whichever device holds the model's weights instead of
# assuming CUDA, so the lookup works wherever the model was loaded.
model_device = next(model.parameters()).device
with torch.no_grad():  # pure lookup; no gradients needed
    embeddings = model.token_embedding(tokens.to(model_device)).float()

print("text tokens:")
print(tokens)
print("text tokens size:", tokens.shape)
print("Embeddings size:", embeddings.shape)

text tokens:
tensor([[49406,  1125,   539,   320,  2368, 49407,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]], dtype=torch.int32)
text tokens size: torch.Size([1, 77])
Embeddings size: torch.Size([1, 77, 768])
