In [1]:
!nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader

NVIDIA A100-PCIE-40GB, 40960 MiB, 19744 MiB
NVIDIA A100-PCIE-40GB, 40960 MiB, 28478 MiB
NVIDIA A100-PCIE-40GB, 40960 MiB, 19124 MiB


In [1]:
import torch
import os
from diffusers import StableDiffusionPipeline
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer

SEED = None
DEVICE = torch.device("cuda:1") if torch.cuda.is_available() else torch.device("cpu")

pretrained_model_name_or_path = "/root/autodl-tmp/stable_diffusion/stable-diffusion-v1-5"
learned_embeds_path = "/root/autodl-tmp/textual_inversion/trained_embeddings/custom_cat/original/learned_embeds.bin"
all_embedding_path = os.path.dirname(learned_embeds_path)
dataset_path, _ = os.path.split(all_embedding_path)
_, dataset_name = os.path.split(dataset_path)

tokenizer = CLIPTokenizer.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="tokenizer",
)
text_encoder = CLIPTextModel.from_pretrained(
    pretrained_model_name_or_path, 
    subfolder="text_encoder",
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
loaded_learned_embeds = torch.load(learned_embeds_path, map_location="cpu")
  
# separate token and the embeds
trained_token = list(loaded_learned_embeds.keys())[0]
embeds = loaded_learned_embeds[trained_token]
print("placeholder token for dataset {}:".format(dataset_name), trained_token)

# cast to dtype of text_encoder
dtype = text_encoder.get_input_embeddings().weight.dtype
embeds.to(dtype)

vocab_num = len(tokenizer)
vocab_embedding = text_encoder.get_input_embeddings().weight.data

placeholder token for dataset custom_cat: <custom_cat>


In [19]:
special_token = "pineapple"
special_id = tokenizer.convert_tokens_to_ids(special_token)
special_embedding = text_encoder.get_input_embeddings().weight.data[special_id]

In [20]:
# calculate vector product
vocab_dist = torch.mm(vocab_embedding, special_embedding.unsqueeze(1)).squeeze(1)

In [21]:
vocab_min, vocab_min_ids = torch.topk(vocab_dist, 5, 0, False)
pseudo_word = tokenizer.convert_ids_to_tokens(vocab_min_ids)
print(pseudo_word)

['rockstars</w>', 'plasticpollution</w>', 'piccadilly</w>', 'whitfield</w>', 'northernlights</w>']


In [15]:
print(tokenizer.convert_tokens_to_ids("banana"))

49407


In [4]:
# calculate Lp distance
embeds_matrix = special_embedding.unsqueeze(0).expand(vocab_num, special_embedding.shape[0]) # embeds
residual_matrix = embeds_matrix - vocab_embedding
vocab_dist = torch.norm(residual_matrix, 2, 1)

In [10]:
# calculate cosine similarity
embeds_matrix = embeds.unsqueeze(0).expand(vocab_num, embeds.shape[0])
vocab_dist = torch.cosine_similarity(embeds_matrix, vocab_embedding, 0)