In [None]:
from google.colab import drive
drive.mount('/content/drive')

import sys
sys.path.append('/content/drive/MyDrive/cs229/sd_data/')
%cd drive/MyDrive/cs229/sd_data

In [None]:
#@title Import required libraries
%%capture
!pip install -qq diffusers==0.4.1 transformers ftfy
!pip install -qq "ipywidgets>=7,<8"

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login; presumably needed so the gated
# Stable Diffusion weights can be downloaded later — TODO confirm.
notebook_login()

#@title Setup
import os
import torch

import PIL
from PIL import Image

from diffusers import StableDiffusionPipeline
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer

def image_grid(imgs, rows, cols):
    """Tile images into a single `rows` x `cols` grid image.

    The cell size is taken from the first image, and images are placed in
    row-major order (left to right, then top to bottom).

    Args:
        imgs: Sequence of PIL images; must contain exactly `rows * cols` items.
        rows: Number of grid rows.
        cols: Number of grid columns.

    Returns:
        A new RGB image of size `(cols * w, rows * h)`, where `(w, h)` is the
        size of `imgs[0]`.

    Raises:
        AssertionError: If `len(imgs) != rows * cols`.
    """
    assert len(imgs) == rows*cols, (
        f"expected rows*cols = {rows*cols} images, got {len(imgs)}"
    )

    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols*w, rows*h))

    for i, img in enumerate(imgs):
        # Row-major placement: row index is i // cols, column index is i % cols.
        row, col = divmod(i, cols)
        grid.paste(img, box=(col*w, row*h))
    return grid



from huggingface_hub import HfApi
import requests

# List every Hub model published under the "sd-concepts-library" author.
# NOTE(review): depending on the huggingface_hub version this may be a lazy
# iterable rather than a list — confirm before iterating it more than once.
api = HfApi()
models_list = api.list_models(author="sd-concepts-library")

def get_concept_type(model_obj, timeout=10):
    """Fetch the contents of a concept repo's `type_of_concept.txt`.

    Args:
        model_obj: Object with a `modelId` attribute ("<author>/<name>").
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole loop forever.

    Returns:
        The file's text, or an empty string when the server answers with an
        HTTP error status (e.g. the repo has no such file), so that an error
        page is never mistaken for a concept type.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    concept_url = f"https://huggingface.co/{model_obj.modelId}/raw/main/type_of_concept.txt"
    response = requests.get(concept_url, timeout=timeout)
    if not response.ok:
        return ""
    return response.text

from tqdm import tqdm

# Concept type of every listed model, keyed by the full model id.
model_id_to_type = {
    model.modelId: get_concept_type(model) for model in tqdm(models_list)
}

# Same table re-keyed by concept word: the last '/'-separated part of the id.
word_to_type = {
    full_id.split('/')[-1]: concept_type
    for full_id, concept_type in model_id_to_type.items()
}


import pickle
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
# Presumably maps concept word -> {token: embedding}, judging by how the
# object is indexed by the cells that consume it — TODO confirm.
with open("all_added_emb.pickle", "rb") as f:
    all_added_emb = pickle.load(f)

Style Interpolation

In [None]:
# Stable Diffusion
# Load the tokenizer and the text encoder from the checkpoint's "tokenizer"
# and "text_encoder" subfolders; the encoder weights are requested in float16.
# Both objects are module-level globals that embed_interpolation mutates
# (it adds tokens and resizes the embedding matrix).
pretrained_model_name_or_path = "CompVis/stable-diffusion-v1-4" 
tokenizer = CLIPTokenizer.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="tokenizer",
)
text_encoder = CLIPTextModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="text_encoder", torch_dtype=torch.float16
)

In [None]:
# Keep only the concepts whose type is "style", keyed by their "<word>"
# placeholder; the value is the first entry stored for that concept.
style_embs = {
    f"<{concept}>": list(vectors.values())[0]
    for concept, vectors in all_added_emb.items()
    if word_to_type.get(concept) == "style"
}

In [None]:
# Load Concepts
def load_learned_embed_in_clip(added_embeds, text_encoder, tokenizer, token=None):
  """Register a learned concept embedding as a new token of a text encoder.

  Args:
    added_embeds: Mapping from token string to its embedding (a tensor, or
      anything `torch.tensor` accepts). Only the first entry is used.
    text_encoder: Model exposing `get_input_embeddings()` and
      `resize_token_embeddings()`; its embedding matrix is modified in place.
    tokenizer: Tokenizer exposing `add_tokens()`, `convert_tokens_to_ids()`
      and `len()`; the new token is added to it in place.
    token: Optional name to register the embedding under instead of the key
      found in `added_embeds`.

  Raises:
    ValueError: If the tokenizer already contains the token.
  """
  trained_token = list(added_embeds.keys())[0]
  raw_embeds = added_embeds[trained_token]
  if isinstance(raw_embeds, torch.Tensor):
    # Copy without going through torch.tensor(), which warns when handed a
    # tensor; detach so no autograd history is carried into the weights.
    embeds = raw_embeds.detach().clone()
  else:
    embeds = torch.tensor(raw_embeds)

  # Tensor.to() returns a new tensor, so the result has to be kept.
  dtype = text_encoder.get_input_embeddings().weight.dtype
  embeds = embeds.to(dtype)

  token = token if token is not None else trained_token
  num_added_tokens = tokenizer.add_tokens(token)
  if num_added_tokens == 0:
    raise ValueError(f"The tokenizer already contains the token {token}. Please pass a different `token` that is not already in the tokenizer.")

  # Grow the embedding matrix to cover the new token, then fill in its row.
  text_encoder.resize_token_embeddings(len(tokenizer))

  token_id = tokenizer.convert_tokens_to_ids(token)
  text_encoder.get_input_embeddings().weight.data[token_id] = embeds

In [None]:
def interpolate_linear(v1, v2, alpha):
  """Blend two tensors linearly as `alpha * v1 + (1 - alpha) * v2`.

  `alpha` is meant to lie in [0, 1]: 1 yields `v1`, 0 yields `v2`. Both inputs
  are cloned and detached first and the copies get gradient tracking switched
  on, so the inputs stay untouched while the result requires grad.
  """
  first = v1.clone().detach().requires_grad_(True)
  second = v2.clone().detach().requires_grad_(True)
  return alpha * first + (1 - alpha) * second

In [None]:
def embed_interpolation(embeds, key1, key2, num):
    """Register `num + 1` interpolated embeddings between two concepts.

    For i in 0..num the weight i / num is passed to `interpolate_linear`
    together with `embeds[key1]` and `embeds[key2]`, and the result is loaded
    into the module-level `text_encoder` / `tokenizer` under the token
    "<{key1}-{key2}-{i}/{num}>".

    Returns:
        The list of new token names, in order of increasing weight.
    """
    weights = [step / float(num) for step in range(num + 1)]
    blended = [interpolate_linear(embeds[key1], embeds[key2], w) for w in weights]

    token_names = [f"<{key1}-{key2}-{step}/{num}>" for step in range(len(blended))]
    for name, vector in zip(token_names, blended):
        # Each call adds one token to the shared tokenizer and text encoder.
        load_learned_embed_in_clip({name: vector}, text_encoder, tokenizer)
    return token_names

In [None]:
def gen_images_with_concepts(prompt_first, prompt_second, concepts):
  """Generate one image per concept token using the module-level `pipe`.

  Args:
    prompt_first: Text placed before the concept token.
    prompt_second: Text placed after the concept token.
    concepts: Iterable of concept token strings.

  Returns:
    Flat list with the images produced for each prompt, in concept order.
  """
  all_images = []
  prompts = [prompt_first + c + prompt_second for c in concepts]
  for prompt in prompts:
    output = pipe(prompt, num_images_per_prompt=1, num_inference_steps=50, guidance_scale=7.5)
    # Read the images through the `.images` attribute of the pipeline output;
    # the old `["sample"]` key is a deprecated alias that newer diffusers
    # releases no longer accept.
    all_images.extend(output.images)
  return all_images

In [None]:
def combine_images(images):
  """Concatenate images side by side, left to right, on one RGB canvas.

  The canvas is as wide as all images together and as tall as the tallest
  one; every image is pasted at the top edge (y = 0).
  """
  sizes = [picture.size for picture in images]
  widths, heights = zip(*sizes)

  canvas = Image.new('RGB', (sum(widths), max(heights)))

  cursor = 0
  for picture in images:
    canvas.paste(picture, (cursor, 0))
    # Advance by the width of the image that was just placed.
    cursor += picture.size[0]

  return canvas

In [None]:
# k1 = "<m-geo>"
# k2 = '<kogecha>'
# n = 20

# torch.manual_seed(3407)
# torch.cuda.manual_seed(3407)

# itp_concepts = embed_interpolation(style_embs, k1, k2, n)

# pipe = StableDiffusionPipeline.from_pretrained(
#     pretrained_model_name_or_path,
#     revision="fp16",
#     torch_dtype=torch.float16,
#     text_encoder=text_encoder,
#     tokenizer=tokenizer,
# ).to("cuda")

# prompt1 = "a photo of a bear in the river in the style of "
# prompt2 = ""
# imgs = gen_images_with_concepts(prompt1, prompt2, itp_concepts)
# combine_images(imgs).save(f"{k1}_{k2}.jpg")

In [None]:
# Keep only the concepts whose type is "object", keyed by their "<word>"
# placeholder; the value is the first entry stored for that concept.
object_embs = {
    f"<{concept}>": list(vectors.values())[0]
    for concept, vectors in all_added_emb.items()
    if word_to_type.get(concept) == "object"
}

In [None]:
# Endpoints of the interpolation (looked up as keys of `object_embs`) and the
# number of steps; embed_interpolation creates n + 1 tokens.
k1 = "<cat-toy-1>"
k2 = '<monster-toy>'
n = 20

# Seed the torch RNGs before generation.
torch.manual_seed(3407)
torch.cuda.manual_seed(3407)

# Adds the interpolated tokens to the global `tokenizer` / `text_encoder`.
# NOTE: re-running this cell without reloading those two objects raises
# ValueError, because the tokenizer then already contains the tokens.
itp_concepts = embed_interpolation(object_embs, k1, k2, n)

# Build the pipeline around the extended tokenizer and text encoder.
# gen_images_with_concepts reads this `pipe` as a global.
pipe = StableDiffusionPipeline.from_pretrained(
    pretrained_model_name_or_path,
    revision="fp16",
    torch_dtype=torch.float16,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
).to("cuda")

# Each prompt is prompt1 + <interpolated token> + prompt2.
prompt1 = "a photo of a "
prompt2 = ""
imgs = gen_images_with_concepts(prompt1, prompt2, itp_concepts)
# k1 and k2 include the angle brackets, so the saved file name does too.
combine_images(imgs).save(f"{k1}_{k2}.jpg")