<a href="https://colab.research.google.com/github/BalintKomjati/infinite-loop/blob/main/stable-diffusion-2-infinite-zoom-out/infinite_zoom_out.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@markdown Check type of GPU and VRAM available.   
#@markdown The notebook should work with the Tesla T4 GPU + 16 GB VRAM available in the free colab tier.
!nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader

Tesla T4, 15360 MiB, 15109 MiB


In [2]:
#@markdown Install missing libraries
%pip install -qq transformers scipy ftfy accelerate
%pip install -qq --upgrade diffusers[torch]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 KB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m604.0/604.0 KB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
#@markdown Load libraries
import PIL
from PIL import Image
import requests
from io import BytesIO
import numpy as np
import random
import cv2
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from IPython.display import clear_output

In [4]:
#@markdown Define helper functions
def write_video(file_path, frames, fps, reversed = True):
    """
    Writes frames to an mp4 video file
    :param file_path: Path to output video, must end with .mp4
    :param frames: List of PIL.Image objects
    :param fps: Desired frame rate
    :param reversed: if order of images to be reversed (default = True)
    """
    if reversed == True:
      frames.reverse()

    w, h = frames[0].size
    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
    #fourcc = cv2.VideoWriter_fourcc(*'avc1')
    writer = cv2.VideoWriter(file_path, fourcc, fps, (w, h))

    for frame in frames:
        np_frame = np.array(frame.convert('RGB'))
        cv_frame = cv2.cvtColor(np_frame, cv2.COLOR_RGB2BGR)
        writer.write(cv_frame)

    writer.release() 

def image_grid(imgs, rows, cols):
  assert len(imgs) == rows*cols

  w, h = imgs[0].size
  grid = Image.new('RGB', size=(cols*w, rows*h))
  grid_w, grid_h = grid.size

  for i, img in enumerate(imgs):
      grid.paste(img, box=(i%cols*w, i//cols*h))
  return grid

def shrink_and_paste_on_blank(current_image, STEPSIZE):

  height = current_image.height
  width = current_image.width

  #shrink down by STEPSIZE
  prev_image = current_image.resize((height-2*STEPSIZE,width-2*STEPSIZE))
  prev_image = prev_image.convert("RGBA")
  prev_image = np.array(prev_image)

  #create blank non-transparent image
  blank_image = np.array(current_image.convert("RGBA"))*0
  blank_image[:,:,3] = 1

  #paste shrinked onto blank
  blank_image[STEPSIZE:height-STEPSIZE,STEPSIZE:width-STEPSIZE,:] = prev_image
  prev_image = Image.fromarray(blank_image)

  return prev_image

In [5]:
#@markdown Download and set up diffusion pipeline
model_id = "stabilityai/stable-diffusion-2-inpainting" #@param
pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, revision="fp16")
#pipe.set_use_memory_efficient_attention_xformers(True) #https://huggingface.co/docs/diffusers/optimization/fp16#memory-efficient-attention #couldnt make it run on colab
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to("cuda")
def dummy(images, **kwargs):
    return images, False
pipe.safety_checker = dummy
pipe.enable_attention_slicing() #This is useful to save some memory in exchange for a small speed decrease.

g_cuda = torch.Generator(device='cuda')

Downloading (…)p16/model_index.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/681M [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

Downloading (…)tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading (…)cheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

Downloading (…)_encoder/config.json:   0%|          | 0.00/635 [00:00<?, ?B/s]

Downloading (…)tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

Downloading (…)_pytorch_model.bin";:   0%|          | 0.00/167M [00:00<?, ?B/s]

Downloading (…)_pytorch_model.bin";:   0%|          | 0.00/1.73G [00:00<?, ?B/s]

Downloading (…)5e80/vae/config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

Downloading (…)e80/unet/config.json:   0%|          | 0.00/911 [00:00<?, ?B/s]

In [6]:
import torch
#import logging

## disable warnings
#logging.disable(logging.WARNING)  

## Import the CLIP artifacts 
from transformers import CLIPTextModel, CLIPTokenizer

## Initiating tokenizer and encoder.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16)
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16).to("cuda")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

Some weights of the model checkpoint at openai/clip-vit-large-patch14 were not used when initializing CLIPTextModel: ['vision_model.embeddings.position_ids', 'vision_model.encoder.layers.0.self_attn.k_proj.weight', 'vision_model.encoder.layers.7.self_attn.q_proj.weight', 'vision_model.encoder.layers.6.mlp.fc1.bias', 'vision_model.encoder.layers.16.self_attn.q_proj.weight', 'vision_model.encoder.layers.19.self_attn.q_proj.bias', 'vision_model.encoder.layers.22.self_attn.q_proj.bias', 'vision_model.encoder.layers.3.mlp.fc2.bias', 'vision_model.encoder.layers.3.layer_norm2.weight', 'vision_model.encoder.layers.20.layer_norm1.weight', 'vision_model.encoder.layers.4.layer_norm1.bias', 'vision_model.encoder.layers.11.mlp.fc2.bias', 'vision_model.encoder.layers.1.self_attn.v_proj.bias', 'vision_model.encoder.layers.16.self_attn.out_proj.weight', 'vision_model.encoder.layers.19.mlp.fc1.bias', 'vision_model.encoder.layers.18.mlp.fc2.bias', 'vision_model.encoder.layers.12.layer_norm1.weight', 'v

In [7]:
prompt = ["a dog wearing hat"]
tok =tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt") 
print(tok.input_ids.shape)
tok

torch.Size([1, 77])


{'input_ids': tensor([[49406,   320,  1929,  3309,  3801, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0]])}

In [8]:
emb = text_encoder(tok.input_ids.to("cuda"))[0].half()
print(f"Shape of embedding : {emb.shape}")
emb

Shape of embedding : torch.Size([1, 77, 768])


tensor([[[-0.3884,  0.0229, -0.0523,  ..., -0.4902, -0.3066,  0.0674],
         [ 0.0292, -1.3242,  0.3076,  ..., -0.5254,  0.9766,  0.6655],
         [-1.5928,  0.5063,  1.0771,  ..., -1.5273, -0.8428,  0.1616],
         ...,
         [-1.4707,  0.3098,  1.1670,  ...,  0.3755,  0.5366, -1.5039],
         [-1.4688,  0.2983,  1.1777,  ...,  0.3757,  0.5410, -1.5010],
         [-1.4414,  0.3127,  1.1963,  ...,  0.3516,  0.5405, -1.5498]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<NativeLayerNormBackward0>)

In [9]:
type(emb)

torch.Tensor

In [14]:
#@markdown find a good init image:

orig_prompt = "A dream of a distant galaxy, concept art, matte painting, HQ, 4k" #@param
orig_negative_prompt = "blur, blurred, frame, ugly, low quality" #@param

num_images = 1 #@param
seed = 777777 #@param
num_inference_steps = 30 #@param
guidance_scale = 7 #@param
height = 1024 #@param
width = height #@param


prompt = [orig_prompt] * num_images
negative_prompt = [orig_negative_prompt] * num_images

current_image = PIL.Image.new(mode="RGBA", size=(height, width))
mask_image = np.array(current_image)[:,:,3] 
mask_image = Image.fromarray(255-mask_image).convert("RGB")
current_image = current_image.convert("RGB")

init_images =  pipe(#prompt=prompt,
                    prompt_embeds = emb,
                    negative_prompt=negative_prompt,
                    image=current_image,
                    guidance_scale = guidance_scale,
                    height = height,
                    width = width, 
                    generator = g_cuda.manual_seed(seed),
                    mask_image=mask_image, 
                    num_inference_steps=num_inference_steps)[0]


image_grid(init_images, rows=1, cols=num_images)

TypeError: ignored

In [None]:
image_num_selected = 0 #@param
STEPSIZE = 256 #@param
NUMFRAMES = 10 #@param
NUMINTERPOLFRAMES = 23 #@param 

#ideally STEPSIZE should be a power of 2
#ideally STEPSIZE / (NUMINTERPOLFRAMES + 1) / 2 should be even

current_image = init_images[0]
all_frames = []
all_frames.append(current_image)

for i in range(NUMFRAMES):
  print('Generating image: ' + str(i+1) + ' / ' + str(NUMFRAMES))

  prev_image_fix = current_image

  prev_image = shrink_and_paste_on_blank(current_image, STEPSIZE)

  current_image = prev_image

  #create mask (black image with white STEPSIZE width edges)
  mask_image = np.array(current_image)[:,:,3] 
  mask_image = Image.fromarray(255-mask_image).convert("RGB")

  #inpaint
  current_image = current_image.convert("RGB")
  images = pipe(prompt=prompt,
                negative_prompt=negative_prompt,
                image=current_image,
                guidance_scale = guidance_scale,
                height = height,
                width = width, 
                #generator = g_cuda.manual_seed(seed), #this makes the whole thing deterministic
                mask_image=mask_image, 
                num_inference_steps=num_inference_steps)[0]
  current_image = images[0]
  current_image.paste(prev_image, mask=prev_image)

  #zoom and crop to create interpolated frames between 2 inpainted frames
  scalefactor = ((height - 2*STEPSIZE) / height )**(1/(NUMINTERPOLFRAMES+1))
  for j in range(NUMINTERPOLFRAMES):
    interpol_image = current_image
    pix = round((1-(scalefactor**(NUMINTERPOLFRAMES-j)))*height / 2)
    interpol_image = interpol_image.crop((pix,
                                          pix,
                                          width - pix,
                                          height - pix))

    interpol_image = interpol_image.resize((height, width))

    #fix blur in the middle:
    pix2 = round((height - (height - 2*STEPSIZE)*height/(height-2*pix))/2)
    prev_image_fix_crop = shrink_and_paste_on_blank(prev_image_fix, pix2)
    interpol_image.paste(prev_image_fix_crop, mask = prev_image_fix_crop)

    all_frames.append(interpol_image)

  all_frames.append(current_image)
  clear_output(wait=True)

Generating image: 10 / 10


  0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
#@markdown Check some (equally spaced) frames of the video 
#@markdown (Pretty slow for some reason)
num_of_frames_to_chk = 4 #@param
num_of_frames_to_chk = min(num_of_frames_to_chk, len(all_frames))
idx = np.round(np.linspace(0, len(all_frames) - 1, num_of_frames_to_chk)).astype(int)
image_grid(list(all_frames[i] for i in idx), rows = 1, cols = num_of_frames_to_chk)

In [None]:
write_video("infinite_zoom_out.mp4", all_frames, 24)

TODO

 - Simplify interpolation logic (e.g. scalefactor * pix)
 - Refactor: "fix blur in the middle" == 1st steps in the outer loop
 - Tweak seed (fixed for some frames then shift + det vs nondet)
 - Tweak prompts (drift between prompts)
 - 1024 res


In [None]:
#frame_one = all_frames[0]
#frame_one.save("all_frames.gif", format="GIF", append_images=all_frames[1:], save_all=True, duration=250, loop=0)