<a href="https://colab.research.google.com/github/steinhaug/stable-diffusion/blob/main/smooth_infinite_zoom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Smooth Infinite Zoom v1.1

A user-friendly Colab notebook for generating infinite-zoom loop videos in minutes (works on the free Colab plan).

#### Examples and latest version available here:  
[![Open in Colab](https://img.shields.io/badge/steinhaug-Open%20in%20Colab-blue?logo=google-colab)](https://colab.research.google.com/github/steinhaug/stable-diffusion/blob/main/smooth_infinite_zoom.ipynb)

#### Derived work from: 
[![Open in Colab](https://img.shields.io/badge/v8hid-Open%20in%20Colab-blue?logo=google-colab)](https://colab.research.google.com/github/v8hid/infinite-zoom-stable-diffusion/blob/main/smooth_infinite_zoom.ipynb)



In [None]:
#@markdown CHECK TYPE OF GPU AND VRAM AVAILABLE   <br>
#@markdown The notebook should work fine with the Tesla T4 GPU + 16 GB VRAM available (but to a limited extent) in the free Colab plan. <br>
#@markdown If this raises an error, go to Runtime / Change runtime type and pick Hardware accelerator = GPU and GPU class = Standard.
!nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader

Tesla T4, 15360 MiB, 9308 MiB


In [None]:
master_prompt = "a naked woman, blue and green hair, short haircut, pale skin, slim body, high detailed skin, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3"
master_negati = ""
# Prompt schedule keyed by outpainting step: step i uses the entry with the
# largest key that is <= i.
prompts = {
    0: f"extreme close-up of face of {master_prompt}, background is city ruins",
    5: "background is city ruins",
}
# Per-step negative prompts, looked up with the same key as the prompt in use.
negative_prompts = {
    0: master_negati,
    5: master_negati,
}


## 1.0 Install and setup cells

Cells 1.1 through 1.4 must all be run, in order.

In [None]:
#@markdown 1.1 Setup static settings
from google.colab import drive
from types import SimpleNamespace
import math
def progress(progress):
    """Render a 25-cell text progress bar for a completion percentage.

    :param progress: completion in percent (int or float); values outside
                     0-100 are clamped so the bar always stays 25 cells wide
    :return: a string of the form 'Progress : |<bar>| <n>%', where <n> is the
             percentage rounded up to the next multiple of 4
    """
    # Clamp first: a negative value used to widen the bar and print a negative
    # percentage, and a value above 100 overflowed the 25 cells.
    percent = min(max(progress, 0), 100)
    # One cell per 4 %, partially filled cells round up. Always going through
    # math.ceil also yields an int for float input such as 0.0.
    filled = math.ceil(percent / 4)
    return "Progress : |" + '█' * filled + ' ' * (25 - filled) + '| ' + str(filled * 4) + "%"

def Static():
    """Collect the notebook's static settings and optionally mount Google Drive.

    :return: dict of this function's local settings
             (``mount_google_drive`` and ``output_path``)
    """
    mount_google_drive = True #@param {type:"boolean"}
    if mount_google_drive:
      # Mount at /content/drive: the default output_path below lives under
      # /content/drive/MyDrive, so mounting at /content/gdrive left the output
      # folder on the runtime's local disk instead of on Google Drive.
      drive.mount('/content/drive')
    output_path = "/content/drive/MyDrive/infinite-zoom" #@param {type:"string"}
    return locals()

# Wrap the settings dict so later cells can use attribute access (static.output_path).
static = SimpleNamespace(**Static())

In [None]:
#@markdown 1.2 INSTALL DEPENDENCIES.
from IPython.display import clear_output

print("1/3: Install missing libraries")
# NOTE(review): none of these packages is version-pinned, so every run installs
# whatever release is current - consider pinning versions if reproducibility matters.
%pip install -qq transformers scipy ftfy accelerate
%pip install -qq --upgrade diffusers[torch]

print("2/3: Load necessary libraries")
import PIL
from PIL import Image
import requests
from io import BytesIO
import numpy as np
import random
import cv2
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
from diffusers import StableDiffusionInpaintPipeline, DPMSolverMultistepScheduler
from IPython.display import clear_output
from datetime import datetime
# exist_ok=True replaces the check-then-create pair: it is a no-op when the folder
# already exists and reports an error if the path is occupied by a regular file.
os.makedirs(static.output_path, exist_ok=True)
print("3/3: Define helper functions")
def write_video(file_path, frames, fps, reversed = True, start_frame_dupe_amount = 15, last_frame_dupe_amount = 30):
  """
  Writes frames to an mp4 video file
  :param file_path: Path to output video, must end with .mp4
  :param frames: List of PIL.Image objects (the list itself is left unmodified)
  :param fps: Desired frame rate
  :param reversed: if order of images to be reversed (default = True)
  :param start_frame_dupe_amount: number of extra copies of the first frame written before the sequence
  :param last_frame_dupe_amount: number of extra copies of the last frame written after the sequence
  :raises ValueError: if frames is empty
  :raises IOError: if the video writer could not be opened for file_path
  """
  if not frames:
    raise ValueError("write_video needs at least one frame")

  # Work on a copy: reversing in place used to flip the caller's list, so a second
  # call (or any later use of the same list) silently saw the frames backwards.
  # Slicing is used because the `reversed` parameter shadows the builtin.
  ordered = list(frames)
  if reversed:
    ordered = ordered[::-1]

  def to_bgr(frame):
    """Convert a PIL image to an array in the BGR channel order passed to the writer."""
    return cv2.cvtColor(np.array(frame.convert('RGB')), cv2.COLOR_RGB2BGR)

  w, h = ordered[0].size
  fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
  #fourcc = cv2.VideoWriter_fourcc('h', '2', '6', '4')
  #fourcc = cv2.VideoWriter_fourcc(*'avc1')
  writer = cv2.VideoWriter(file_path, fourcc, fps, (w, h))
  if not writer.isOpened():
    writer.release()
    raise IOError("Could not open video file for writing: " + str(file_path))

  try:
    # start frame duplicated (converted once, written several times)
    first_frame = to_bgr(ordered[0])
    for _ in range(start_frame_dupe_amount):
      writer.write(first_frame)

    for frame in ordered:
      writer.write(to_bgr(frame))

    # last frame duplicated
    last_frame = to_bgr(ordered[-1])
    for _ in range(last_frame_dupe_amount):
      writer.write(last_frame)
  finally:
    # Release even when a frame fails to convert, so the file handle is not leaked.
    writer.release()

def image_grid(imgs, rows, cols):
  """
  Pastes the images into a single rows x cols sheet, filled row by row.
  The cell size is taken from the first image.
  :param imgs: list of PIL.Image objects, exactly rows*cols of them
  :param rows: number of rows in the sheet
  :param cols: number of columns in the sheet
  """
  assert len(imgs) == rows*cols

  cell_w, cell_h = imgs[0].size
  sheet = Image.new('RGB', size=(cols*cell_w, rows*cell_h))

  for position, tile in enumerate(imgs):
      row, col = divmod(position, cols)
      sheet.paste(tile, box=(col*cell_w, row*cell_h))
  return sheet

def shrink_and_paste_on_blank(current_image, mask_width):
  """
  Decreases size of current_image by mask_width pixels from each side,
  then adds a mask_width width transparent frame,
  so that the image the function returns is the same size as the input.
  :param current_image: input image to transform
  :param mask_width: width in pixels to shrink from each side
  :return: RGBA PIL.Image of the same size as current_image
  :raises ValueError: if mask_width is negative or leaves no pixels to keep
  """

  height = current_image.height
  width = current_image.width

  if mask_width < 0 or 2*mask_width >= min(width, height):
    raise ValueError("mask_width must satisfy 0 <= 2*mask_width < min(width, height)")

  #shrink down by mask_width
  #(PIL's resize takes (width, height); passing them swapped only worked for square images)
  prev_image = current_image.resize((width-2*mask_width, height-2*mask_width))
  prev_image = prev_image.convert("RGBA")
  prev_image = np.array(prev_image)

  #create blank canvas: every channel 0 except alpha, which is set to 1
  blank_image = np.zeros((height, width, 4), dtype=np.uint8)
  blank_image[:,:,3] = 1

  #paste shrinked onto blank
  blank_image[mask_width:height-mask_width,mask_width:width-mask_width,:] = prev_image
  prev_image = Image.fromarray(blank_image)

  return prev_image
  
def load_img(address, res=(512, 512)):
    """
    Loads an image from a URL or a local path and returns it as RGB, resized to res.
    :param address: http(s) URL or file system path of the image
    :param res: target size passed to PIL's resize (default = (512, 512))
    :return: resized RGB PIL.Image
    :raises requests.RequestException: if the download fails, times out or returns an HTTP error status
    """
    if address.startswith(('http://', 'https://')):
        # The timeout keeps a stalled server from hanging the cell, and
        # raise_for_status reports an HTTP error instead of handing an error
        # page to Image.open.
        response = requests.get(address, timeout=30)
        response.raise_for_status()
        # Read the fully downloaded body; the raw stream is not content-decoded.
        image = Image.open(BytesIO(response.content))
    else:
        image = Image.open(address)
    image = image.convert('RGB')
    image = image.resize(res, resample=Image.LANCZOS)
    return image

clear_output(); print('[1;32mDone! ✓')

In [None]:
#@markdown 1.3 SET UP DIFFUSION PIPELINE WITH INPAINT MODEL<br><br>
#@markdown Select inpainting model:
# Model id handed to from_pretrained; the Colab form offers the listed ids and also accepts free text.
model_id = 'Uminosachi/revAnimated_v121Inp-inpainting' #@param ["saik0s/realistic_vision_inpainting", "Uminosachi/revAnimated_v121Inp-inpainting", "stabilityai/stable-diffusion-2-inpainting", "runwayml/stable-diffusion-inpainting", "ImNoOne/f222-inpainting-diffusers","parlance/dreamlike-diffusion-1.0-inpainting","ghunkins/stable-diffusion-liberty-inpainting"] {allow-input: true}
# Load the inpainting pipeline for the selected model, requesting float16 as the torch dtype.
pipe = StableDiffusionInpaintPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
)
# Swap in a DPMSolverMultistepScheduler built from the loaded scheduler's config.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
# Move the pipeline to the CUDA device (this cell needs a GPU runtime).
pipe = pipe.to("cuda")
def dummy(images, **kwargs):
    """Pass-through stand-in for the safety checker: hands the images back
    untouched together with the string 'False'. Extra keyword arguments are ignored."""
    flag = str(False)
    return images, flag
#pipe.safety_checker = dummy
# Remove the pipeline's safety checker instead of replacing it with the pass-through above.
pipe.safety_checker = None
pipe.enable_attention_slicing() #This is useful to save some memory in exchange for a small speed decrease.

# CUDA random generator; later cells seed it with g_cuda.manual_seed(...) before passing it to the pipeline.
g_cuda = torch.Generator(device='cuda')  
clear_output(); print('[1;32mDone! ✓')

In [None]:
#@markdown 1.4 DIFFUSION SETTINGS: <br>
#@markdown (Image output of this block will be the last image of the video)

prompt = prompts[0]

#@markdown Appended to all possible negative prompts in array above:
negative_prompt = "montage, frame, text, ugly, blur" #@param {type:"string"}

#@markdown Number of initial example images to generate:
num_init_images = 1 #@param
#@markdown Random seed (arbitrary input to make the initial image generation deterministic):
seed = 9999 #@param
#@markdown  The number of denoising steps (a higher number usually leads to a higher quality image at the expense of slower inference):
num_inference_steps = 30 #@param
#@markdown Guidance scale defines how closely generated images are linked to the text prompt:
guidance_scale = 6 #@param
#@markdown Height (and width) of the images in pixels (= resolution of the video generated in the next block, has to be divisible by 8):
height = 512 #@param
width = height
#@markdown Since the model was trained on 512 images increasing the resolution to e.g. 1024 will
#@markdown drastically reduce its imagination, so the video will vary a lot less compared to 512

if height % 8 != 0:
  raise ValueError(f"height must be divisible by 8, got {height}")

# Negative prompt for the initial image: the step-0 entry of negative_prompts (if any)
# followed by the global negative prompt, joined with ', ' so the terms do not fuse.
init_negative_prompt = ", ".join(part for part in (negative_prompts.get(0, ""), negative_prompt) if part)

# Fully transparent canvas (PIL sizes are (width, height)). Its alpha channel is 0
# everywhere, so 255 - alpha is an all-white mask and the whole image gets generated.
current_image = PIL.Image.new(mode="RGBA", size=(width, height))
mask_image = np.array(current_image)[:,:,3]
mask_image = Image.fromarray(255-mask_image).convert("RGB")
current_image = current_image.convert("RGB")

print(f'Prompt: {prompt}')
print(f'Negative prompt: {init_negative_prompt}')

# num_images_per_prompt requests num_init_images candidates. The previous
# prompt*num_init_images only repeated the prompt text inside a single string, which
# still produced one image and made image_grid's length assertion fail for values > 1.
init_images =  pipe(prompt=prompt,
                    negative_prompt=init_negative_prompt,
                    num_images_per_prompt=num_init_images,
                    image=current_image,
                    guidance_scale = guidance_scale,
                    height = height,
                    width = width,
                    generator = g_cuda.manual_seed(seed),
                    mask_image=mask_image,
                    num_inference_steps=num_inference_steps)[0]

image_grid(init_images, rows=1, cols=num_init_images)

#@markdown We shrink the init image from the previous block and outpaint its outer frame using the same concept defined above (e.g. prompt, negative prompt, inference steps) but with a different seed. To generate an "infinite zoom" video this is repeated **num_outpainting_steps** times and then rendered in reversed order.<br><br>
#@markdown To keep the outpainted part coherent and full of new content its width has to be relatively large (e.g. **mask_width** = 128 pixels if resolution is 512*512).<br><br>
#@markdown This, on the other hand, means that the generated video would be too fast and aesthetically unpleasant. To slow down and smooth the video we generate **num_interpol_frames** additional images between outpainted images using simple "interpolation".<br><br>
#@markdown Notes:<br><ul><li>Length of the video is proportional to num_outpainting_steps * num_interpol_frames.</li>
#@markdown <li>The time to generate the video is proportional to num_outpainting_steps.</li><li>On a T4 GPU it takes about ~7 minutes to generate the video of width = 512, num_inference_steps = 20, num_outpainting_steps = 100. With fps = 24 and num_interpol_frames = 24 the video will be about 1:40 minutes long.</li></ul>


## 2.0 Generate Video and keyframe cells

In [None]:
#@markdown 2.1 GENERATE VIDEO - REQUIRES USER FEEDBACK WHILE RENDERING:<br>
#@markdown For each keyframe you must pick one of the three generated images by typing 1, 2 or 3. Any other input is used as the new negative prompt and the keyframe is re-rendered, until you pick 1, 2 or 3.
skip_manual_guideance = False  #@param {type:"boolean"}

from PIL import Image, ImageOps
import os.path
from IPython.display import display
#zoom_name = "ladyzoom" #@param {type:"string"}
#zoom_path = "/content/gdrive/MyDrive/infinite-zoom" #@param {type:"string"}

#@markdown Pick an initial image from the previous block for your video: <br> (This is only relevant if num_init_images > 1)
init_image_selected = 1 #@param
# Convert the 1-based form value into a 0-based index, clamped to the images that
# were generated, so an out-of-range choice cannot raise IndexError in the next cell.
# With a single init image this always yields 0.
init_image_selected = min(max(init_image_selected, 1), max(num_init_images, 1)) - 1
custom_init_image = False #@param {type:"boolean"}
init_image_address = "/content/drive/MyDrive/infinite-zoom/image.png"#@param {type:"string"}
#@markdown Number of outpainting steps:
num_outpainting_steps = 10 #@param
#@markdown Width of the border in pixels to be outpainted during each step:
#@markdown <br> (make sure: mask_width < image resolution / 2)
mask_width = 128 #@param
#@markdown Number of images to be interpolated between each outpainting step:
num_interpol_frames = 30 #@param 


In [None]:
#@markdown 2.2 GENERATE VIDEO - REQUIRES USER FEEDBACK WHILE RENDERING:

# matplotlib is otherwise only imported by the optional debugging cell further down,
# so without this import the candidate preview raises NameError on a fresh kernel.
import matplotlib.pyplot as plt

if mask_width < 0 or 2 * mask_width >= min(width, height):
  raise ValueError("mask_width must be non-negative and smaller than half the image resolution")


def _join_prompt_parts(*parts):
  """Join the non-empty prompt fragments with ', ' so neighbouring terms do not fuse."""
  return ", ".join(part for part in parts if part)


def _outpaint(prompt_text, negative_text, image, mask, generator=None):
  """Run one inpainting pass with the notebook's diffusion settings and return the first image."""
  return pipe(prompt=prompt_text,
              negative_prompt=negative_text,
              image=image,
              guidance_scale=guidance_scale,
              height=height,
              width=width,
              generator=generator,
              mask_image=mask,
              num_inference_steps=num_inference_steps)[0][0]


if custom_init_image:
  current_image = load_img(init_image_address, (width, height))
else:
  current_image = init_images[init_image_selected]
all_frames = [current_image]

# One entry per outpainting step, consumed by the debugging cells.
log_all = []

for i in range(num_outpainting_steps):
  print('Generating image: ' + str(i+1) + ' / ' + str(num_outpainting_steps))

  prev_image_fix = current_image

  prev_image = shrink_and_paste_on_blank(current_image, mask_width)

  current_image = prev_image
  log_current_image = current_image

  #create mask (black image with white mask_width width edges)
  mask_image = np.array(current_image)[:,:,3]
  mask_image = Image.fromarray(255-mask_image).convert("RGB")

  # The prompt in effect is the entry with the largest key that is <= the current step.
  active_keys = [k for k in prompts if k <= i]
  if not active_keys:
    raise ValueError(f'prompts has no entry for step {i}; it needs at least a key 0')
  curr_key = max(active_keys)
  curr_prompt = prompts[curr_key]
  # Per-step negative prompt (if any) followed by the global one. Joined with ', ',
  # since plain concatenation glued the last and first terms into one word.
  curr_negative_prompt = _join_prompt_parts(negative_prompts.get(curr_key, ""), negative_prompt)

  this_negative_prompt = curr_negative_prompt
  while True:
    candidate1 = _outpaint(curr_prompt, this_negative_prompt, current_image, mask_image)
    if skip_manual_guideance:
      current_image = candidate1
      break

    # Two more candidates with fixed seeds, offered next to the unseeded one.
    candidates = {
        '1': candidate1,
        '2': _outpaint(curr_prompt, this_negative_prompt, current_image, mask_image,
                       generator=g_cuda.manual_seed(666)),
        '3': _outpaint(curr_prompt, this_negative_prompt, current_image, mask_image,
                       generator=g_cuda.manual_seed(seed)),
    }

    clear_output(wait=True)
    current_image_with_border = ImageOps.expand(current_image,border=1,fill='black')
    display(current_image_with_border.resize((256, 256)))

    fig, axes = plt.subplots(1, 3, figsize=(12, 4), gridspec_kw={'hspace': 0, 'wspace': 0}, sharex=True, sharey=True)
    for ax, (label, candidate) in zip(axes, candidates.items()):
      ax.imshow(candidate); ax.axis('off'); ax.set_title(label)
    plt.show()

    print(f'Negative prompt: {this_negative_prompt}')
    what_image = input(f'Frame {i}: 1, 2 or 3? Or negative prompt: ').strip()
    if what_image in candidates:
      current_image = candidates[what_image]
      break

    # Anything else becomes the negative prompt for the next attempt; the prompt
    # itself is only replaced when the new one is longer than 3 characters.
    this_negative_prompt = what_image
    print(f'Prompt: {curr_prompt}')
    new_prompt = input(f'New prompt: ')
    if len(new_prompt) > 3:
      curr_prompt = new_prompt

  current_image.paste(prev_image, mask=prev_image)

  # Fallbacks so that the log entry and the preview below also work when
  # num_interpol_frames <= 1 and the interpolation loop never runs.
  interpol_image = interpol_image_raw1 = interpol_image_raw2 = current_image

  #interpolation steps between 2 inpainted images (=sequential zoom and crop)
  for j in range(num_interpol_frames - 1):
    interpol_width = round(
        (1- ( 1-2*mask_width/height )**( 1-(j+1)/num_interpol_frames ) )*height/2
        )
    cropped = current_image.crop((interpol_width,
                                  interpol_width,
                                  width - interpol_width,
                                  height - interpol_width))
    interpol_image = cropped.resize((width, height))

    #paste the higher resolution previous image in the middle to avoid drop in quality caused by zooming
    interpol_width2 = round(
        ( 1 - (height-2*mask_width) / (height-2*interpol_width) ) / 2*height
    )
    prev_image_fix_crop = shrink_and_paste_on_blank(prev_image_fix, interpol_width2)
    interpol_image.paste(prev_image_fix_crop, mask = prev_image_fix_crop)

    if j == 0:
      # Keep the first crop and overlay of this step for the debugging cells.
      interpol_image_raw1 = cropped
      interpol_image_raw2 = prev_image_fix_crop

    all_frames.append(interpol_image)

  # Log the prompts that actually produced the chosen image, i.e. including any
  # negative prompt typed in during manual guidance.
  log_all.append([
      log_current_image,
      current_image,
      interpol_image_raw1,
      interpol_image_raw2,
      interpol_image,
      curr_prompt,
      this_negative_prompt
  ])

  all_frames.append(current_image)
  clear_output(wait=True)
  interpol_image.show()
  #if tmp_i == 8:
  #  print('Temp break, tmp_i')
  #  break


### Debugging cells (no need to run these)

In [None]:
#@markdown Debug prompts and negative prompts
from IPython.display import display
import matplotlib.pyplot as plt
from PIL import Image, ImageOps, ImageDraw

# Preview sizes for the five logged images of each step, in log order.
preview_sizes = ((256, 256), (256, 256), (131, 131), (256, 256), (256, 256))

# The logged prompts get their own names (log_prompt / log_negative_prompt): unpacking
# them into `prompt` / `negative_prompt` overwrote the notebook-level settings from
# cell 1.4, so a later re-run of cell 2.2 used the wrong negative prompt.
for frame_no, (img1, img2, image1, image2, image3, log_prompt, log_negative_prompt) in enumerate(log_all):
    fig, axes = plt.subplots(1, 5, figsize=(15, 3), gridspec_kw={'hspace': 0, 'wspace': 0}, sharex=True, sharey=True)
    for ax, panel, preview_size in zip(axes, (img1, img2, image1, image2, image3), preview_sizes):
        panel_w, panel_h = panel.size
        # The title shows the logged image's real size, the picture itself is a downscaled preview.
        ax.imshow(panel.resize(preview_size)); ax.axis('off'); ax.set_title(str(panel_w) + ' x ' + str(panel_h))
    plt.show()
    print(f'Frame {frame_no}')
    print(f'Prompt: {log_prompt}')
    print(f'Negative prompt: {log_negative_prompt}')


In [None]:
#@markdown Debug - Re-render frames
frame_to_rerender = 4 #@param {type:"number"}
pre_negpro = ""

# Imported here as well so this cell does not depend on the other debugging cell.
import matplotlib.pyplot as plt

if 0 <= frame_to_rerender < len(log_all):
    # Index the log directly instead of scanning it. The logged prompts get their own
    # names so the notebook-level `prompt` / `negative_prompt` settings stay untouched.
    img1, img2, image1, image2, image3, log_prompt, log_negative_prompt = log_all[int(frame_to_rerender)]

    if log_negative_prompt == 'negative_prompt':
        log_negative_prompt = ''

    preview_sizes = ((256, 256), (256, 256), (131, 131), (256, 256), (256, 256))
    fig, axes = plt.subplots(1, 5, figsize=(15, 3), gridspec_kw={'hspace': 0, 'wspace': 0}, sharex=True, sharey=True)
    for ax, panel, preview_size in zip(axes, (img1, img2, image1, image2, image3), preview_sizes):
        panel_w, panel_h = panel.size
        ax.imshow(panel.resize(preview_size)); ax.axis('off'); ax.set_title(str(panel_w) + ' x ' + str(panel_h))
    plt.show()

    # Rebuild the mask from the logged input's alpha channel, the same way cell 2.2
    # does, instead of relying on whatever `mask_image` was left over from that cell.
    rerender_mask = Image.fromarray(255 - np.array(img1)[:,:,3]).convert("RGB")

    # Three unseeded re-renders of the logged input with the logged prompts.
    rerendered = [
        pipe(prompt=log_prompt,
             negative_prompt=pre_negpro + log_negative_prompt,
             image=img1,
             guidance_scale = guidance_scale,
             height = height,
             width = width,
             mask_image=rerender_mask,
             num_inference_steps=num_inference_steps)[0][0]
        for _ in range(3)
    ]

    fig, axes = plt.subplots(1, 3, figsize=(15, 5), gridspec_kw={'hspace': 0, 'wspace': 0}, sharex=True, sharey=True)
    # The titles used `zoom_name`, which is only defined in a commented-out line and
    # therefore raised NameError; label the variants by frame number instead.
    for ax, label, variant in zip(axes, 'ABC', rerendered):
        ax.imshow(variant); ax.axis('off'); ax.set_title(f'Frame {int(frame_to_rerender)} {label}')
    plt.show()
else:
    print(f'Nothing to re-render: frame {frame_to_rerender} is not in the log ({len(log_all)} frames logged)')


## 3.0 Finalize Video, generate and save your video

In [None]:
#@markdown 3.1 CREATE VIDEO, RENDER MP4 FRAMES.
video_file_name = "manual_infinite_zoom" #@param {type:"string"}
#@markdown frames per second:
fps = 30 #@param
now = datetime.now()
date_time = now.strftime("%m-%d-%Y_%H-%M-%S")
#@markdown Duplicates the first and last frames, use to add a delay before animation based on playback fps (15 = 0.5 seconds @ 30fps)
start_frame_dupe_amount = 15 #@param
last_frame_dupe_amount = 15 #@param
# Render the frames in stored order ("out") and in reversed order ("in"). Each call
# receives its own copy of the frame list, so all_frames keeps its order for the
# frame check in cell 3.2 and for re-runs of this cell.
for direction, reverse_frames in (("out", False), ("in", True)):
    video_path = os.path.join(static.output_path, video_file_name + "_" + direction + "_" + date_time + ".mp4")
    write_video(video_path, list(all_frames), fps, reverse_frames, start_frame_dupe_amount, last_frame_dupe_amount)
#@markdown Once this block is finished, download your video from the "Files" folder menu on the left (output_path).
# output_path only exists as an attribute of `static`; the bare name raised NameError.
print(f'Video saved in: {static.output_path}')


In [None]:
#@markdown 3.2 CHECK SOME (equally spaced) FRAMES OF THE VIDEO:
num_of_frames_to_chk = 4 #@param
# Never ask for more previews than there are frames.
num_of_frames_to_chk = min(num_of_frames_to_chk, len(all_frames))
# Equally spaced positions from the first to the last frame, rounded to whole indices.
sample_positions = np.linspace(0, len(all_frames) - 1, num_of_frames_to_chk)
idx = np.round(sample_positions).astype(int)
image_grid([all_frames[k] for k in idx], rows = 1, cols = num_of_frames_to_chk)
#@markdown (This is relatively slow, but in some cases still faster than downloading the complete video from the previous block)
