In [None]:
import os
import re

import matplotlib.pyplot as plt
import torch
from diffusers import DPMSolverMultistepScheduler
from diffusers.utils import export_to_video

from src.hooked_model.hooked_model_videofusion import HookedDiffusionModel
from src.hooked_model.hooks import AblateHook


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Checkpoint identifier handed to `HookedDiffusionModel.from_pretrained` when the pipeline is built.
model_name: str = "damo-vilab/text-to-video-ms-1.7b"


### How to register an ablation hook and use it during inference

In [5]:
# Build the hooked text-to-video pipeline in half precision.
# The pipeline is intentionally NOT moved to CUDA by hand: `enable_model_cpu_offload()`
# manages device placement itself, and moving everything to the GPU first largely
# defeats the memory saving it is meant to provide.
# NOTE(review): assumes HookedDiffusionModel exposes the standard diffusers pipeline
# API for `scheduler` / `enable_model_cpu_offload` — confirm against the class.
pipe = HookedDiffusionModel.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    variant="fp16",
)
# Swap the default scheduler for DPM-Solver++ multistep, reusing the existing config.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()
# pipe.enable_vae_slicing()

Loading pipeline components...: 100%|██████████| 5/5 [00:00<00:00, 17.05it/s]


In [4]:
# Display the UNet's module tree; the module names shown here are what the
# hookpoint regexes in the following cell are matched against.
pipe.unet

UNet3DConditionModel(
  (conv_in): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (time_proj): Timesteps()
  (time_embedding): TimestepEmbedding(
    (linear_1): Linear(in_features=320, out_features=1280, bias=True)
    (act): SiLU()
    (linear_2): Linear(in_features=1280, out_features=1280, bias=True)
  )
  (transformer_in): TransformerTemporalModel(
    (norm): GroupNorm(32, 320, eps=1e-06, affine=True)
    (proj_in): Linear(in_features=320, out_features=512, bias=True)
    (transformer_blocks): ModuleList(
      (0): BasicTransformerBlock(
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn1): Attention(
          (to_q): Linear(in_features=512, out_features=512, bias=False)
          (to_k): Linear(in_features=512, out_features=512, bias=False)
          (to_v): Linear(in_features=512, out_features=512, bias=False)
          (to_out): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            

In [49]:
def _matching_module_names(model, regex):
    """Return the names from `model.named_modules()` that match `regex`, in iteration order."""
    compiled = re.compile(regex)
    return [name for name, _ in model.named_modules() if compiled.match(name)]


# Module names ending in `.attentions.<idx>`.
hookpoints = _matching_module_names(pipe.unet, r".*\.attentions\.(\d+)$")

# Module names ending in `.temp_attentions.<idx>`.
# These used to be appended to `hookpoints` by mistake, which left
# `hookpoints_temp` permanently empty.
hookpoints_temp = _matching_module_names(pipe.unet, r".*\.temp_attentions\.(\d+)$")

# Same printed listing as before: `.attentions` names first, then `.temp_attentions`.
for name in hookpoints + hookpoints_temp:
    print(name)


down_blocks.0.attentions.0
down_blocks.0.attentions.1
down_blocks.1.attentions.0
down_blocks.1.attentions.1
down_blocks.2.attentions.0
down_blocks.2.attentions.1
up_blocks.1.attentions.0
up_blocks.1.attentions.1
up_blocks.1.attentions.2
up_blocks.2.attentions.0
up_blocks.2.attentions.1
up_blocks.2.attentions.2
up_blocks.3.attentions.0
up_blocks.3.attentions.1
up_blocks.3.attentions.2
mid_block.attentions.0
down_blocks.0.temp_attentions.0
down_blocks.0.temp_attentions.1
down_blocks.1.temp_attentions.0
down_blocks.1.temp_attentions.1
down_blocks.2.temp_attentions.0
down_blocks.2.temp_attentions.1
up_blocks.1.temp_attentions.0
up_blocks.1.temp_attentions.1
up_blocks.1.temp_attentions.2
up_blocks.2.temp_attentions.0
up_blocks.2.temp_attentions.1
up_blocks.2.temp_attentions.2
up_blocks.3.temp_attentions.0
up_blocks.3.temp_attentions.1
up_blocks.3.temp_attentions.2
mid_block.temp_attentions.0


In [6]:
# Active prompt / negative prompt consumed by the generation cells.
prompt = "Darth vader surfing in waves."
neg_prompt = "glitchy animated unrealistic"

# Alternative prompt / negative-prompt combinations (commented out; uncomment to use):
# prompt = "Spiderman is surfing. Darth Vader is also surfing and following Spiderman"
#
# prompt = "A fast zoom out on a squirrel eating a pinecone on a branch."
# neg_prompt = "zoom in"
#
# prompt = "First-person hiking view showing steps, hands moving branches, and a valley vista."
#
# prompt = "A fast moving red sports car driving on a the road next to a cliff."
# neg_prompt = "slow"
#
# prompt = "A slow zoom out on a pineapple on a plate."
# neg_prompt = "zoom in"
#
# prompt = "A fast zoom in on a red sports car parked next to a road."
# neg_prompt = "zoom out"
#
# prompt = "An astronaut riding a horse, realistic video."
# neg_prompt = "glitchy animated unrealistic"
#
# prompt = "An astronaut riding a horse, in a cartoon."
# neg_prompt = "glitchy realistic"
#
# prompt = "An astronaut riding a horse, in a distorted video with glitchy waves, visual artifacts, pixel scrambling, and digital noise."
# neg_prompt = "coherent realistic photorealistic smooth"

In [8]:
# Baseline generation (no hooks) with a fixed seed, exported to disk as a video.
baseline_path = "samples/videofusion/darth_vader.mp4"
# Create the target directory up front so the export cannot fail (or silently
# write nothing, depending on the video backend) because the folder is missing.
os.makedirs(os.path.dirname(baseline_path), exist_ok=True)

video_frames = pipe(
    prompt,
    negative_prompt=neg_prompt,
    num_inference_steps=25,
    num_frames=30,
    generator=torch.Generator(device="cuda").manual_seed(90),
).frames[0]  # first entry of the returned `frames`
video_path = export_to_video(video_frames, baseline_path)

100%|██████████| 25/25 [00:07<00:00,  3.45it/s]


In [9]:
# Export the current `video_frames` again under a second path.
# The target directory is created first so the export does not depend on it already existing.
strobe_path = "samples/videofusion/darth_vader_surfing/astronaut_strobe.mp4"
os.makedirs(os.path.dirname(strobe_path), exist_ok=True)
video_path = export_to_video(video_frames, strobe_path)

It is recommended to use `export_to_video` with `imageio` and `imageio-ffmpeg` as a backend. 
These libraries are not present in your environment. Attempting to use legacy OpenCV backend to export video. 
Support for the OpenCV backend will be deprecated in a future Diffusers version


In [11]:
# NOTE(review): nothing in this cell appends to `all_images`, so the plotting cell
# that consumes it receives an empty list — decide which frames should be collected.
all_images = []

# NOTE(review): the folder name mentions "squirrel_zoom_in" while the active prompt
# may be a different one — confirm the intended output location.
ablation_dir = "samples/ablation/squirrel_zoom_in"
os.makedirs(ablation_dir, exist_ok=True)  # make sure the export target exists

num_seeds = 10

# For every hookpoint, ablate it with `AblateHook` and generate one video per seed.
# The loop variables are named distinctly; previously the inner seed index reused
# (and shadowed) the outer enumerate index.
for hookpoint in hookpoints + hookpoints_temp:
    for seed in range(num_seeds):
        video_frames = pipe.run_with_hooks(
            prompt,
            num_inference_steps=50,
            # num_frames=200,
            position_hook_dict={hookpoint: AblateHook()},
            generator=torch.Generator(device="cuda").manual_seed(seed),
        ).frames[0]

        video_path = export_to_video(video_frames, f"{ablation_dir}/{hookpoint}_seed{seed}.mp4")

  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [00:06<00:00,  7.54it/s]
It is recommended to use `export_to_video` with `imageio` and `imageio-ffmpeg` as a backend. 
These libraries are not present in your environment. Attempting to use legacy OpenCV backend to export video. 
Support for the OpenCV backend will be deprecated in a future Diffusers version
100%|██████████| 50/50 [00:06<00:00,  7.76it/s]
It is recommended to use `export_to_video` with `imageio` and `imageio-ffmpeg` as a backend. 
These libraries are not present in your environment. Attempting to use legacy OpenCV backend to export video. 
Support for the OpenCV backend will be deprecated in a future Diffusers version
100%|██████████| 50/50 [00:06<00:00,  8.10it/s]
It is recommended to use `export_to_video` with `imageio` and `imageio-ffmpeg` as a backend. 
These libraries are not present in your environment. Attempting to use legacy OpenCV backend to export video. 
Support for the OpenCV backend will be deprecated in a future Diffusers version
1

In [None]:
def display_images(all_images, hookpoints, images_per_row=4):
    """Show a grid of images with one row per entry of `all_images`.

    Args:
        all_images: Sequence of rows; each row is a sequence of images accepted
            by `Axes.imshow`. Only the first `images_per_row` images of a row
            are drawn.
        hookpoints: Labels for the rows; `hookpoints[i]` becomes the title of
            the first image in row `i`. Rows without a matching label are left
            untitled.
        images_per_row: Number of columns in the grid.

    Returns:
        None. The figure is rendered with `plt.show()`. If there is nothing to
        draw (no rows, or fewer than one column) a short notice is printed and
        no figure is created.
    """
    rows = len(all_images)
    if rows == 0 or images_per_row < 1:
        # `plt.subplots` rejects a zero-sized grid, so bail out early.
        print("display_images: nothing to display.")
        return

    # squeeze=False keeps `axes` two-dimensional for every grid shape, so the
    # single-row and single-column cases need no special indexing.
    fig, axes = plt.subplots(
        rows,
        images_per_row,
        figsize=(images_per_row * 3, rows * 3),
        squeeze=False,
    )
    fig.subplots_adjust(hspace=0.5, wspace=0.5)  # Adjust space between rows and columns

    # Hide every axis up front so cells without an image stay blank instead of
    # showing empty tick frames.
    for ax in axes.flat:
        ax.axis("off")

    for i, row_images in enumerate(all_images):
        for j, image in enumerate(row_images[:images_per_row]):
            ax = axes[i, j]
            ax.imshow(image)
            # Title only the first image of the row, and only if a label exists.
            if j == 0 and i < len(hookpoints):
                ax.set_title(hookpoints[i])
    plt.tight_layout()
    plt.show()


# Label rows with the same hookpoint sequence the ablation loop iterates over
# (`hookpoints + hookpoints_temp`), so row i and label i refer to the same module.
display_images(all_images, hookpoints + hookpoints_temp)
