In [1]:
import re
from pathlib import Path

import matplotlib.pyplot as plt
import torch
from diffusers.utils import export_to_video

from src.hooked_model.hooked_model_ltxvideo import HookedDiffusionModel
from src.hooked_model.hooks import AblateHook


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier handed to HookedDiffusionModel.from_pretrained in the next cell.
# The commented-out line is an alternative identifier kept for reference.
# model_name = "Lightricks/LTX-Video"
model_name = "a-r-r-o-w/LTX-Video-0.9.1-diffusers"


### How to register an ablation hook and use it during inference

In [3]:
# Load the hooked pipeline with bfloat16 weights.
pipe = HookedDiffusionModel.from_pretrained(model_name, torch_dtype=torch.bfloat16)
# Let model CPU offload own device placement. The pipeline is deliberately NOT
# moved to "cuda" first: that would require enough GPU memory for every
# component at once, which is what offloading is meant to avoid.
# NOTE(review): assumes HookedDiffusionModel forwards enable_model_cpu_offload()
# to the underlying diffusers pipeline unchanged — confirm.
pipe.enable_model_cpu_offload()

Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.37s/it]
Loading pipeline components...: 100%|██████████| 5/5 [00:10<00:00,  2.00s/it]


In [4]:
# Display the transformer's module tree; the submodule names printed here are
# what the hookpoint regex in the following cell is matched against.
pipe.transformer

LTXVideoTransformer3DModel(
  (proj_in): Linear(in_features=128, out_features=2048, bias=True)
  (time_embed): AdaLayerNormSingle(
    (emb): PixArtAlphaCombinedTimestepSizeEmbeddings(
      (time_proj): Timesteps()
      (timestep_embedder): TimestepEmbedding(
        (linear_1): Linear(in_features=256, out_features=2048, bias=True)
        (act): SiLU()
        (linear_2): Linear(in_features=2048, out_features=2048, bias=True)
      )
    )
    (silu): SiLU()
    (linear): Linear(in_features=2048, out_features=12288, bias=True)
  )
  (caption_projection): PixArtAlphaTextProjection(
    (linear_1): Linear(in_features=4096, out_features=2048, bias=True)
    (act_1): GELU(approximate='tanh')
    (linear_2): Linear(in_features=2048, out_features=2048, bias=True)
  )
  (rope): LTXVideoRotaryPosEmbed()
  (transformer_blocks): ModuleList(
    (0-27): 28 x LTXVideoTransformerBlock(
      (norm1): RMSNorm()
      (attn1): Attention(
        (norm_q): RMSNorm()
        (norm_k): RMSNorm()
    

In [17]:
# Collect the qualified names of every attention submodule inside the
# transformer blocks (e.g. "transformer_blocks.0.attn1"); these names are the
# hookpoints used by the ablation loop further down.
#
# Both separators are escaped so that only a literal "." matches; an unescaped
# "." before "attn" would accept any character in that position.
pattern = re.compile(r".*transformer_blocks\.(\d+)\.attn(\d+)$")

hookpoints = [
    name for name, _ in pipe.transformer.named_modules() if pattern.match(name)
]

# Echo the names, one per line, so the selection can be checked by eye.
for name in hookpoints:
    print(name)


transformer_blocks.0.attn1
transformer_blocks.0.attn2
transformer_blocks.1.attn1
transformer_blocks.1.attn2
transformer_blocks.2.attn1
transformer_blocks.2.attn2
transformer_blocks.3.attn1
transformer_blocks.3.attn2
transformer_blocks.4.attn1
transformer_blocks.4.attn2
transformer_blocks.5.attn1
transformer_blocks.5.attn2
transformer_blocks.6.attn1
transformer_blocks.6.attn2
transformer_blocks.7.attn1
transformer_blocks.7.attn2
transformer_blocks.8.attn1
transformer_blocks.8.attn2
transformer_blocks.9.attn1
transformer_blocks.9.attn2
transformer_blocks.10.attn1
transformer_blocks.10.attn2
transformer_blocks.11.attn1
transformer_blocks.11.attn2
transformer_blocks.12.attn1
transformer_blocks.12.attn2
transformer_blocks.13.attn1
transformer_blocks.13.attn2
transformer_blocks.14.attn1
transformer_blocks.14.attn2
transformer_blocks.15.attn1
transformer_blocks.15.attn2
transformer_blocks.16.attn1
transformer_blocks.16.attn2
transformer_blocks.17.attn1
transformer_blocks.17.attn2
transformer_

In [15]:
# prompt = "Spiderman is surfing. Darth Vader is also surfing and following Spiderman"
# prompt = "Darth vader surfing in waves." 
# prompt = "A cinematic video of a figure resembling Darth Vader, wearing a black cape and helmet, skillfully surfing powerful ocean waves. The dark figure carves through the water on a sleek black surfboard, cape billowing dramatically in the wind. The sky is cloudy with a hint of sunset glow, casting an epic atmosphere. Water splashes intensely as the figure maintains balance with calm precision. The iconic dark armor glistens under the fading sunlight, evoking a sense of power and mystery."
# prompt = "A vibrant, hand-painted animation in the style of Van Gogh's Starry Night. A lone traveler walks down a winding cobblestone street in a swirling, dreamlike village. The sky glows with swirling patterns of deep blue, gold, and violet, while warm yellow streetlights flicker like dancing flames. The traveler, dressed in a flowing coat, pauses to gaze at the mesmerizing sky. The entire scene feels alive, with brushstroke-like textures moving dynamically."
# negative_prompt = "Photorealism, sharp details, flat colors, smooth shading, modern cityscapes, digital artifacts, unnatural lighting, robotic movement, sterile environments."
# prompt = "A futuristic cyberpunk city at night, bathed in glowing neon lights of magenta, cyan, and electric blue. Sleek motorcycles weave through the streets, their riders clad in high-tech armor. Towering skyscrapers are covered in holographic billboards flashing with digital ads. The air shimmers with faint rain, reflecting vibrant lights off the wet pavement. A mysterious figure in a trench coat walks down a narrow alley, their face illuminated by a flickering hologram."
# negative_prompt = "Pastel colors, daylight, rural scenery, low contrast, naturalistic textures, cartoonish characters, minimal detail."
# prompt = "A surfer rides a powerful wave in a realistic style. Detailed water splashes, natural lighting, and lifelike motion capture the energy of the ocean."
# negative_prompt = "Cartoonish elements, exaggerated motion, flat colors, unnatural lighting, digital artifacts, painterly textures."
# prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
# prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is bright and colorful, with exaggerated shading and smooth, bold outlines. The colors are vibrant, with warm yellows and oranges creating a cheerful, playful tone. The scene appears to be animated in a cartoon style."
# prompt = "A pixelated woman with long brown hair and light skin smiles at another pixelated woman with long blonde hair. The woman with brown hair wears a blocky black jacket and has a small, barely noticeable pixel marking her right cheek. The camera angle is a close-up, focused on the woman with brown hair's pixelated face. The lighting is simplified, with bright pixels forming a warm glow on her face. The scene appears to be in a retro 8-bit pixel art style."
# prompt = "A distorted, glitchy image of a woman with long brown hair and light skin shows her smiling at another woman with long blonde hair. The woman with brown hair wears a black jacket, and a faint digital artifact marks her right cheek like a small mole. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is chaotic, with flickering bands of color and digital noise disrupting the scene. Glitches ripple across the image, distorting faces and warping movement. The scene appears heavily corrupted in a glitch art style."
# prompt = "A woman with long brown hair and light skin smiles slowly at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair’s face. The lighting is warm and natural, with sunlight casting a soft glow. The motion is slowed down, making the smile unfold gradually, emphasizing subtle facial expressions and the shimmer of light in her eyes. The scene appears to be real-life footage captured in slow motion."
# Active prompt (the commented-out assignments above are alternative prompts).
# NOTE(review): the recorded output of the generation cell reports that this
# prompt is truncated because `max_sequence_length` is 128 tokens (the trailing
# 'effect.' is dropped) — shorten the prompt or confirm the truncation is acceptable.
prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair’s face. The lighting is warm and natural, with flickering pulses of light creating a strobe effect. The smile appears fragmented, jumping from one expression to the next with brief pauses in between. The scene appears to be real-life footage with a strobe lighting effect."

In [16]:
# Baseline generation: run the pipeline on the active prompt without any hooks
# and save the result as an .mp4.
baseline_video_path = "samples/ltxvideo/2_woman_talking/strobe.mp4"
# Make sure the destination folder exists before the exporter tries to write.
Path(baseline_video_path).parent.mkdir(parents=True, exist_ok=True)

video = pipe(
    prompt=prompt,
    # negative_prompt=negative_prompt,
    width=768,
    height=512,
    num_frames=161,
    num_inference_steps=50,
).frames[0]

# Last expression of the cell: the path returned by export_to_video is shown
# as the cell output.
export_to_video(video, baseline_video_path, fps=24)

The following part of your input was truncated because `max_sequence_length` is set to  128 tokens: ['effect.']


100%|██████████| 50/50 [00:57<00:00,  1.16s/it]
It is recommended to use `export_to_video` with `imageio` and `imageio-ffmpeg` as a backend. 
These libraries are not present in your environment. Attempting to use legacy OpenCV backend to export video. 
Support for the OpenCV backend will be deprecated in a future Diffusers version


'samples/ltxvideo/2_woman_talking/strobe.mp4'

In [None]:
# Ablate one attention hookpoint at a time, export each resulting video, and
# keep a few frames per run so display_images can show them as a grid.

# Number of preview frames kept per hookpoint; matches the default
# images_per_row of display_images.
FRAMES_PER_ROW = 4

# Make sure the export folder exists before writing the first video.
Path("samples/ablation").mkdir(parents=True, exist_ok=True)

# One entry (a list of preview frames) per hookpoint, in hookpoints order.
all_images = []

for hookpoint in hookpoints:
    # NOTE(review): LTX-Video documents frame counts of the form 8*k + 1;
    # 200 does not fit that form — confirm how the pipeline rounds it.
    video_frames = pipe.run_with_hooks(
        prompt,
        num_inference_steps=25,
        num_frames=200,
        position_hook_dict={hookpoint: AblateHook()},
        # A fresh generator with the same seed for every run, so runs differ
        # only in which hookpoint is ablated.
        generator=torch.Generator(device="cuda").manual_seed(1),
    ).frames[0]

    export_to_video(video_frames, f"samples/ablation/{hookpoint}.mp4")

    # Keep evenly spaced frames as the preview row for this hookpoint.
    # Assumes video_frames supports len() and slicing (e.g. a list of frames)
    # — TODO confirm against run_with_hooks' output type.
    stride = max(1, len(video_frames) // FRAMES_PER_ROW)
    all_images.append(list(video_frames[::stride][:FRAMES_PER_ROW]))

100%|██████████| 25/25 [00:41<00:00,  1.67s/it]
It is recommended to use `export_to_video` with `imageio` and `imageio-ffmpeg` as a backend. 
These libraries are not present in your environment. Attempting to use legacy OpenCV backend to export video. 
Support for the OpenCV backend will be deprecated in a future Diffusers version
100%|██████████| 25/25 [00:38<00:00,  1.54s/it]
It is recommended to use `export_to_video` with `imageio` and `imageio-ffmpeg` as a backend. 
These libraries are not present in your environment. Attempting to use legacy OpenCV backend to export video. 
Support for the OpenCV backend will be deprecated in a future Diffusers version
100%|██████████| 25/25 [00:39<00:00,  1.58s/it]
It is recommended to use `export_to_video` with `imageio` and `imageio-ffmpeg` as a backend. 
These libraries are not present in your environment. Attempting to use legacy OpenCV backend to export video. 
Support for the OpenCV backend will be deprecated in a future Diffusers version
1

In [None]:
def display_images(all_images, hookpoints, images_per_row=4):
    """Show a grid of images with one row per hookpoint.

    Args:
        all_images: Sequence of rows; each row is a sequence of images that
            ``Axes.imshow`` can draw. Only the first ``images_per_row`` images
            of each row are shown.
        hookpoints: Row labels. ``hookpoints[i]`` becomes the title of the
            first image in row ``i``; rows without a matching label are left
            untitled.
        images_per_row: Number of columns in the grid.

    Returns:
        None. The figure is rendered with ``plt.show()``. Nothing is drawn
        when ``all_images`` is empty or ``images_per_row`` is less than 1.
    """
    rows = len(all_images)
    if rows == 0 or images_per_row < 1:
        # plt.subplots rejects a grid with zero rows or columns.
        return

    # squeeze=False keeps `axes` two-dimensional even for a single row or a
    # single column, so axes[i, j] is always valid.
    fig, axes = plt.subplots(
        rows,
        images_per_row,
        figsize=(images_per_row * 3, rows * 3),
        squeeze=False,
    )

    for i, row_images in enumerate(all_images):
        for j, image in enumerate(row_images[:images_per_row]):
            ax = axes[i, j]
            ax.imshow(image)
            if j == 0 and i < len(hookpoints):
                ax.set_title(hookpoints[i])

    # Hide ticks and frames on every cell, including cells that received no
    # image because their row was shorter than images_per_row.
    for ax in axes.flat:
        ax.axis("off")

    # tight_layout computes the spacing between subplots itself, so no manual
    # subplots_adjust call is needed beforehand.
    plt.tight_layout()
    plt.show()


# Draw the grid: one row per entry in all_images, with the first image of each
# row titled by the corresponding hookpoint name.
display_images(all_images, hookpoints)
