## Mount drive

In [None]:
# pick some other OUTPUT_DIR if you don't trust this notebook to write to your drive

import os
from google.colab import drive

# Attach Google Drive at /content/drive; the folders below live on it.
drive.mount("/content/drive")

# Working folders for this notebook. Both are created if they do not exist yet.
INPUT_DIR = "/content/drive/MyDrive/Wan2.1/inputs"
OUTPUT_DIR = "/content/drive/MyDrive/Wan2.1/outputs"
for _workdir in (INPUT_DIR, OUTPUT_DIR):
    os.makedirs(_workdir, exist_ok=True)

## Install libraries

In [None]:
# Install diffusers straight from its GitHub repository instead of a PyPI release.
!pip install git+https://github.com/huggingface/diffusers.git
# Install (or upgrade to the latest) bitsandbytes; the pipeline cell loads the
# transformer through BitsAndBytesConfig with load_in_4bit=True.
!pip install -U bitsandbytes
# NOTE(review): ftfy is never imported in this notebook — presumably a runtime
# dependency of the Wan pipeline's prompt handling; confirm before removing.
!pip install ftfy

## Assemble pipeline

In [None]:
# for optimizations, see: https://huggingface.co/blog/video_gen

import torch
from diffusers import (
    BitsAndBytesConfig,
    WanImageToVideoPipeline,
    WanTransformer3DModel
)
from diffusers.hooks import apply_layerwise_casting
from diffusers.utils import export_to_video
from transformers import AutoTokenizer, T5EncoderModel

# Hugging Face Hub repository that holds every pipeline component.
model_id = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
# Load the transformer weights in 4-bit via bitsandbytes, computing in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
)
# Load only the "transformer" subfolder so the quantization config applies to
# that component alone.
transformer = WanTransformer3DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=quantization_config,
)
# Build the full image-to-video pipeline around the pre-quantized transformer;
# the remaining components are loaded from the same repository.
# NOTE(review): no torch_dtype is passed here, so the other components load in
# the library's default precision (presumably float32) — confirm this is
# intended, since it affects memory use.
pipe = WanImageToVideoPipeline.from_pretrained(
    model_id,
    transformer=transformer,
)
# Enable diffusers' model-level CPU offloading for the pipeline (see the
# diffusers memory-optimization docs for the exact behaviour).
pipe.enable_model_cpu_offload()

## Define functions

In [None]:
# for details, see https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan_i2v.py

from PIL import Image

def render(
    filename,
    prompt,
    seed=0,
    image=None,
    width=832,
    height=480,
    num_frames=81,
    num_inference_steps=30,
    guidance_scale=5.0,
    fps=16,
    extract_frames=False
):
    """Run the image-to-video pipeline and write the result to ``filename``.

    The input image is center-cropped to the ``width``/``height`` aspect ratio
    and resized to exactly ``width`` x ``height`` before it is passed to
    ``pipe``. If ``filename`` already exists the call returns immediately, so
    a batch can be re-run without regenerating finished clips.

    Args:
        filename: Path of the video file to write.
        prompt: Text prompt passed to the pipeline.
        seed: Seed for the ``torch.Generator`` handed to the pipeline.
        image: PIL image used as the pipeline's input image. Required
            whenever ``filename`` does not exist yet. The caller's image
            object is left unmodified.
        width: Target frame width in pixels.
        height: Target frame height in pixels.
        num_frames: Number of frames requested from the pipeline.
        num_inference_steps: Number of inference steps passed to the pipeline.
        guidance_scale: Guidance scale passed to the pipeline.
        fps: Frame rate used when exporting the video file.
        extract_frames: If true, also save every frame as
            ``<filename without extension>/<index:08d>.png``.

    Raises:
        ValueError: If ``image`` is ``None`` and the video must be generated.
    """
    if os.path.exists(filename):
        return
    if image is None:
        raise ValueError("render() requires an input image, got image=None")
    print(filename)

    # PIL's crop() and resize() return NEW images; the results must be rebound,
    # otherwise the pipeline receives the untouched original.
    ratio = width / height
    image_width, image_height = image.size
    image_ratio = image_width / image_height
    if image_ratio > ratio:
        # Too wide: trim equal margins from the left and right.
        w = int(image_height * ratio)
        c = (image_width - w) // 2
        image = image.crop((c, 0, c + w, image_height))
    elif image_ratio < ratio:
        # Too tall: trim equal margins from the top and bottom.
        h = int(image_width / ratio)
        c = (image_height - h) // 2
        image = image.crop((0, c, image_width, c + h))
    image = image.resize((width, height), Image.LANCZOS)

    # Create the output directory before the (long) pipeline run so that an
    # unusable path fails early. A bare file name has no directory component,
    # and os.makedirs("") would raise.
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    video = pipe(
        image=image,
        prompt=prompt,
        generator=torch.Generator(device=pipe.device).manual_seed(seed),
        width=width,
        height=height,
        num_frames=num_frames,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
    ).frames[0]
    export_to_video(video, filename, fps=fps)

    if extract_frames:
        # Frames go into a directory named like the video minus its extension
        # (whatever the extension's length).
        dirname = os.path.splitext(filename)[0]
        os.makedirs(dirname, exist_ok=True)
        for i, frame in enumerate(video):
            frame.save(f"{dirname}/{i:08d}.png")


## Test

In [None]:
# bring your own images and prompts

src_dirname = "/content/drive/MyDrive/FLUX.1/outputs/dev/test/random_words_1280x720"

# List the source folder once; every lookup below works on this in-memory list
# instead of re-scanning the Drive directory for each id.
src_filenames = os.listdir(src_dirname)


def _first_with_suffix(suffix):
    """Return the first name in ``src_filenames`` ending with ``suffix``."""
    for name in src_filenames:
        if name.endswith(suffix):
            return name
    raise FileNotFoundError(f"no file ending with {suffix!r} in {src_dirname}")


# Source files are named "<anything>,<id>.txt" (prompt) and "<anything>,<id>.png"
# (image); the integer after the last comma is the id.
ids = sorted(
    int(name.split(",")[-1][:-4]) for name in src_filenames if name.endswith(".txt")
)
prompts = {image_id: _first_with_suffix(f",{image_id}.txt") for image_id in ids}
images = {image_id: _first_with_suffix(f",{image_id}.png") for image_id in ids}

num_frames = 81
n = 3  # number of consecutive segments rendered per id
for image_id in ids:
    # The prompt is the same for every segment of an id: read it once and
    # close the file deterministically.
    with open(f"{src_dirname}/{prompts[image_id]}") as prompt_file:
        prompt = prompt_file.read()
    for i in range(n):
        if i == 0:
            # The first segment starts from the source image.
            image = Image.open(f"{src_dirname}/{images[image_id]}")
        else:
            # Later segments start from the last frame extracted from the
            # previous segment (written by render(..., extract_frames=True)).
            image = Image.open(
                f"{OUTPUT_DIR}/random_words/{image_id},{i - 1}/{num_frames - 1:08d}.png"
            )
        render(
            filename=f"{OUTPUT_DIR}/random_words/{image_id},{i}.mp4",
            prompt=prompt,
            seed=image_id + i,
            image=image,
            num_frames=num_frames,
            extract_frames=True
        )

# this will use ~21 GB of VRAM and take ~150 seconds per step
# (with 30 steps, that's ~75 minutes for 5 seconds of video)
