# **Basic FramePack Workflow for Image to Video**
- Compute units are required to run this notebook.
- Generating a 5-second video with the default settings took roughly 12 minutes on the L4 GPU.
- You can enable `use_teacache` for faster generation. This optimization reuses computations; it reduces the quality of cartoon characters and backgrounds less than it reduces the quality of realistic scenes.

In [None]:
# @title Setup Environment
from IPython.display import clear_output
# Clone the ComfyUI fork into /content/ComfyUI.
%cd /content
!git clone https://github.com/Isi-dev/ComfyUI
clear_output()
# Clone the two custom-node repositories into ComfyUI's custom_nodes folder.
%cd /content/ComfyUI/custom_nodes
!git clone https://github.com/Isi-dev/ComfyUI_FramePackWrapper
!git clone https://github.com/Isi-dev/ComfyUI_KJNodes
clear_output()
# Install each custom node's requirements from inside its own directory,
# because requirements.txt is referenced by a relative path.
%cd /content/ComfyUI/custom_nodes/ComfyUI_FramePackWrapper
!pip install -r requirements.txt
clear_output()

%cd /content/ComfyUI/custom_nodes/ComfyUI_KJNodes
!pip install -r requirements.txt
clear_output()
# Return to the ComfyUI root. NOTE(review): upload_image below looks for
# uploaded files under /content/ComfyUI, so it presumably relies on this
# being the working directory — confirm.
%cd /content/ComfyUI

import subprocess
import sys

def install_pip_packages():
    """Install the extra pip packages used by this notebook.

    Each package is installed with its own quiet ``pip install`` call so that
    one failing package does not prevent the others from being installed.

    Returns:
        list[str]: Names of the packages whose installation failed; empty when
        every package installed.
    """
    packages = [
        'torchsde',
        'av',
        'diffusers',
        # 'transformers',
        'xformers',
        'accelerate',
        # 'omegaconf',
        # 'tqdm',
        # 'librosa',
        'triton',
        'sageattention',
        'einops'
    ]

    failed = []
    for package in packages:
        try:
            # Run pip install silently (using -q)
            subprocess.run(
                [sys.executable, '-m', 'pip', 'install', '-q', package],
                check=True,
                capture_output=True
            )
        except subprocess.CalledProcessError as e:
            failed.append(package)
            print(f"✗ Error installing {package}: {e.stderr.decode().strip() or 'Unknown error'}")
        else:
            print(f"✓ {package} installed")
    return failed


def install_apt_packages():
    """Install the required apt packages (currently only ``aria2``).

    Returns:
        bool: True when ``apt-get install`` succeeded, False otherwise.
    """
    packages = ['aria2']

    try:
        # Run apt install silently (using -qq)
        subprocess.run(
            ['apt-get', '-y', 'install', '-qq'] + packages,
            check=True,
            capture_output=True
        )
    except subprocess.CalledProcessError as e:
        print(f"✗ Error installing apt packages: {e.stderr.decode().strip() or 'Unknown error'}")
        return False
    print("✓ apt packages installed")
    return True


# Run installations and keep their results: the per-package output is wiped by
# clear_output(), so the final summary has to come from the return values.
print("Installing pip packages...")
failed_pip_packages = install_pip_packages()
clear_output()  # Clear the pip installation output

print("Installing apt packages...")
apt_packages_ok = install_apt_packages()
clear_output()  # Clear the apt installation output

print("Installation completed with status:")
if failed_pip_packages:
    print(f"- Some pip packages had issues: {', '.join(failed_pip_packages)}")
else:
    print("- All pip packages installed successfully")
print("- apt packages installed successfully" if apt_packages_ok else "- apt packages had issues")

import torch
import numpy as np
from PIL import Image
import gc
import os
import torch
import numpy as np
from PIL import Image
import random
import imageio
from google.colab import files
from IPython.display import display, HTML, Image as IPImage
sys.path.insert(0, '/content/ComfyUI')

from comfy import model_management

from nodes import (
    VAELoader,
    CLIPTextEncode,
    ConditioningZeroOut,
    CLIPVisionLoader,
    CLIPVisionEncode,
    VAEEncode,
    VAEDecodeTiled,
    LoadImage,
    DualCLIPLoader,
    ImageScale
)

from custom_nodes.ComfyUI_KJNodes.nodes.image_nodes import (
    ColorMatch,
    GetImageSizeAndCount
)

from custom_nodes.ComfyUI_FramePackWrapper.nodes import (
    LoadFramePackModel,
    FramePackTorchCompileSettings,
    FramePackSampler,
    FramePackFindNearestBucket
)

from pathlib import Path

def model_download(url: str, dest_dir: str, filename: "str | None" = None, silent: bool = True) -> "str | bool":
    """
    Colab-optimized download with aria2c

    Args:
        url: Download URL
        dest_dir: Target directory (will be created if needed)
        filename: Optional output filename (defaults to URL filename)
        silent: If True, suppresses all output (except errors)

    Returns:
        str | bool: The name of the downloaded file (relative to ``dest_dir``)
        if successful, False if the download failed. Callers pass the returned
        name on to the model loaders, so a failure must be checked for.
    """
    try:
        # Create destination directory
        Path(dest_dir).mkdir(parents=True, exist_ok=True)

        # Set filename if not specified
        if filename is None:
            filename = url.split('/')[-1].split('?')[0]  # Remove URL parameters

        # Build command
        cmd = [
            'aria2c',
            '--console-log-level=error',
            '-c', '-x', '16', '-s', '16', '-k', '1M',
            '-d', dest_dir,
            '-o', filename,
            url
        ]

        # Add silent flags if requested
        if silent:
            cmd.extend(['--summary-interval=0', '--quiet'])
            print(f"Downloading {filename}...", end=' ', flush=True)

        # Run download; check=True turns a non-zero exit status into
        # CalledProcessError, which is handled below.
        subprocess.run(cmd, check=True, capture_output=True, text=True)

        if silent:
            print("Done!")
        else:
            print(f"Downloaded {filename} to {dest_dir}")
        return filename

    except subprocess.CalledProcessError as e:
        # Prefer stderr, but fall back to stdout so a message the tool wrote
        # there is not replaced by "Unknown error".
        error = (e.stderr or "").strip() or (e.stdout or "").strip() or "Unknown error"
        print(f"\nError downloading {filename}: {error}")
        return False
    except Exception as e:
        # Covers e.g. a missing aria2c binary or an unwritable dest_dir.
        print(f"\nError: {str(e)}")
        return False


# Fetch every model file the pipeline needs. Each variable receives whatever
# model_download returns for that file and is read later by generate_video.

# Text encoder: fp8-scaled LLaVA-LLaMA3.
llama = model_download(
    url="https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged/resolve/main/split_files/text_encoders/llava_llama3_fp8_scaled.safetensors",
    dest_dir="/content/ComfyUI/models/text_encoders",
)
# fp16 alternative to the text encoder above:
# llama = model_download("https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged/resolve/main/split_files/text_encoders/llava_llama3_fp16.safetensors", "/content/ComfyUI/models/text_encoders")

# Text encoder: CLIP-L.
clip_l = model_download(
    url="https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged/resolve/main/split_files/text_encoders/clip_l.safetensors",
    dest_dir="/content/ComfyUI/models/text_encoders",
)

# Video VAE (bf16).
vae_model = model_download(
    url="https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged/resolve/main/split_files/vae/hunyuan_video_vae_bf16.safetensors",
    dest_dir="/content/ComfyUI/models/vae",
)

# CLIP vision encoder used for the image embedding.
clip_vision_model = model_download(
    url="https://huggingface.co/Comfy-Org/sigclip_vision_384/resolve/main/sigclip_vision_patch14_384.safetensors",
    dest_dir="/content/ComfyUI/models/clip_vision",
)

# FramePack image-to-video diffusion model (fp8 e4m3fn weights).
dit_model = model_download(
    url="https://huggingface.co/Kijai/HunyuanVideo_comfy/resolve/main/FramePackI2V_HY_fp8_e4m3fn.safetensors",
    dest_dir="/content/ComfyUI/models/diffusion_models",
)


def clear_memory():
    """Release unreferenced Python objects and cached CUDA memory.

    Runs the garbage collector, then, when CUDA is available, empties the
    PyTorch CUDA cache and calls ``torch.cuda.ipc_collect()``, and finally
    runs the garbage collector once more.

    This can only free memory that nothing references any more; callers must
    drop (``del``) their own references to models and tensors beforehand.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    # No sweep over globals() here: `del` applied to a loop variable only
    # unbinds that local name and cannot remove or free a global tensor.
    gc.collect()

def save_as_mp4(images, filename_prefix, fps, output_dir="/content/ComfyUI/output"):
    """Write a sequence of frames to ``<output_dir>/<filename_prefix>.mp4``.

    Args:
        images: Iterable of frames; each frame must support ``.cpu().numpy()``.
            Frames are scaled by 255, so values are presumably floats in
            [0, 1] — confirm against the caller.
        filename_prefix: Output file name without the ``.mp4`` extension.
        fps: Frame rate passed to ``imageio.get_writer``.
        output_dir: Directory to write into; created if it does not exist.

    Returns:
        str: Path of the written video file.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/{filename_prefix}.mp4"

    with imageio.get_writer(output_path, fps=fps) as writer:
        # Convert one frame at a time so the whole video is never held in
        # memory twice (tensor form plus uint8 form).
        for img in images:
            scaled = img.cpu().numpy() * 255
            # Clip before the cast: out-of-range values would otherwise wrap
            # around in uint8 and show up as speckle artifacts.
            writer.append_data(np.clip(scaled, 0, 255).astype(np.uint8))

    return output_path

def upload_image():
    """Ask the user to upload a file in Colab and move it into ComfyUI's input folder.

    Only the first uploaded file is handled.

    Returns:
        The destination path of the moved file, or None when nothing was uploaded.
    """
    from google.colab import files
    import os
    import shutil

    os.makedirs('/content/ComfyUI/input', exist_ok=True)

    uploaded = files.upload()

    # Pick the first uploaded name, if any; the rest are left where they are.
    first_name = next(iter(uploaded.keys()), None)
    if first_name is None:
        return None

    source = f'/content/ComfyUI/{first_name}'
    target = f'/content/ComfyUI/input/{first_name}'
    shutil.move(source, target)
    print(f"Image saved to: {target}")
    return target


def generate_video(
    image_path: str = None,
    prompt: str = "The anime girl dances from side to side with fast motion, high action.",
    width: int = 512,
    height: int = 512,
    base_resolution: int = 640,
    seed: int = 31728721979046,
    steps: int = 30,
    cfg_scale: float = 1.0,
    frame_rate: int = 24,
    latent_window_size: int = 9,
    total_seconds: float = 5.0,
    sampler_name: str = "unipc_bh1",
    shift: float = 6.0,
    guidance_scale: float = 10.0,
    embed_interpolation: str = "linear",
    start_embed_strength: float = 0.5,
    denoise_strength: float = 1.0,
    use_teacache: bool = True,
    teacache_rel_l1_thresh: float = 0.15,
    gpu_memory_preservation: float = 6.0
):
    """Run the FramePack image-to-video pipeline and display the resulting MP4.

    Stages, in order: encode the prompt, load (or ask the user to upload) the
    start image, scale it to the nearest FramePack bucket, encode it with CLIP
    vision and the VAE, sample with FramePackSampler, decode with the tiled
    VAE decoder, color-match against the input image, then save and display
    the video. Models are loaded from the module-level names ``clip_l``,
    ``llama``, ``clip_vision_model``, ``vae_model`` and ``dit_model``.

    Args:
        image_path: Image to animate. When None, the user is asked to upload
            one; if nothing is uploaded the function prints a message and
            returns without generating anything.
        prompt: Text encoded as the positive conditioning. The negative
            conditioning is the zeroed-out positive conditioning.
        width, height: Accepted for compatibility but overwritten by the
            bucket size computed from the image and ``base_resolution``.
        base_resolution: Passed to ``FramePackFindNearestBucket.process``.
        frame_rate: Frames per second of the saved MP4.
        seed, steps, cfg_scale, latent_window_size, total_seconds,
        sampler_name, shift, guidance_scale, embed_interpolation,
        start_embed_strength, denoise_strength, use_teacache,
        teacache_rel_l1_thresh, gpu_memory_preservation: Passed through to
            ``FramePackSampler.process`` (``cfg_scale`` as ``cfg``,
            ``total_seconds`` as ``total_second_length``, ``sampler_name`` as
            ``sampler``).

    Returns:
        None. The video is written by ``save_as_mp4`` with the prefix
        ``"ComfyUI"`` and shown with ``display_video``.

    Raises:
        Exception: Any error raised while decoding, color matching, saving or
            displaying is printed and re-raised; ``clear_memory`` runs either
            way.
    """
    with torch.inference_mode():

        print("Loading Text encoders...")
        clip_loader = DualCLIPLoader()
        clip = clip_loader.load_clip(
            clip_l,
            llama,
            "hunyuan_video"
        )[0]

        text_encoder = CLIPTextEncode()
        positive = text_encoder.encode(clip, prompt)[0]

        # The text encoders are no longer needed once the prompt is encoded.
        del clip
        torch.cuda.empty_cache()
        gc.collect()

        zero_out = ConditioningZeroOut()
        negative = zero_out.zero_out(positive)[0]

        if image_path is None:
            print("Please upload an image file:")
            image_path = upload_image()
        if image_path is None:
            # Without an image there is nothing to animate; stop here instead
            # of passing None on to LoadImage.
            print("No image uploaded!")
            return
        load_image = LoadImage()
        loaded_image = load_image.load_image(image_path)[0]

        # The requested width/height are replaced by the nearest bucket size.
        bucket_finder = FramePackFindNearestBucket()
        width, height = bucket_finder.process(loaded_image, base_resolution)[:2]

        # Scale image to bucket size
        print(f"Scaling image to {width}x{height}...")
        image_scaler = ImageScale()
        scaled_image = image_scaler.upscale(
            loaded_image,
            "lanczos",
            width,
            height,
            "disabled"
        )[0]

        # Get image size info; only the image itself (first result) is used.
        print("Getting image dimensions...")
        size_getter = GetImageSizeAndCount()
        size_output = size_getter.getsize(image=scaled_image)
        processed_image = size_output["result"][0]

        print("Loading CLIP vision model...")
        vision_loader = CLIPVisionLoader()
        clip_vision = vision_loader.load_clip(clip_vision_model)[0]

        vision_encoder = CLIPVisionEncode()
        clip_vision_output = vision_encoder.encode(clip_vision, processed_image, "center")[0]

        del clip_vision
        torch.cuda.empty_cache()
        gc.collect()

        # Load VAE
        print("Loading VAE...")
        vae_loader = VAELoader()
        vae = vae_loader.load_vae(vae_model)[0]

        vae_encoder = VAEEncode()
        latent = vae_encoder.encode(vae, processed_image)[0]

        compile_settings = FramePackTorchCompileSettings()
        compile_args = compile_settings.loadmodel(
            "inductor",
            False,
            "default",
            False,
            64,
            True,
            True
        )[0]

        # Load FramePack model
        print("Loading FramePack model...")
        model_loader = LoadFramePackModel()
        model = model_loader.loadmodel(
            dit_model,
            "bf16",
            "fp8_e4m3fn",
            compile_args,
            "sageattn"
        )[0]

        # Run FramePack sampler
        print("Running FramePack Video Generation...")
        sampler = FramePackSampler()
        samples = sampler.process(
            model=model,
            shift=shift,  # Shift parameter
            positive=positive,
            negative=negative,
            latent_window_size=latent_window_size,
            use_teacache=use_teacache,
            total_second_length=total_seconds,
            teacache_rel_l1_thresh=teacache_rel_l1_thresh,
            image_embeds=clip_vision_output,
            steps=steps,
            cfg=cfg_scale,
            guidance_scale=guidance_scale,
            seed=seed,
            sampler=sampler_name,
            gpu_memory_preservation=gpu_memory_preservation,
            start_latent=latent,
            end_latent=None,
            end_image_embeds=None,
            embed_interpolation=embed_interpolation,
            start_embed_strength=start_embed_strength,
            initial_samples=None,
            denoise_strength=denoise_strength
        )[0]

        # Drop the diffusion model before decoding.
        del model
        torch.cuda.empty_cache()
        gc.collect()

        try:

            print("Decoding video frames...")
            vae_decoder = VAEDecodeTiled()
            decoded_frames = vae_decoder.decode(
                vae,
                samples,
                256,
                64,
                64,
                8
            )[0]

            del vae
            torch.cuda.empty_cache()
            gc.collect()

            base_name = "ComfyUI"

            # Color match with reference image
            print("Color matching frames...")
            color_matcher = ColorMatch()
            color_matched = color_matcher.colormatch(
                image_ref=loaded_image,
                image_target=decoded_frames,
                method="mkl",
                strength=1
            )[0]

            print("Saving as MP4...")
            output_path = save_as_mp4(color_matched, base_name, frame_rate)
            display_video(output_path)
        except Exception as e:
            print(f"Error during decoding/saving: {str(e)}")
            raise
        finally:
            clear_memory()

def display_video(video_path):
    """Embed a video file in the notebook output as a base64 data URL.

    The MIME type is chosen from the file extension (``.mp4``, ``.webm`` or
    ``.webp``, case-insensitive) and defaults to ``video/mp4``.

    Args:
        video_path: Path of the file to embed. The whole file is read into
            memory and base64-encoded into the HTML.
    """
    from IPython.display import HTML
    from base64 import b64encode

    # Use a context manager so the file handle is closed right after reading.
    with open(video_path, 'rb') as video_file:
        video_data = video_file.read()

    # Determine MIME type based on file extension
    lowered_path = video_path.lower()
    mime_type = "video/mp4"  # default
    for suffix, candidate in (
        ('.mp4', "video/mp4"),
        ('.webm', "video/webm"),
        ('.webp', "image/webp"),
    ):
        if lowered_path.endswith(suffix):
            mime_type = candidate
            break

    data_url = f"data:{mime_type};base64," + b64encode(video_data).decode()

    display(HTML(f"""
    <video width=512 controls autoplay loop>
        <source src="{data_url}" type="{mime_type}">
    </video>
    """))

# Printed when execution reaches the end of the setup cell. It does not check
# that the installs or model downloads above actually succeeded.
print("✅ Environment Setup Complete!")



In [2]:
# @title Generate Video


# @markdown ---
# @markdown ### Video Settings
positive_prompt = "The anime girl dances flamboyantly, swinging her hips and striking bold poses with dramatic flair" # @param {"type":"string"}
width = 512 # @param {"type":"number"}
height = 512 # @param {"type":"number"}
base_resolution = 512 # @param {"type":"slider","min":64,"max":2048,"step":16}
seed = 0 # @param {"type":"number"}
steps = 20 # @param {"type":"slider","min":0,"max":100,"step":1}
cfg_scale = 1 # @param {"type":"slider","min":1,"max":20,"step":0.1}
latent_window_size = 9 # @param {"type":"slider","min":1,"max":33,"step":1}
sampler_name = "unipc_bh1" # @param ["unipc_bh1","unipc_bh2","uni_pc", "euler", "dpmpp_2m", "ddim", "lms"]
# scheduler = "simple" # @param ["simple", "normal", "karras", "exponential"]
total_seconds = 5 # @param {"type":"integer", "min":1, "max":120}
shift = 6.0 # @param {"type":"slider","min":0,"max":1000,"step":0.01}
guidance_scale = 10 # @param {"type":"slider","min":0,"max":32,"step":0.01}
embed_interpolation = "linear" # @param ["linear","weighted_average"]
start_embed_strength = 0.5 # @param {"type":"slider","min":0,"max":1,"step":0.01}
denoise_strength = 1.0 # @param {"type":"slider","min":0,"max":1,"step":0.01}
use_teacache = False # @param {type:"boolean"}
teacache_rel_l1_thresh = 0.15 # @param {"type":"slider","min":0.0,"max":1.0,"step":0.01}
gpu_memory_preservation = 6.0 # @param {"type":"slider","min":0.0,"max":128.0,"step":0.1}

# A seed of 0 means "pick a random 32-bit seed"; any other value is used as
# given. The chosen seed is printed so a run can be reproduced.
import random
seed = seed if seed != 0 else random.randint(0, 2**32 - 1)
print(f"Using seed: {seed}")


# image_path=None makes generate_video ask for an image upload.
# width/height are forwarded, but generate_video overwrites them with the
# bucket size it derives from the image and base_resolution.
# frame_rate is fixed at 24 here rather than exposed as a form field.
generate_video(
    image_path=None,
    prompt=positive_prompt,
    width=width,
    height=height,
    base_resolution = base_resolution,
    seed=seed,
    steps=steps,
    cfg_scale=cfg_scale,
    frame_rate = 24,
    latent_window_size = latent_window_size,
    total_seconds = total_seconds,
    sampler_name = sampler_name,
    shift = shift,
    guidance_scale = guidance_scale,
    embed_interpolation = embed_interpolation,
    start_embed_strength = start_embed_strength,
    denoise_strength = denoise_strength,
    use_teacache = use_teacache,
    teacache_rel_l1_thresh = teacache_rel_l1_thresh,
    gpu_memory_preservation = gpu_memory_preservation
)

# Extra cleanup after the run, in addition to the one inside generate_video.
clear_memory()





Using seed: 2676343448
Loading Text encoders...




Please upload an image file:


Saving flux_dev_example.png to flux_dev_example.png
Image saved to: /content/ComfyUI/input/flux_dev_example.png
Resolution: 480 x 512
Scaling image to 480x512...
Getting image dimensions...
Loading CLIP vision model...
Loading VAE...
Loading FramePack model...
Using accelerate to load and assign model weights to device...


Loading transformer parameters to cpu: 100%|██████████| 1274/1274 [00:00<00:00, 10644.89it/s]


Running FramePack Video Generation...
total_latent_sections:  4
start_latent torch.Size([1, 16, 1, 64, 60])
Moving DynamicSwap_HunyuanVideoTransformer3DModelPacked to cuda:0 with preserved memory: 6.0 GB
latent_padding: 3
latent_padding_size = 27, is_last_section = False, is_first_section = True


  0%|          | 0/20 [00:00<?, ?it/s]

W0424 18:32:19.875000 720 torch/_inductor/utils.py:1137] [0/0] Not enough SMs to use max_autotune_gemm mode
W0424 18:32:49.505000 720 torch/_dynamo/convert_frame.py:906] [0/8] torch._dynamo hit config.cache_size_limit (8)
W0424 18:32:49.505000 720 torch/_dynamo/convert_frame.py:906] [0/8]    function: 'forward' (/content/ComfyUI/custom_nodes/ComfyUI_FramePackWrapper/diffusers_helper/models/hunyuan_video_packed.py:623)
W0424 18:32:49.505000 720 torch/_dynamo/convert_frame.py:906] [0/8]    last reason: 0/0: ___check_type_id(L['self'], 997604448)                      
W0424 18:32:49.505000 720 torch/_dynamo/convert_frame.py:906] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W0424 18:32:49.505000 720 torch/_dynamo/convert_frame.py:906] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.
W0424 18:33:08.018000 720 torch/_dynamo/convert_frame.py:906] [1/8] torch._dynamo hit config.cache_size_limit (8)
W0424 18:3

latent_padding: 2
latent_padding_size = 18, is_last_section = False, is_first_section = False


  0%|          | 0/20 [00:00<?, ?it/s]

latent_padding: 1
latent_padding_size = 9, is_last_section = False, is_first_section = False


  0%|          | 0/20 [00:00<?, ?it/s]

latent_padding: 0
latent_padding_size = 0, is_last_section = True, is_first_section = False


  0%|          | 0/20 [00:00<?, ?it/s]

Decoding video frames...
Color matching frames...
Saving as MP4...
