# 🎬 Stable Video Diffusion

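Generate high-quality videos from text descriptions using Stability AI's Stable Video Diffusion (SVD) model. Note that SVD itself is an image-to-video model: it animates a conditioning image rather than reading text directly. This notebook is intended to run in Colab's free tier (select a GPU runtime) and requires no API key.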

## Model Information
This notebook supports both SVD and SVD-XT models:

- **SVD (Default)**: Original model, good for most use cases
  - 14 frames at 1024x576 resolution
  - Consistent motion and quality
  - Faster generation time

- **SVD-XT**: Extended version with improvements
  - Up to 25 frames at 1024x576
  - Better motion consistency
  - Improved visual quality
  - Support for longer sequences

## Features
- Text-to-video generation
- Multiple video styles
- Adjustable generation parameters
- Frame interpolation for smooth motion
- Support for both SVD and SVD-XT models

## Setup
First, let's install the required packages:

In [None]:
# Install the notebook's dependencies. The leading "!" runs the command in the
# notebook's shell instead of as Python, and -q keeps pip's output quiet.
!pip install -q diffusers transformers accelerate torch gradio moviepy

## Import Dependencies

In [None]:
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import export_to_video
import gradio as gr
import os

## Load Models

In [None]:
# Hugging Face Hub repository for each selectable checkpoint, keyed by the
# short label used throughout this notebook. The XT repository name is the
# base repository name with an "-xt" suffix.
_SVD_REPO = "stabilityai/stable-video-diffusion-img2vid"
MODELS = {
    "SVD": _SVD_REPO,
    "SVD-XT": f"{_SVD_REPO}-xt",
}

def load_model(model_name="SVD"):
    """Load a Stable Video Diffusion pipeline in fp16 with CPU offloading.

    Args:
        model_name: Key into ``MODELS`` ("SVD" or "SVD-XT").

    Returns:
        The configured ``StableVideoDiffusionPipeline``. It is tagged with a
        ``_model_name`` attribute so callers can tell which checkpoint is
        currently loaded.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODELS``.
        RuntimeError: If no CUDA device is available.
    """
    repo_id = MODELS[model_name]

    # Fail fast with an actionable message before downloading several GB of
    # weights that cannot be used without a GPU.
    if not torch.cuda.is_available():
        raise RuntimeError(
            "Stable Video Diffusion needs a CUDA GPU. In Colab, choose "
            "Runtime > Change runtime type > GPU and re-run this cell."
        )

    pipe = StableVideoDiffusionPipeline.from_pretrained(
        repo_id,
        torch_dtype=torch.float16,
        variant="fp16",
    )

    # Do NOT call pipe.to("cuda") first: model CPU offload manages device
    # placement itself, and moving the whole pipeline to the GPU beforehand
    # throws away most of the memory savings.
    pipe.enable_model_cpu_offload()

    # VAE slicing may not be exposed on the video pipeline in every diffusers
    # release; enable it only when present instead of failing the model load.
    if hasattr(pipe, "enable_vae_slicing"):
        pipe.enable_vae_slicing()

    # Record which checkpoint this is; generate_video compares against it.
    pipe._model_name = model_name
    return pipe

# Build the default ("SVD") pipeline once when this cell runs.
pipe = load_model()

# Report which kind of device torch can see.
device_label = "GPU" if torch.cuda.is_available() else "CPU"
print(f"Model loaded on: {device_label}")

## Create Generation Function

In [None]:
# Checkpoint used to turn the text prompt into the conditioning frame.
TEXT_TO_IMAGE_MODEL = "stabilityai/sd-turbo"

# Lazily created text-to-image pipeline (see _prompt_to_image).
_text_to_image_pipe = None


def _prompt_to_image(prompt, generator=None):
    """Render one 1024x576 conditioning frame from a text prompt."""
    global _text_to_image_pipe

    if _text_to_image_pipe is None:
        # Imported here so the rest of the notebook's imports stay untouched.
        from diffusers import AutoPipelineForText2Image

        t2i = AutoPipelineForText2Image.from_pretrained(
            TEXT_TO_IMAGE_MODEL,
            torch_dtype=torch.float16,
            variant="fp16",
        )
        # Keep the GPU free for the video model between calls.
        t2i.enable_model_cpu_offload()
        _text_to_image_pipe = t2i

    # SD-Turbo is a distilled model: a single step without classifier-free
    # guidance is its documented usage.
    return _text_to_image_pipe(
        prompt=prompt,
        num_inference_steps=1,
        guidance_scale=0.0,
        width=1024,
        height=576,
        generator=generator,
    ).images[0]


def generate_video(prompt, model_name="SVD", num_frames=14, fps=8,
                   motion_bucket_id=127, noise_aug_strength=0.1,
                   min_guidance_scale=1.0, max_guidance_scale=3.0,
                   seed=-1, decode_chunk_size=8):
    """Generate a short video for ``prompt`` and return the MP4 file path.

    Stable Video Diffusion is an image-to-video model, so a text prompt is
    first rendered to a still image, which is then animated.

    Args:
        prompt: Text description of the video. A non-string value (e.g. a
            PIL image) is passed to the video pipeline unchanged as the
            conditioning image.
        model_name: "SVD" or "SVD-XT"; the global pipeline is reloaded when
            this differs from the currently loaded checkpoint.
        num_frames: Requested frame count; capped at 14 for SVD and 25 for
            SVD-XT.
        fps: Frame rate of the exported video file.
        motion_bucket_id: Motion conditioning value forwarded to the pipeline.
        noise_aug_strength: Noise added to the conditioning image.
        min_guidance_scale: Guidance scale forwarded to the pipeline.
        max_guidance_scale: Guidance scale forwarded to the pipeline.
        seed: Integer seed for reproducible output; -1 or None means random.
        decode_chunk_size: Number of frames decoded at once; smaller values
            use less GPU memory.

    Returns:
        Path of the written ``.mp4`` file.

    Raises:
        ValueError: If ``prompt`` is an empty or whitespace-only string.
    """
    global pipe

    if isinstance(prompt, str) and not prompt.strip():
        raise ValueError("Please enter a prompt describing the video.")

    # Load selected model if different from current
    if pipe is None or model_name != getattr(pipe, "_model_name", "SVD"):
        import gc

        # Drop the old pipeline BEFORE loading the new one so both sets of
        # weights are never resident at the same time.
        pipe = None
        gc.collect()
        torch.cuda.empty_cache()
        pipe = load_model(model_name)
        pipe._model_name = model_name

    # An explicit generator keeps seeding local to this request instead of
    # mutating torch's global RNG state. UI widgets may deliver floats.
    generator = None
    if seed is not None and int(seed) != -1:
        generator = torch.Generator(device="cpu").manual_seed(int(seed))

    # Adjust frames based on model
    frame_caps = {"SVD": 14, "SVD-XT": 25}
    num_frames = int(num_frames)
    if model_name in frame_caps:
        num_frames = min(num_frames, frame_caps[model_name])

    # The video pipeline requires an image; passing the raw text would be
    # rejected by its input validation.
    if isinstance(prompt, str):
        image = _prompt_to_image(prompt, generator=generator)
    else:
        image = prompt

    # Generate video frames
    frames = pipe(
        image,
        num_inference_steps=25,
        num_frames=num_frames,
        motion_bucket_id=int(motion_bucket_id),
        noise_aug_strength=noise_aug_strength,
        min_guidance_scale=min_guidance_scale,
        max_guidance_scale=max_guidance_scale,
        decode_chunk_size=int(decode_chunk_size),
        generator=generator,
    ).frames[0]

    # Write to a unique file so simultaneous users of the shared link do not
    # overwrite each other's results.
    import tempfile

    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        output_path = tmp.name
    export_to_video(frames, output_path, fps=int(fps))

    return output_path

## Create Gradio Interface

In [None]:
# Build the UI: controls in the left column, the resulting video on the right.
with gr.Blocks() as interface:
    gr.Markdown(
        """
        # Stable Video Diffusion
        Generate videos from text descriptions using Stability AI's Stable Video Diffusion model.
        Choose between SVD (original) and SVD-XT (extended) models.
        """
    )

    with gr.Row():
        with gr.Column(scale=2):
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Describe the video you want to generate...",
                lines=3
            )

            model_choice = gr.Radio(
                choices=["SVD", "SVD-XT"],
                value="SVD",
                label="Model",
                info="SVD-XT supports longer sequences but may be slower"
            )

            with gr.Row():
                num_frames = gr.Slider(
                    minimum=8,
                    maximum=25,
                    value=14,
                    step=1,
                    label="Number of Frames",
                    info="SVD: max 14, SVD-XT: max 25"
                )
                fps = gr.Slider(
                    minimum=4,
                    maximum=30,
                    value=8,
                    step=1,
                    label="FPS",
                    info="Frames per second"
                )

            with gr.Accordion("Advanced Settings", open=False):
                motion_bucket_id = gr.Slider(
                    minimum=1,
                    maximum=255,
                    value=127,
                    step=1,
                    label="Motion Strength",
                    info="Higher values create more motion"
                )
                noise_aug_strength = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.1,
                    step=0.1,
                    label="Noise Strength",
                    info="Higher values create more varied motion"
                )
                with gr.Row():
                    min_guidance = gr.Slider(
                        minimum=1.0,
                        maximum=10.0,
                        value=1.0,
                        step=0.5,
                        label="Min Guidance Scale"
                    )
                    max_guidance = gr.Slider(
                        minimum=1.0,
                        maximum=10.0,
                        value=3.0,
                        step=0.5,
                        label="Max Guidance Scale"
                    )
                # precision=0 makes the field return an int rather than a float.
                seed = gr.Number(
                    value=-1,
                    precision=0,
                    label="Random Seed",
                    info="Set to -1 for random results"
                )

            generate_button = gr.Button("Generate Video", variant="primary")

        with gr.Column(scale=2):
            output = gr.Video(label="Generated Video")

    # Order must match the positional parameters of generate_video.
    inputs = [prompt, model_choice, num_frames, fps, motion_bucket_id,
              noise_aug_strength, min_guidance, max_guidance, seed]

    gr.Examples([
        ["A serene lake at sunset with gentle ripples on the water", "SVD", 14, 8, 127, 0.1, 1.0, 3.0, -1],
        ["A blooming flower opening its petals in timelapse", "SVD-XT", 25, 12, 200, 0.2, 1.0, 3.0, -1],
        ["A space shuttle launching into the sky with smoke trails", "SVD-XT", 20, 15, 255, 0.3, 1.0, 3.0, -1]
    ], inputs)

    # Wire the already-rendered components to the generator with an event
    # listener. Wrapping them in gr.Interface here would try to render the same
    # components a second time inside this Blocks context.
    generate_button.click(fn=generate_video, inputs=inputs, outputs=output)

# Queue requests so long-running generations are not cut off by request
# timeouts, then expose the app through a public share link.
interface.queue().launch(share=True)