In [None]:
import os
import torch
import random
import torch.nn as nn
import torch.backends.cudnn as cudnn
import numpy as np
from models import build_model
from PIL import Image
from IPython.display import Image as ipython_image
from diffusers.utils import load_image, export_to_video, export_to_gif

#### Build the Video-LaVIT Model and Load the Checkpoint

In [None]:
# Local directory that holds the Video-LaVIT checkpoint.
# Set the VIDEO_LAVIT_MODEL_PATH environment variable to point somewhere else without
# editing the notebook; the original hard-coded location is kept as the default.
model_path = os.environ.get("VIDEO_LAVIT_MODEL_PATH", "/home/jinyang06/models/VideoLaVIT-v1")
model_dtype = 'fp16'

# Seed every RNG library this notebook imports (random, numpy, torch) so that
# repeated runs start from the same random state.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Select the GPU the model is loaded on
device_id = 0
torch.cuda.set_device(device_id)
device = torch.device('cuda')

# If xformers is installed, set `use_xformers=True` to save GPU memory (xformers is not supported on V100 GPUs)
# If the checkpoint is already downloaded, set `local_files_only=True` to avoid auto-downloading from remote
model = build_model(model_path=model_path, model_dtype=model_dtype, local_files_only=True,
                device_id=device_id, use_xformers=True, understanding=False,)
model = model.to(device)

print("Building Model Finished")
# Autocast dtype used by the generation cells: bfloat16 only when model_dtype is "bf16", float16 otherwise
torch_dtype = torch.bfloat16 if model_dtype=="bf16" else torch.float16

### Text-to-Video Generation

Video-LaVIT is trained on the open-source WebVid-10M dataset, whose videos contain watermarks. As a result, it may generate a keyframe with a watermark, which hurts the aesthetics of the generated video. If you want to generate videos without watermarks, we recommend trying `video_generation_aest.ipynb`, where we intervene on the keyframe with a text-to-image model.

In [None]:
# Text prompt describing the video to generate; uncomment one of the alternatives to try it
prompt = "FPV drone footage of an ancient city in autumn"
# prompt = 'A steaming cup of coffee with mountains in the background. Resting during road trip'
# prompt = 'Bloomming cherry tree in the garden beautiful sun light'
# prompt = 'Golden retriever puppy running in the park. Autumn. Beautiful leaves on the ground'
# prompt = 'Back view on young woman dressed in a bright yellow jacket walk in outdoor forest'
# prompt = 'Sailboat sailing on a sunny day in a mountain lake'
# prompt = "Onboard camera view of bike riding on country asphalt road through picturesque villages"
# prompt = "A man was riding his motorcycle on the highway"
# prompt = "A dog driving a car on a suburban street wearing funny sunglasses"
# prompt = "A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Vincent van Gogh"
# prompt = "A spaceship is slowly descending"
# prompt = "A girl is writing something on a book, Oil painting style"
# prompt = "Waves crashing against a lone lighthouse, ominous lighting"
# prompt = "A bright fire burns in the stone oven"
# prompt = "Aurora Borealis Green Loop Winter Mountain Ridges Northern Lights"
# prompt = "a white swan moving on the lake"

# The keyframe aspect ratio you want to generate, mapped to (height, width)
# Only two ratios are supported: square and widescreen video
ratio_dict = {
    '1:1' : (1024, 1024),
    '1:2' : (576, 1024),
}

print(prompt)

ratio = '1:2'
height, width = ratio_dict[ratio]

# The video width and height should have the same aspect ratio as the generated keyframe.
# Generating high-resolution video requires more GPU memory; you can choose to lower the resolution,
# e.g., set video_width=576, video_height=320 for 1:2;  video_width=512, video_height=512 for 1:1

if ratio == '1:2':
    video_width = 896
    video_height = 512
    # video_width = 576
    # video_height = 320
else:
    assert ratio == '1:1'
    video_width = 768
    video_height = 768
    # video_width = 512
    # video_height = 512


# Run generation under CUDA autocast with the dtype chosen in the model setup cell
with torch.cuda.amp.autocast(enabled=True, dtype=torch_dtype):
    videos, keyframes = model.generate_video(prompt, width=width, height=height, num_return_images=1,
            video_width=video_width, video_height=video_height, guidance_scale_for_llm=4.0,
            guidance_scale_for_decoder=7.0, num_inference_steps=50, top_k=50,)


# Save the first generated video as a GIF and show it inline
output_video_path = "generated.gif"
export_to_gif(videos[0], output_video_path)
# Read the GIF through a context manager so the file handle is closed immediately
with open(output_video_path, 'rb') as gif_file:
    display(ipython_image(gif_file.read()))

### Image-to-Video Generation

In [None]:
# Input image that conditions the video; uncomment one of the alternatives to try it
image_path = 'demo/catus1.jpg'
# image_path = 'demo/scene.jpg'
# image_path = 'demo/girl.jpg'

# Open through a context manager so the source file is closed once the RGB copy exists
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")
display(image)
# The model takes a list of (content, modality) pairs; here a single image prompt
input_prompts = [(image, 'image')]

# Landscape inputs get a widescreen video, everything else a square one
if image.width > image.height:
    video_width = 896; video_height = 512
else:
    video_width = 768; video_height = 768

# Run generation under CUDA autocast with the dtype chosen in the model setup cell
with torch.cuda.amp.autocast(enabled=True, dtype=torch_dtype):
    videos, _ = model.multimodal_video_generate(input_prompts, video_width=video_width, video_height=video_height,
            guidance_scale_for_llm=4.0, top_k=50,)

# Save the first generated video as a GIF and show it inline
output_video_path = "generated.gif"
export_to_gif(videos[0], output_video_path)
# Read the GIF through a context manager so the file handle is closed immediately
with open(output_video_path, 'rb') as gif_file:
    display(ipython_image(gif_file.read()))

### Long Video Generation

Generating a long video takes a long time, so `video_width` and `video_height` are set to a low resolution (576, 320) by default. `clip_num` is the number of video clips to generate. Since the model is trained on videos shorter than 10s, the video quality degrades severely when `clip_num` > 3.

In [None]:
# Text prompt describing the video to generate; uncomment one of the alternatives to try it
prompt = 'Back view on young woman dressed in a bright yellow jacket walk in outdoor forest'
# prompt = "Onboard camera view of bike riding on country asphalt road through picturesque villages"
# prompt = 'Sailboat sailing on a sunny day in a mountain lake'
# prompt = 'A steaming cup of coffee with mountains in the background. Resting during road trip'
# prompt = "A bright fire burns in the stone oven"
# prompt = "A dog in the sun"


# Supported keyframe aspect ratios, mapped to (height, width)
ratio_dict = {
    '1:1' : (1024, 1024),
    '1:2' : (576, 1024),
}

print(prompt)

# The keyframe aspect ratio you want to generate
ratio = '1:2'
height, width = ratio_dict[ratio]

# The video width and height should have the same aspect ratio as the generated keyframe
if ratio == '1:2':
    # video_width = 1024
    # video_height = 576
    video_width = 576
    video_height = 320
else:
    assert ratio == '1:1'
    # video_width = 768
    # video_height = 768
    video_width = 512
    video_height = 512

# The number of video clips to generate
clip_num = 2

# Number of frames taken per clip when stitching below.
# NOTE(review): assumes the model returns 24 frames per clip - confirm against generate_video.
frames_per_clip = 24

# Pass the resolution selected above instead of hard-coded numbers, so that switching
# `ratio` also switches the video size (the values are unchanged for the default '1:2').
with torch.cuda.amp.autocast(enabled=True, dtype=torch_dtype):
    videos, frames = model.generate_video(prompt, width=width, height=height, num_return_images=1,
        video_width=video_width, video_height=video_height, guidance_scale_for_llm=4.0, guidance_scale_for_decoder=7.0,
        num_inference_steps=50, top_k=50, clip_num=clip_num, inverse_rate=0.9,
    )

# Keep the first clip whole, then append every later clip without its first frame.
# NOTE(review): presumably that frame repeats the previous clip's last frame - confirm.
all_frames = videos[0]
clip_videos = all_frames[:frames_per_clip]
for i_clip in range(1, clip_num):
    clip_start = i_clip * frames_per_clip
    clip_videos += all_frames[clip_start + 1:clip_start + frames_per_clip]

# Save the stitched frames as a GIF and show it inline
output_video_path = "generated.gif"
export_to_gif(clip_videos, output_video_path)
# Read the GIF through a context manager so the file handle is closed immediately
with open(output_video_path, 'rb') as gif_file:
    display(ipython_image(gif_file.read()))


### Text-to-Image Generation

In [None]:
# Text prompt describing the image to generate; the commented lines are alternatives
prompt = "a high contrast photo of an astronaut riding a horse in the forest."
# prompt = "A high contrast photo of panda dressed as an astronaut sits at a table in a photorealistic style"
# prompt = "a sculpture of a duck made of wool"
# prompt = "Cute adorable little goat, unreal engine, cozy interior lighting, art station, detailed digital painting, cinematic, octane rendering"
# prompt = 'a super math wizard cat, richly textured oil painting'
# prompt = "A oil painting of a female painter with a brush in hand, white background, painting, looking very powerful"


# Aspect-ratio label -> (height, width) of the generated image
ratio_dict = dict([
    ('1:1', (1024, 1024)),
    ('4:3', (896, 1152)),
    ('3:2', (832, 1216)),
    ('16:9', (768, 1344)),
    ('2:3', (1216, 832)),
    ('3:4', (1152, 896)),
    ('1:2', (576, 1024)),
])

print(prompt)
ratio = '1:1'
image_size = ratio_dict[ratio]
height = image_size[0]
width = image_size[1]

# Keyword arguments forwarded unchanged to model.generate_image
sampling_options = dict(
    width=width,
    height=height,
    num_return_images=1,
    guidance_scale_for_llm=4.0,
    guidance_scale_for_decoder=7.0,
    num_inference_steps=50,
    top_k=50,
    temperature=1.0,
)

# Run generation under CUDA autocast with the dtype chosen in the model setup cell
autocast_ctx = torch.cuda.amp.autocast(enabled=True, dtype=torch_dtype)
with autocast_ctx:
    images = model.generate_image(prompt, **sampling_options)

# Show the first (and only requested) image
display(images[0])