<a href="https://colab.research.google.com/github/rkhamilton/vqgan-clip-generator/blob/main/vqgan_clip_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **VQGAN_CLIP_GENERATOR**
An easily installable VQGAN+CLIP image generator with a focus on restyling (style transfer) existing videos.  
Documentation is provided at [the project home page](https://github.com/rkhamilton/vqgan-clip-generator).

The vqgan_clip_generator package was developed primarily for ease of use, ease of installation on local computers, and to facilitate development of a different way to restyle videos.

This package is compatible with all of the usual trained models that work with VQGAN (sflickr, coco, etc). Code is provided here only for ImageNet for simplicity.


In [None]:
#@title MIT License
# 

# Copyright (c) 2021 Ryan Hamilton

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

In [None]:
#@markdown What GPU am I using?

#@markdown V100 > P100 > everything else

!nvidia-smi --query-gpu=gpu_name,memory.total,gpu_bus_id,vbios_version --format=csv
gpu_name = !nvidia-smi --query-gpu=gpu_name, --format=csv

# **Filesystem Setup**
Run either the Local Filesystem or Google Drive blocks to set your root location.

In [None]:
#@title Connect Google Drive
#@markdown This option connects to your google drive, and saves model files and output to that location for re-use.
import os
abs_root_path = "/content"

from google.colab import drive
drive.mount('/content/drive')

def ensureProperRootPath():
    if len(abs_root_path) > 0:
        os.chdir(abs_root_path) # Changes directory to absolute root path
        print("Root path check: ")
        !pwd

ensureProperRootPath()

def make_folder(folder_name):
    """Create a folder inside the mounted Google Drive and return its path.

    Args:
        folder_name: Folder name or path. A relative name is placed under
            /content/drive/MyDrive/; an absolute path is used unchanged
            (os.path.join semantics).

    Returns:
        The path of the folder, or None when folder_name is empty.
    """
    if len(folder_name) == 0:
        return None
    path_tmp = os.path.join("/content/drive/MyDrive/", folder_name)
    # makedirs creates missing parent folders and tolerates an existing folder,
    # so nested paths work and there is no check-then-create race.
    os.makedirs(path_tmp, exist_ok=True)
    return path_tmp


project_folder_name = "VQGAN_CLIP_GENERATOR" #@param {type: "string"}

# make_folder returns None for an empty name. Stop here with a clear message
# instead of storing None as the root path used by the cells that follow.
project_root = make_folder(project_folder_name)
if project_root is None:
    raise ValueError("project_folder_name must not be empty.")
abs_root_path = project_root

print("Created folder & set root path to: " + abs_root_path)

In [None]:
#@title Use Local Filesystem
#@markdown If you use the local filesystem, all data will be lost at the
#@markdown end of the session. VQGAN models will have to be re-downloaded.
import os
abs_root_path = "/content"
def ensureProperRootPath():
    if len(abs_root_path) > 0:
        os.chdir(abs_root_path) # Changes directory to absolute root path
        print("Root path check: ")
        !pwd

ensureProperRootPath()
print("Your root directory is: ")
!pwd

def make_folder(folder_name):
    """Create a folder on the local filesystem and return its path.

    Args:
        folder_name: Folder name or path. A relative name is placed under
            /content; an absolute path is used unchanged (os.path.join
            semantics).

    Returns:
        The path of the folder, or None when folder_name is empty.
    """
    abs_root_path = "/content"
    if len(folder_name) == 0:
        return None
    # The local variant must not build paths under /content/drive/MyDrive:
    # that folder only exists when Google Drive is mounted.
    path_tmp = os.path.join(abs_root_path, folder_name)
    # makedirs creates missing parent folders and tolerates an existing folder.
    os.makedirs(path_tmp, exist_ok=True)
    return path_tmp


# **Set Paths and Download Dependencies**
Before running the cells below, make sure you have run either the Local Filesystem or Google Drive block above to set your root location.

In [None]:
#@title Download dependencies

# Alternative conda-based environment setup, kept for reference (commented out):
# !conda create --name vqgan python=3.9 pip ffmpeg numpy pytest tqdm git pytorch==1.9.0 torchvision==0.10.0 torchaudio==0.9.0 cudatoolkit=11.1 -c pytorch -c conda-forge
# !conda activate vqgan
# Install CLIP and the vqgan-clip-generator package straight from their GitHub repositories.
!pip install git+https://github.com/openai/CLIP.git
!pip install git+https://github.com/rkhamilton/vqgan-clip-generator.git

In [None]:
#@title Define Paths
# Working folders used by the later cells, all located directly under abs_root_path.
(extracted_video_frames_dir,
 generated_video_frames_dir,
 output_dir,
 models_dir) = [
    os.path.join(abs_root_path, subfolder)
    for subfolder in ("extracted_video_frames",
                      "generated_video_frames",
                      "output",
                      "models")
]

# Create each folder if it is missing.
make_folder(extracted_video_frames_dir)
make_folder(generated_video_frames_dir)
make_folder(output_dir)
make_folder(models_dir)

In [None]:
#@title Download pre-trained models

if not os.path.exists(models_dir):
          os.mkdir(models_dir)
os.chdir(models_dir)

model_ckpt_file = os.path.join(models_dir, "vqgan_imagenet_f16_16384.ckpt")
model_yaml_file = os.path.join(models_dir, "vqgan_imagenet_f16_16384.yaml")

if not os.path.exists(model_ckpt_file):
  !curl -L -o vqgan_imagenet_f16_16384.ckpt -C - 'https://heibox.uni-heidelberg.de/f/867b05fc8c4841768640/?dl=1'
else:
  print('Checkpoint file found.')
if not os.path.exists(model_yaml_file):
  !curl -L -o vqgan_imagenet_f16_16384.yaml -C - 'https://heibox.uni-heidelberg.de/f/274fb24ed38341bfa753/?dl=1'
else:
  print('Model configuration file found.')
os.chdir(abs_root_path)

In [None]:
#@title Import libraries
from vqgan_clip import generate, video_tools
from vqgan_clip.engine import VQGAN_CLIP_Config
import os
from IPython import display

# **Generate Images and Video**

In [16]:
#@title Generate a single image from a prompt
# Engine configuration pointing at the downloaded ImageNet model files.
config = VQGAN_CLIP_Config()
config.vqgan_config = model_yaml_file
config.vqgan_checkpoint = model_ckpt_file
output_image_width_pixels = 512 #@param {type: "integer"}
output_image_height_pixels = 512 #@param {type: "integer"}
config.output_image_size = [output_image_width_pixels,output_image_height_pixels]
text_prompts = 'A pastoral landscape painting by Rembrandt' #@param {type: "string"}
iterations = 100 #@param {type: "integer"}

# The image is named after the prompt and stored in output_dir. The same path
# is used for generating and for displaying, so the result is found regardless
# of the current working directory.
output_file_path_no_ext = os.path.join(output_dir, text_prompts)
generate.single_image(eng_config = config,
        text_prompts = text_prompts,
        iterations = iterations,
        save_every = None,
        output_filename = output_file_path_no_ext)

# The saved file is expected at the path above plus a '.png' extension.
print(output_file_path_no_ext + '.png')
display.display(display.Image(output_file_path_no_ext + '.png'))

Working with z of shape (1, 256, 16, 16) = 65536 dimensions.
loaded pretrained LPIPS loss from taming/modules/autoencoder/lpips/vgg.pth
VQLPIPSWithDiscriminator running with hinge loss.
Restored from /content/drive/MyDrive/VQGAN_CLIP_GENERATOR/models/vqgan_imagenet_f16_16384.ckpt


100%|██████████| 100/100 [03:20<00:00,  2.00s/iteration]


'A pastoral landscape painting by Rembrandt.png'

In [41]:
#@title Generate a zoom video
#@markdown Generate a video with movement. Every frame that is generated has a shift or zoom applied to it.
#@markdown This gives the appearance of motion in the result.
#@markdown This is one of the most interesting applications of VQGAN+CLIP here.

import os
from google.colab import files
from vqgan_clip import generate, video_tools
from vqgan_clip.engine import VQGAN_CLIP_Config

text_prompts = 'A pastoral landscape painting by Rembrandt | wolves chasing sheep' #@param {type: "string"}
# The form defaults are 448x448, the size this cell previously hard-coded.
output_image_width_pixels = 448 #@param {type: "integer"}
output_image_height_pixels = 448 #@param {type: "integer"}
iterations = 2000 #@param {type: "integer"}
save_every = 5 #@param {type: "integer"}
change_prompt_every = 300 #@param {type: "integer"}
output_framerate=30  #@param {type: "integer"}
assumed_input_framerate=10 #@param {type: "integer"}
zoom_scale=1.02  #@param {type: "number"}
shift_x=1 #@param {type: "integer"}
shift_y=1 #@param {type: "integer"}
output_filename_no_extension = 'zoom_video' #@param {type: "string"}

output_file_path_no_ext = os.path.join(output_dir, output_filename_no_extension)

# A single engine configuration is shared by the initial image and the video
# frames, so the image size chosen in the form applies to both. The model
# files are the ones located by the download cell.
config = VQGAN_CLIP_Config()
config.vqgan_config = model_yaml_file
config.vqgan_checkpoint = model_ckpt_file
config.output_image_size = [output_image_width_pixels,output_image_height_pixels]

# Let's generate a single image to initialize the video.
init_image = os.path.join(output_dir,'init_image')
generate.single_image(eng_config = config,
        text_prompts = text_prompts,
        iterations = 100,
        save_every = None,
        output_filename = init_image)

# Now generate a zoom video starting from that initial frame.
config.init_image = init_image+'.png'
generate.zoom_video_frames(eng_config = config,
        text_prompts = text_prompts,
        iterations = iterations,
        save_every = save_every,
        change_prompt_every = change_prompt_every,
        zoom_scale=zoom_scale, 
        shift_x=shift_x, 
        shift_y=shift_y)

# Encode the generated frames into an mp4 file.
video_tools.encode_video(output_file=output_file_path_no_ext+'.mp4',
        metadata=text_prompts,
        output_framerate=output_framerate,
        assumed_input_framerate=assumed_input_framerate)

# Download video
files.download(output_file_path_no_ext + '.mp4')

Working with z of shape (1, 256, 16, 16) = 65536 dimensions.
loaded pretrained LPIPS loss from taming/modules/autoencoder/lpips/vgg.pth
VQLPIPSWithDiscriminator running with hinge loss.
Restored from models/vqgan_imagenet_f16_16384.ckpt


  0%|          | 3/2000 [00:12<2:14:40,  4.05s/iteration]

Creating interpolated frames...





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#@title Style transfer to an existing video
#@markdown An existing video will have a VQGAN style applied to each frame.
#@markdown Refer to the [documentation](https://github.com/rkhamilton/vqgan-clip-generator/blob/main/README.md) 
#@markdown for an explanation of the options.

import os
from google.colab import files

config = VQGAN_CLIP_Config()
input_video_path = 'input_video.MOV' #@param {type: "string"}
output_video_path = 'output_video.MOV' #@param {type: "string"}
input_video_extraction_framerate = 30 #@param {type: "integer"}
text_prompts = 'portrait on deviantart' #@param {type: "string"}
output_image_width_pixels = 512 #@param {type: "integer"}
output_image_height_pixels = 512 #@param {type: "integer"}
iterations = 50 #@param {type: "integer"}
save_every = 5 #@param {type: "integer"}
config.init_weight = 1.0 #@param {type: "number"}
current_source_frame_prompt_weight=0.1 #@param {type: "number"}
previous_generated_frame_prompt_weight=0.0 #@param {type: "number"}
generated_frame_init_blend=0.2 #@param {type: "number"}
change_prompt_every = 300 #@param {type: "integer"}
assumed_input_framerate=10 #@param {type: "integer"}
output_filename_no_extension = 'restyled_video' #@param {type: "string"}
# The defaults below (60 fps, audio copied) are the values this cell previously
# forced with hard-coded overrides; the form fields now take effect.
output_framerate = 60 #@param {type: "integer"}
copy_audio = True #@param {type: "boolean"}

# NOTE(review): save_every, change_prompt_every, assumed_input_framerate and
# output_filename_no_extension are form fields that the calls below do not use.

# Use the model files located by the download cell, as the single-image cell does.
config.vqgan_config = model_yaml_file
config.vqgan_checkpoint = model_ckpt_file
config.output_image_size = [output_image_width_pixels,output_image_height_pixels]
print(input_video_path)

# Extraction and encoding must use the same frame rate, so both read the form value.
extraction_framerate = input_video_extraction_framerate

# Use a wrapper for FFMPEG to extract stills from the original video.
original_video_frames = video_tools.extract_video_frames(input_video_path, 
        extraction_framerate = extraction_framerate)

# Apply a style to the extracted video frames.
generate.restyle_video_frames(original_video_frames,
        eng_config=config,
        text_prompts = text_prompts,
        video_frames_path = generated_video_frames_dir,
        iterations = iterations,
        save_every=None,
        current_source_frame_prompt_weight=current_source_frame_prompt_weight,
        previous_generated_frame_prompt_weight=previous_generated_frame_prompt_weight,
        generated_frame_init_blend=generated_frame_init_blend)

# Use a wrapper for FFMPEG to encode the video.
# NOTE(review): the frames were written to generated_video_frames_dir, but
# encode_video is not told where they are; confirm that its default input
# folder matches.
generated_video_no_audio=os.path.join(output_dir,'output_no_audio.mp4')
video_tools.encode_video(output_file=generated_video_no_audio,
        metadata=text_prompts,
        output_framerate=output_framerate,
        assumed_input_framerate=input_video_extraction_framerate)

# Either copy the audio from the original file or keep the silent video.
# final_video_path always names the file that actually exists afterwards.
if copy_audio:
        video_tools.copy_video_audio(input_video_path, generated_video_no_audio, output_video_path)
        os.remove(generated_video_no_audio)
        final_video_path = output_video_path
else:
        final_output_filename = os.path.join(output_dir,'output.mp4')
        os.rename(generated_video_no_audio,final_output_filename)
        final_video_path = final_output_filename

# Download video
files.download(final_video_path)

input_video.MOV
Extracting image frames from original video


Style Transfer:   0%|          | 0/151 [00:00<?, ?image/s]