<a href="https://colab.research.google.com/github/saharmor/anima/blob/main/anima_turn_text_to_video.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<center>
<img src="https://emojipedia-us.s3.dualstack.us-west-1.amazonaws.com/thumbs/240/apple/325/sparkles_2728.png" width="120">
</center>

# <center>Anima - Turn text into video</center>
### <center>Generate videos from YouTube videos using OpenAI's Whisper, Stable Diffusion, and Google FILM</center>

#### <center> [Github Repository](https://github.com/saharmor/anima) </center>
##### <center> By [Sahar Mor](https://twitter.com/theaievangelist) and [Abhay Kashyap](https://twitter.com/hayabhay) </center>



In [None]:
#@markdown # Setup Workspace

In [None]:
#@markdown ## 1. Installing pip dependecies
%pip install --quiet --upgrade diffusers transformers ffmpeg accelerate

In [None]:
#@markdown ## 2. Hugging Face (HF) Setup for Stable Diffusion
#@markdown Instructions for creating a HF account and token https://huggingface.co/docs/huggingface_hub/quick-start#login

#@markdown 👉 **IMPORTANT** Agree to Runway's ToS on HF https://huggingface.co/runwayml/stable-diffusion-v1-5
# Configure git globally to use the `store` credential helper.
!git config --global credential.helper store
# Interactive login with the Hugging Face CLI (see the token instructions linked above).
!huggingface-cli login

In [None]:
#@markdown ## 3. FILM setup
#@markdown ### 3.1. Google Drive setup

from pathlib import Path
import os

drive_mounted = False
gdrive_fpath = '.'
local_path = '/content/'

####################

#@markdown Mounting your google drive is optional.
#@markdown If you mount your drive, code and models will be downloaded to it.
#@markdown This should reduce setup time after your first run.

###################

# Optionally Mount GDrive

mount_gdrive = False # @param{type:"boolean"}
if mount_gdrive and not drive_mounted:
    from google.colab import drive

    gdrive_mountpoint = '/content/drive/' #@param{type:"string"}
    gdrive_subdirectory = 'MyDrive/interpolation' #@param{type:"string"}
    gdrive_fpath = str(Path(gdrive_mountpoint) / gdrive_subdirectory)
    try:
        drive.mount(gdrive_mountpoint, force_remount = True)
        !mkdir -p {gdrive_fpath}
        %cd {gdrive_fpath}
        local_path = gdrive_fpath
        drive_mounted = True
    except OSError:
        print(
            "\n\n-----[PYTTI-TOOLS]-------\n\n"
            "If you received a scary OSError and your drive"
            " was already mounted, ignore it."
            "\n\n-----[PYTTI-TOOLS]-------\n\n"
            )
        raise



if not Path('./frame-interpolation').exists():
    !git clone https://github.com/pytti-tools/frame-interpolation

try:
    import frame_interpolation
except ModuleNotFoundError:
    %pip install -r ./frame-interpolation/requirements_colab.txt
    %pip install ./frame-interpolation

#url = "https://drive.google.com/drive/folders/1GhVNBPq20X7eaMsesydQ774CgGcDGkc6?usp=sharing"
share_id = "1GhVNBPq20X7eaMsesydQ774CgGcDGkc6" # Google FILM files

if not (Path(local_path) / 'saved_model').exists():
    !pip install --upgrade gdown
    !gdown --folder {share_id}

# create default frame
!mkdir -p frames

In [None]:
#@markdown ### 3.2. Google FILM setup

#@markdown Specify the local directory containing your video frames with the `frames_dir` parameter.

frames_dir = "frames" #@param{'type':'string'}

#@markdown A single pass of the interpolation procedure adds a frame between each contiguous pair of frames in `frames_dir`.

#@markdown If you start with $n$ frames in `frames_dir` and set `recursive_interpolation_passes` to $k$, your total number of frames
#@markdown after interpolation will be: 
#@markdown $$2^k (n-1) + 1$$

import math
film_smoothing_frames = 8 #@param{'type':'integer'}

# math.log2 would fail with an opaque "math domain error" for values below 1.
if film_smoothing_frames < 1:
    raise ValueError("film_smoothing_frames must be a positive integer (ideally a power of two).")

# Derive the number of passes from the form value instead of a hard-coded 8.
# k passes multiply the number of frame gaps by 2**k, so the value should be a
# power of two; other values are truncated down by int().
recursive_interpolation_passes = int(math.log2(film_smoothing_frames))

#@markdown ---

In [None]:
#@markdown ## 4. OpenAI Whisper setup
#@markdown ### 4.1. Installing dependecies
#@markdown Whisper will be used to turn YouTube videos into transcribed prompts.

#@markdown This cell will take a little while to download several libraries, including Whisper.

! pip install git+https://github.com/openai/whisper.git
! pip install pytube

import sys
import warnings
import whisper
from pathlib import Path
import pytube
import subprocess
import torch
import shutil
import numpy as np
import ffmpeg
from IPython.display import display, Markdown, YouTubeVideo

device = torch.device('cuda:0')
print('Using device:', device, file=sys.stderr)

In [None]:
#@markdown ### 4.2. Whisper Model selection

Model = 'small.en' #@param ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large']

# Check availability BEFORE loading: previously the load ran first, so for an
# unknown model name it failed before the explanatory message could be displayed.
model_is_listed = Model in whisper.available_models()

if not model_is_listed:
    display(Markdown(
        f"**{Model} model is no longer available.**<br /> Please select one of the following:<br /> - {'<br /> - '.join(whisper.available_models())}"
    ))

# Still attempt the load in every case, exactly as before; for an unlisted name
# this is expected to raise (NOTE(review): confirm against the installed Whisper version).
whisper_model = whisper.load_model(Model)

if model_is_listed:
    display(Markdown(
        f"**{Model} model is selected.**"
    ))

In [None]:
#@markdown ---

In [None]:
#@markdown # Let the magic begin 👇

In [None]:
#@markdown ## 1. Select YouTube video to transcribe

#@markdown Enter the URL of the YouTube video you want to transcribe. A 15-second video will take ~8 minutes to generate.

URL = "https://www.youtube.com/watch?v=t0imaSCnSuA" #@param {type:"string"}
start_time = 119 #@param {type:"integer"}
duration = 15 #@param {type:"integer"}
video_yt = pytube.YouTube(URL)

# Show the selected video, or stop the run with a clear message if it is unavailable.
try:
    video_yt.check_availability()
    display(
        YouTubeVideo(video_yt.video_id)
    )
except pytube.exceptions.VideoUnavailable:
    display(
        Markdown(f"**{URL} isn't available.**"),
    )
    raise RuntimeError(f"{URL} isn't available.")

import datetime

# Timestamp without spaces, colons or dots: the path is later interpolated into a
# shell command, and extra dots make extension handling error-prone.
download_stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
file_stem = f'{video_yt.video_id}_{download_stamp}'
file_name = f'{file_stem}.wav'
# Explicit "./" prefix so a video id starting with "-" is never parsed as a CLI option.
video_path_local = f'./{file_name}'

# Stream 140 is the one this notebook downloads; fail clearly if the video lacks it.
audio_stream = video_yt.streams.get_by_itag(140)
if audio_stream is None:
    raise RuntimeError(f"{URL} has no stream with itag 140.")
audio_stream.download('.', file_name)

# Previously the name was derived with `video_path_local.split(".")[0]`, which is always
# empty for a path starting with "." and therefore always produced "_trimmed.wav".
output_trimed = f'./{file_stem}_trimmed.wav'
audio_trimmed = ffmpeg.input(str(video_path_local), ss=start_time, t=duration)
audio_trimmed_converted = ffmpeg.output(audio_trimmed, output_trimed)
ffmpeg.run(audio_trimmed_converted, overwrite_output=True)
# Later cells (transcription, final muxing) operate on the trimmed clip.
video_path_local = output_trimed

In [None]:
#@markdown ### Transcribe video into prompts
#@markdown This can take a while and vary based on the length of the video and the number of parameters of the model selected above.


#@markdown ---
#@markdown #### Configure Whisper (optional)
#@markdown ---
language = "English" #@param ['Auto detection', 'Afrikaans', 'Albanian', 'Amharic', 'Arabic', 'Armenian', 'Assamese', 'Azerbaijani', 'Bashkir', 'Basque', 'Belarusian', 'Bengali', 'Bosnian', 'Breton', 'Bulgarian', 'Burmese', 'Castilian', 'Catalan', 'Chinese', 'Croatian', 'Czech', 'Danish', 'Dutch', 'English', 'Estonian', 'Faroese', 'Finnish', 'Flemish', 'French', 'Galician', 'Georgian', 'German', 'Greek', 'Gujarati', 'Haitian', 'Haitian Creole', 'Hausa', 'Hawaiian', 'Hebrew', 'Hindi', 'Hungarian', 'Icelandic', 'Indonesian', 'Italian', 'Japanese', 'Javanese', 'Kannada', 'Kazakh', 'Khmer', 'Korean', 'Lao', 'Latin', 'Latvian', 'Letzeburgesch', 'Lingala', 'Lithuanian', 'Luxembourgish', 'Macedonian', 'Malagasy', 'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Moldavian', 'Moldovan', 'Mongolian', 'Myanmar', 'Nepali', 'Norwegian', 'Nynorsk', 'Occitan', 'Panjabi', 'Pashto', 'Persian', 'Polish', 'Portuguese', 'Punjabi', 'Pushto', 'Romanian', 'Russian', 'Sanskrit', 'Serbian', 'Shona', 'Sindhi', 'Sinhala', 'Sinhalese', 'Slovak', 'Slovenian', 'Somali', 'Spanish', 'Sundanese', 'Swahili', 'Swedish', 'Tagalog', 'Tajik', 'Tamil', 'Tatar', 'Telugu', 'Thai', 'Tibetan', 'Turkish', 'Turkmen', 'Ukrainian', 'Urdu', 'Uzbek', 'Valencian', 'Vietnamese', 'Welsh', 'Yiddish', 'Yoruba']
#@markdown > Language spoken in the audio, use `Auto detection` to let Whisper detect the language.

#@markdown ---
suppress_tokens = "-1" #@param {type:"string"}
#@markdown > Comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations.

#@markdown ---
initial_prompt = "" #@param {type:"string"}
#@markdown > Optional text to provide as a prompt for the first window.

#@markdown ---
fp16 = True #@param {type:"boolean"}
#@markdown > Whether to perform inference in fp16.

#@markdown ---
no_speech_threshold = 0.6 #@param {type:"slider", min:-0.0, max:1, step:0.05}
#@markdown > If the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence.

#@markdown ---
# Display name of the verbosity mode; translated through `verbose_lut` below
# into True / False / None before being passed on as `verbose`.
verbose = 'Live transcription' #@param ['Live transcription', 'Progress bar', 'None']

# Lookup from the verbosity label chosen in the form to the `verbose` argument value.
verbose_lut = {'Live transcription': True, 'Progress bar': False, 'None': None}

# Decoding settings that are fixed here rather than exposed in the form.
temperature, temperature_increment_on_fallback = 0.2, 0.2
best_of = 5
condition_on_previous_text = True

# Keyword arguments forwarded to `whisper.transcribe`.
args = {
    "language": language if language != "Auto detection" else None,
    "verbose": verbose_lut[verbose],
    "temperature_increment_on_fallback": temperature_increment_on_fallback,
    "best_of": best_of,
    "suppress_tokens": suppress_tokens,
    "initial_prompt": initial_prompt or None,
    "condition_on_previous_text": condition_on_previous_text,
    "fp16": fp16,
    "no_speech_threshold": no_speech_threshold,
}

# The increment is not forwarded itself; it only shapes the temperature schedule.
temperature_increment_on_fallback = args.pop("temperature_increment_on_fallback")
if temperature_increment_on_fallback is None:
    temperature = [temperature]
else:
    temperature = tuple(
        np.arange(temperature, 1.0 + 1e-6, temperature_increment_on_fallback)
    )

# English-only checkpoints are forced to English, with a warning.
if Model.endswith(".en"):
    if args["language"] not in {"en", "English"}:
        warnings.warn(f"{Model} is an English-only model but receipted '{args['language']}'; using English instead.")
        args["language"] = "en"

print("Extracted text:")
video_transcription = whisper.transcribe(
    whisper_model, str(video_path_local), temperature=temperature, **args
)

# Duration of the transcribed clip, read from the container metadata.
probe_result = ffmpeg.probe(f"{video_path_local}")
audio_length = float(probe_result["format"]["duration"])

In [None]:
#@markdown ## Turn Whisper's transcription into prompts

# Appended to every prompt after an "Image style: " marker by the alignment cell below.
prompt_suffix = "Happy, colorful, hyper-realistic. Highly detailed. High quality." #@param {type:"string"}
#@markdown > Text to append to all prompts. This text will ensure the generated images have the same style. Change this text based on the theme of your generated video. You can draw inspiration from https://lexica.art/ 

#@markdown ---
# Inserted as the first prompt at timestamp 0 when non-empty.
start_prompt = "A huge church, from the inside" #@param {type:"string"}
#@markdown > Optional. The opening frame, probably related to the first few words or scenery. Leave empty to ignore.

#@markdown ---
# Appended as the last prompt at the (rounded) end of the audio when non-empty.
end_prompt = "graceful death" #@param {type:"string"}
#@markdown > Optional. The last frame, probably related to the last few words or scenery. Use the same start_prompt for a boomerang effect. Leave empty to ignore.

In [None]:
#@markdown ## Align prompts with video timing

# Build the list of {"prompt", "ts"} entries: one per transcribed segment (placed at the
# segment's midpoint, in whole seconds), plus the optional opening and closing prompts.
prompts = []
for segment in video_transcription['segments']:
  segment_text = segment['text'].strip()
  # Skip empty segments (indexing them would raise IndexError) and fully
  # bracketed segments such as "[...]".
  if not segment_text:
      continue
  if segment_text.startswith("[") and segment_text.endswith("]"):
      continue
  segment_start = segment['start']
  # Clamp the segment end to the (rounded) clip length.
  segment_end = min(round(audio_length), segment['end'])
  segment_midpoint = round(segment_start + (segment_end - segment_start)/2)

  prompts.append(
      {
          "prompt": segment_text,
          "ts": segment_midpoint
      }
  )

if start_prompt:
  prompts.insert(0,
      {
          "prompt": start_prompt,
          "ts": 0
  })

if end_prompt:
  prompts.append(
      {
          "prompt": end_prompt,
          "ts": round(audio_length)
  })

# Append the shared style suffix to EVERY prompt. This loop used to be nested under
# `if end_prompt:` (so it was skipped when no end prompt was given) and compared
# `[:-1]` instead of the last character, which always produced a doubled period.
for prompt in prompts:
  if prompt['prompt'].endswith('.'):
    prompt['prompt'] += " Image style: " + prompt_suffix
  else:
    prompt['prompt'] += ". Image style: " + prompt_suffix

prompts

In [113]:
#@markdown # Generate images with Stable Diffusion
#@markdown ## Download Stable Diffusion

#@markdown **IMPORTANT** in case of a 403 error - approve Runway's ToS on https://huggingface.co/runwayml/stable-diffusion-v1-5

# NOTE(review): these two modules are not installed by any cell visible in this notebook;
# presumably they must be importable from the working directory - confirm.
from stable_diffusion_animation_pipeline import StableDiffusionAnimationPipeline
from generation_utils import make_scheduler

# Scheduler used only to construct the pipeline; the timestep count is arbitrary at this point.
placeholder_scheduler = make_scheduler(100)

# Load the fp16 revision of the animation pipeline, then move it to the GPU.
pipe = StableDiffusionAnimationPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    scheduler=placeholder_scheduler,
    revision="fp16",
    torch_dtype=torch.float16,
)
pipe = pipe.to("cuda")

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


In [None]:
#@markdown ## Stable Diffusion configuration

# Values above 1.0 turn on classifier-free guidance in the generation cell.
guidance_scale = 7.5 #@param {type:"number"}
# Step count handed to `make_scheduler` for every generated frame.
num_inference_steps = 35 #@param {type:"number"}
# Seed for the torch generator that produces the initial latents of every frame.
seed = 1222 #@param {type:"number"}
# The first int(num_inference_steps * (1 - prompt_strength)) steps are denoised with the
# blended (midpoint) embedding; the remaining steps use the frame's own embedding.
prompt_strength = 0.65 #@param {type:"slider", min:0, max: 1, step:0.01}
# Converts prompt timestamps (seconds) into frame numbers; also passed to the FILM CLI.
fps= 24 #@param{'type':'number'}
# Output size in pixels; the initial latents are created at height // 8 by width // 8.
height = 512 #@param {type:"number"}
width = 512 #@param {type:"number"}

In [None]:
#@markdown ## Generate frames based on prompts

import os

from generation_utils import save_pil_image, slerp

# `frames_dir` is configurable, so make sure it exists before frames are written into it.
os.makedirs(frames_dir, exist_ok=True)

with torch.autocast("cuda"), torch.inference_mode():
  # Walk over consecutive prompt pairs. The loop uses its own variable names so that the
  # `start_prompt` / `end_prompt` form strings are not overwritten with dicts (which broke
  # re-running the alignment cell) and the inner loop no longer shadows the outer index.
  for pair_idx, pair_start in enumerate(prompts[:-1]):
      pair_end = prompts[pair_idx + 1]

      # NOTE: To smooth out edge, the last "n" frames MUST be interpolated by FILM
      # This is because edges between two segments are denoised from different midpoints
      # and can have a jagged edge jumping between the start & end.
      latents_start_frame_num = int(pair_start["ts"]*fps)
      latents_end_frame_num = int(pair_end["ts"]*fps)

      # Get frame difference to generate
      frames_needed = latents_end_frame_num - latents_start_frame_num
      # Get number of intermediate frames to be diffused based on smoothing frames
      num_intermediate_frames = int(frames_needed/film_smoothing_frames) - 1

      batch_size = 1

      print(f"Latent edge frames: {latents_start_frame_num} - {latents_end_frame_num}")
      print(f"Intermediate frames needed: {num_intermediate_frames}")

      # Get start & end embeddings for prompts
      do_classifier_free_guidance = guidance_scale > 1.0
      text_embeddings_start = pipe.embed_text(
        pair_start['prompt'], do_classifier_free_guidance, batch_size
      )
      text_embeddings_end = pipe.embed_text(
        pair_end['prompt'], do_classifier_free_guidance, batch_size
      )
      print(text_embeddings_start.shape)

      # Initialize with current start embedding as current
      text_embeddings_current = text_embeddings_start

      # Generate all intermediate frames and write all images to disk
      print(f"Generating {num_intermediate_frames} intermediate frames")
      for step_idx in range(num_intermediate_frames + 1):
        # For each prompt pair, create images & intermediate frames
        # Re-seeding on every step gives every frame the same initial noise.
        generator = torch.Generator("cuda").manual_seed(seed)

        # Generate initial latents to start to generate animation frames from
        initial_scheduler = pipe.scheduler = make_scheduler(
            num_inference_steps
        )

        num_initial_steps = int(num_inference_steps * (1 - prompt_strength))

        print(f"Generating initial latents for {num_initial_steps} steps")
        initial_latents = torch.randn(
            (batch_size, pipe.unet.in_channels, height // 8, width // 8),
            generator=generator,
            device="cuda",
        )

        # Position of this frame between the two prompts, in [0, 1).
        step_fraction = step_idx / (num_intermediate_frames + 1)

        # Get next text embedding point
        text_embeddings_next = slerp(
                step_fraction,
                text_embeddings_start,
                text_embeddings_end,
        )
        # Get midpoint between current and next
        text_embeddings_mid = slerp(0.5, text_embeddings_current, text_embeddings_next)

        # Get that midpoint's latents
        latents_mid = pipe.denoise(
            latents=initial_latents,
            text_embeddings=text_embeddings_mid,
            t_start=1,
            t_end=num_initial_steps,
            guidance_scale=guidance_scale,
        )
        frame_number = int(latents_start_frame_num + (step_fraction * frames_needed))
        print(f"Step fraction: {step_fraction}. Frame number: {frame_number}")

        # re-initialize scheduler
        pipe.scheduler = make_scheduler(num_inference_steps, initial_scheduler)

        latents = pipe.denoise(
            latents=latents_mid,
            text_embeddings=text_embeddings_current,
            t_start=num_initial_steps,
            t_end=None,
            guidance_scale=guidance_scale,
        )

        # Save all anchor latents to disk
        image = pipe.latents_to_image(latents)
        # Zero-pad the frame NUMBER (not the whole path, where zfill was a no-op) so that
        # file names sort in frame order even under plain lexicographic sorting.
        save_pil_image(
            pipe.numpy_to_pil(image)[0], path=f"{frames_dir}/{frame_number:05d}.png"
        )

        text_embeddings_current = text_embeddings_next

In [None]:
# clean GPU memory
import gc

# Drop the pipeline reference if it exists; unlike a bare `del pipe`, this does not
# raise NameError when the cell is run a second time.
globals().pop("pipe", None)
# Run the garbage collector first so objects that are no longer referenced are reclaimed
# before the CUDA cache is emptied.
gc.collect()
torch.cuda.empty_cache()

In [None]:
#@markdown # Turn images into a video using Google FILM

# Runs the frame-interpolation CLI with the model in ./saved_model on the frames in
# `frames_dir`, using `fps` and `recursive_interpolation_passes` from the cells above.
# The following cell reads the result from `{frames_dir}/interpolated.mp4`.
!python -m frame_interpolation.eval.interpolator_cli \
      --model_path ./saved_model \
      --pattern {frames_dir} \
      --fps {fps} \
      --times_to_interpolate {recursive_interpolation_passes} \
      --output_video

In [None]:
#@markdown # Combine generated video with the original audio
!rm generated_video.mp4
output_interpolated = f'{frames_dir}/interpolated.mp4'
!ffmpeg -i {output_interpolated} -i {video_path_local} -c:v copy -c:a aac generated_video.mp4

In [None]:
#@markdown # Your video is ready - open the files menu to your left and download `generated_video.mp4`