# This notebook shows how to use components to make a video

In [1]:
import sys
sys.path.append('../') # so that we can import packages from parent folder

from src.video_composition.video.static_background_image_component import StaticBackgroundImageComponent
from src.video_composition.composer import VideoComposition
from src.video_composition.video.text_component import TextComponent
from src.video_composition.video.timed_text_component import TimedTextComponent
from src.video_composition.audio.audio_component import AudioComponent
from src.video_composition.audio.audio_file_component import AudioFileComponent
from src.video_composition.combo.component_container import ComponentContainer
from src.video_composition.combo.mp4_component import MP4Component

from src.video_composition.combo.tts_component import TTSComponent

# --- Timeline: every value is in seconds, measured from the start of the video ---
background_start_time, background_duration = 2, 10  # background appears at 2 s and stays for 10 s
text_start_time, text_duration = 4, 5  # text appears at 4 s and stays for 5 s

# --- Frame geometry ---
# Square 1024 x 1024 canvas. For the HD phone resolution, use
# width = 1920 and height = 1080 instead.
width = height = 1024
fps = 30

# --- Text rendering ---
font = 70  # handed to TextComponent as `font=`; presumably a font size -- TODO confirm

# --- Files ---
image = './images/image.png'  # still image used as the background
output_file = 'output/composition.mp4'  # default render target; later cells reassign it

 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-embedding layers.


In [2]:
# Minimal composition: a static background image with a "Hello" caption on top.
# The composition and each video component receive the same geometry, so it is
# collected once and unpacked wherever it is needed.
frame_kwargs = dict(width=width, height=height, fps=fps)

video_composition = VideoComposition(**frame_kwargs)

# Background layer, placed on the timeline by the background_* settings.
background = StaticBackgroundImageComponent(
    image,
    **frame_kwargs,
    start_time=background_start_time,
    duration=background_duration,
)
video_composition.add_video_component(background)

# Caption layer, placed on the timeline by the text_* settings.
greeting = TextComponent(
    text="Hello",
    font=font,
    **frame_kwargs,
    start_time=text_start_time,
    duration=text_duration,
)
video_composition.add_video_component(greeting)

# Render everything that was added to `output_file`.
video_composition.create_video(output_file=output_file)

Component: 60 - 360
Duration: 10
Component: 120 - 270
Duration: 5


In [6]:
import contextlib
import io

from IPython.display import Audio

# Synthesize a short utterance and play it back inline.
#
# Longer alternative input, kept for experimenting:
# text = "Hello world. This is a test of, the text to speech system.  Here's a third sentence. And a fourth. How about a fifth?"
text = "Hello world? Would you move here?"
output = None

# Silence whatever the TTS import and synthesis print to stdout.
# The output is redirected into a throwaway in-memory buffer rather than to
# `None`: with `sys.stdout` set to None, any code that calls
# `sys.stdout.write()` or `sys.stdout.flush()` directly would raise
# AttributeError, whereas a StringIO accepts those calls and discards the text
# when the block ends.
with contextlib.redirect_stdout(io.StringIO()):
    # Imported inside the redirect so that import-time prints are hidden too.
    from src.tts import TextToSpeech

    output = TextToSpeech.text_to_audio(text, speaker_name="p227", speed=.6)

# 22050 matches the `sample_rate` reported in the audio-processor setup log.
Audio(output.audio, rate=22050)
# 5 or 10 seconds

In [7]:
# Render the synthesized speech as a video: background image, word-by-word
# captions, background music and the spoken audio.
output_file = "output/tts.mp4"
background_start_time = 1
wav = "./music/bach.wav"

# One caption entry per whitespace-separated word of the spoken text.
words = text.split()
duration = output.total_running_time_s
start = 2

# Raw word timestamps are divided by this constant and shifted by `start`
# before being used as caption times.
# NOTE(review): 86 is presumably ~ sample_rate / hop_length (22050 / 256) -- TODO confirm.
timestamp_divisor = 86
timestamps = [stamp / timestamp_divisor + start for stamp in output.word_timestamps]

frame_kwargs = dict(width=width, height=height, fps=fps)
video_composition = VideoComposition(**frame_kwargs)

# Video layers: the background first, then the timed captions.
background = StaticBackgroundImageComponent(
    image,
    **frame_kwargs,
    start_time=background_start_time,
    duration=background_duration,
)
video_composition.add_video_component(background)

captions = TimedTextComponent(
    words,
    timestamps,
    **frame_kwargs,
    start_time=start,
    duration=duration,
)
video_composition.add_video_component(captions)

# Audio layers: music from t = 0, speech from `start`.
music = AudioFileComponent(wav, start_time=0)
video_composition.add_audio_component(music)

# `output.audio` is multiplied by 5 -- presumably a volume boost; TODO confirm.
speech = AudioComponent(output.audio * 5, start_time=start)
video_composition.add_audio_component(speech)

video_composition.create_video(output_file=output_file)

Component: 30 - 330
Duration: 10
Component: 60 - 244
Duration: 6.15546485260771


In [8]:
# Same ingredients as the previous cell (captions, speech, music), but bundled
# into one ComponentContainer that is added to the composition as a single unit.
output_file = "output/container-test.mp4"

# All three members start at the 1 s mark.
ttc = TimedTextComponent(
    words,
    timestamps,
    start_time=1,
    duration=output.total_running_time_s,
    width=width,
    height=height,
    fps=fps,
)
# `output.audio` is multiplied by 4 -- presumably a volume boost; TODO confirm.
ac = AudioComponent(output.audio * 4, start_time=1)
mc = AudioFileComponent(wav, start_time=1)

container = ComponentContainer()
for member in (ttc, ac, mc):
    container.add_component(member)

video_composition = VideoComposition(width=width, height=height, fps=fps)
video_composition.add_component(container)
video_composition.create_video(output_file)

Component: 30 - 214
Duration: 6.15546485260771


In [9]:
# Demo of TTSComponent: each component is created from a text string, and the
# second one is staggered with start=4.
#
# This cell needs its own render target. Without it, `output_file` still holds
# whatever an earlier cell assigned, so the render would silently overwrite
# that cell's video and the result would depend on execution order.
output_file = "output/tts-component.mp4"

# Portrait canvas: 1080 wide by 1920 high.
video_composition = VideoComposition(width=1080, height=1920, fps=fps)
video_composition.add_component(TTSComponent("Testing a component containing text and sound.  It should be easy to use.", fps=fps, color=(255, 0, 0)))
video_composition.add_component(TTSComponent("You should be able to stagger sentences.", fps=fps, start=4, color=(0, 255, 0)))
video_composition.create_video(output_file)

 > Text splitted to sentences.
['Testing a component containing text and sound']
 > Processing time: 0.863832950592041
 > Real-time factor: 0.2530222709956762
 > Text splitted to sentences.
['.']
 > Processing time: 0.15203189849853516
 > Real-time factor: 0.2044087415788232
 > Text splitted to sentences.
['It should be easy to use.']
 > Processing time: 0.5270950794219971
 > Real-time factor: 0.22248174772693405
Component: 0 - 195
Duration: 6.526984126984127
 > Text splitted to sentences.
['You should be able to stagger sentences.']
 > Processing time: 0.6851997375488281
 > Real-time factor: 0.2209901447014928
Component: 120 - 213
Duration: 3.1005895691609977


In [11]:
# Embed an existing MP4 file as a component of a new composition.
# The input path is the file an earlier cell renders to "output/tts.mp4", so
# that cell has to run first.
mp4_path = "./output/tts.mp4"
frame_kwargs = dict(width=1024, height=1024, fps=fps)

video_composition = VideoComposition(**frame_kwargs)

# The clip is placed with start=1.
vc = MP4Component(mp4_path=mp4_path, start=1, **frame_kwargs)
video_composition.add_component(vc)

# NOTE: unlike the other cells, this writes into the current working
# directory rather than under output/.
video_composition.create_video("mp4.mp4")


ffmpeg version 6.0 Copyright (c) 2000-2023 the FFmpeg developers
  built with Apple clang version 14.0.3 (clang-1403.0.22.14.1)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/6.0 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags= --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enable-libzimg --disable-libjack --di

Component: 30 - 360
Duration: 11.0
