In [111]:
import datetime as dt
import warnings
from pathlib import Path

import ffmpeg
import IPython.display as ipd
import joblib as jl
import numpy as np
import soundfile as sf
import torch
from tqdm.auto import tqdm

from diff_ttsg.hifigan.config import v1
from diff_ttsg.hifigan.denoiser import Denoiser
from diff_ttsg.hifigan.env import AttrDict
from diff_ttsg.hifigan.models import Generator as HiFiGAN
from diff_ttsg.models.diff_ttsg import Diff_TTSG
from diff_ttsg.text import cmudict, sequence_to_text, text_to_sequence
from diff_ttsg.text.symbols import symbols
from diff_ttsg.utils.model import denormalize
from diff_ttsg.utils.utils import intersperse
from pymo.preprocessing import MocapParameterizer
from pymo.viz_tools import render_mp4
from pymo.writers import BVHWriter

In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Load the autoreload extension and reload edited modules automatically,
# so changes to the imported project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [87]:
# Checkpoint passed to `load_model` to restore the Diff_TTSG model.
DIFF_TTSG_CHECKPOINT = "logs/train/dev/runs/2023-06-26_08-26-26/checkpoints/epoch_epoch=021.ckpt"
# Checkpoint passed to `load_vocoder`; its 'generator' entry holds the HiFi-GAN weights.
HIFIGAN_CHECKPOINT = "g_02500000"
# Serialized motion pipeline loaded with joblib; its `inverse_transform` is used in `to_bvh`.
MOTION_PIPELINE = "diff_ttsg/resources/data_pipe.expmap_86.1328125fps.sav"
# Path handed to `cmudict.CMUDict` for text preprocessing.
CMU_DICT_PATH = "diff_ttsg/resources/cmu_dictionary"


# Folder that receives the generated .npy / .wav / .bvh / .mp4 files.
OUTPUT_FOLDER = "synth_output"

## Load Model

In [9]:
def load_model(checkpoint_path):
    """Restore a Diff_TTSG model from a checkpoint and put it in eval mode.

    Args:
        checkpoint_path: Path of the checkpoint file to load.

    Returns:
        The restored model, loaded with ``map_location=device`` and with
        ``eval()`` already called on it.
    """
    restored = Diff_TTSG.load_from_checkpoint(checkpoint_path, map_location=device)
    # Inference only: switch off training-time behaviour before handing it out.
    restored.eval()
    return restored
# Module-level model instance used by `synthesise` below.
model = load_model(DIFF_TTSG_CHECKPOINT)

## Load vocoder (HiFi-GAN)

In [66]:
def load_vocoder(checkpoint_path):
    """Build a HiFi-GAN generator with the ``v1`` config and load its weights.

    Args:
        checkpoint_path: Path of a checkpoint whose ``'generator'`` entry
            holds the generator state dict.

    Returns:
        The generator on ``device``, in eval mode, with weight norm removed.
    """
    generator = HiFiGAN(AttrDict(v1)).to(device)
    checkpoint = torch.load(checkpoint_path, map_location=device)
    generator.load_state_dict(checkpoint['generator'])
    generator.eval()
    # Weight norm is stripped after the weights are loaded.
    generator.remove_weight_norm()
    return generator

# Module-level vocoder passed to `to_waveform` in the Generate cell.
vocoder = load_vocoder(HIFIGAN_CHECKPOINT)
# Denoiser built on top of the vocoder; `to_waveform` reads it as a global.
denoiser = Denoiser(vocoder, mode='zeros')

Removing weight norm...


## Setup text preprocessing

In [53]:
# Dictionary object passed to `process_text` as its `phonetising_dict` (see `synthesise`).
cmu = cmudict.CMUDict(CMU_DICT_PATH)
def process_text(text: str, phonetising_dict):
    """Turn raw text into the tensors and metadata needed for synthesis.

    The text is converted to an ID sequence with ``text_to_sequence`` (using
    ``phonetising_dict`` as its ``dictionary``), and ``intersperse`` is applied
    with ``len(symbols)`` as the extra item (presumably a blank token between
    symbols; confirm against ``intersperse``).

    Args:
        text: The text to synthesise.
        phonetising_dict: Dictionary forwarded to ``text_to_sequence``.

    Returns:
        dict with:
            ``x_orig``: the input text, unchanged.
            ``x``: LongTensor of shape (1, T) on ``device``.
            ``x_lengths``: LongTensor ``[T]`` on ``device``.
            ``x_phones``: ``sequence_to_text`` applied to the ID sequence.
    """
    sequence = intersperse(text_to_sequence(text, dictionary=phonetising_dict), len(symbols))
    x = torch.LongTensor(sequence).to(device)[None]
    # Use the notebook-wide `device` rather than `.cuda()`: `.cuda()` raises on
    # CPU-only machines and would put the lengths on a different device than `x`.
    x_lengths = torch.LongTensor([x.shape[-1]]).to(device)
    x_phones = sequence_to_text(x.squeeze(0).tolist())
    return {
        'x_orig': text,
        'x': x,
        'x_lengths': x_lengths,
        'x_phones': x_phones
    }

## Setup motion visualisation

In [83]:
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code;
# only load pipeline files from a trusted source.
motion_pipeline = jl.load(MOTION_PIPELINE)
# Writer used by `save_to_folder` to serialise the generated motion to .bvh.
bvh_writer = BVHWriter()
# Parameterizer used by `to_stick_video` before rendering.
mocap_params = MocapParameterizer("position")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## Text to synthesise

In [54]:
# One entry per utterance; the Generate cell synthesises each entry separately.
texts = [
    "And the train stopped , The door opened . I got out first , then Jack Kane got out , Ronan got out , Louise got out ."
]

## Hyperparameters

In [55]:
# Number of reverse-denoising steps to run, per output stream.
n_timesteps = dict(mel=50, motion=500)

# Speaking-rate control, forwarded to the model as `length_scale`.
length_scale = 1.15

# Sampling temperature, per output stream.
temperature = dict(mel=1.3, motion=1.5)

In [116]:
@torch.inference_mode()
def synthesise(text):
    """Synthesise one utterance and print its real-time factor (RTF).

    The text is preprocessed with ``process_text`` (using the module-level
    ``cmu`` dictionary). ``model.synthesise`` is then called with the
    module-level ``n_timesteps``, ``temperature`` and ``length_scale``.
    Only the model call is timed; text preprocessing is excluded.

    Args:
        text: The text to synthesise.

    Returns:
        The dict returned by ``model.synthesise``, updated in place with the
        entries produced by ``process_text``.
    """
    inputs = process_text(text, cmu)

    started = dt.datetime.now()
    output = model.synthesise(
        inputs['x'],
        inputs['x_lengths'],
        n_timesteps=n_timesteps,
        temperature=temperature,
        stoc=False,
        spk=None,
        length_scale=length_scale,
    )
    elapsed = (dt.datetime.now() - started).total_seconds()

    # RTF = elapsed time relative to the duration of the generated audio.
    # 22050 matches the sample rate used when writing/playing audio in this
    # notebook; 256 is presumably the vocoder hop length (confirm).
    n_samples = output["mel"].shape[-1] * 256
    print(f'RTF: {elapsed * 22050 / n_samples}')

    # Merge the preprocessing results so that callers get a single dict.
    output.update(inputs)
    return output

@torch.inference_mode()
def to_waveform(mel, vocoder):
    """Convert a mel-spectrogram to a denoised waveform on the CPU.

    Args:
        mel: Input passed directly to ``vocoder``.
        vocoder: Callable that maps ``mel`` to audio.

    Returns:
        The vocoder output clamped to [-1, 1], with dimension 0 squeezed,
        passed through the module-level ``denoiser``, moved to the CPU and
        squeezed of all remaining singleton dimensions.
    """
    raw_audio = vocoder(mel)
    clipped = raw_audio.clamp(-1, 1)
    denoised = denoiser(clipped.squeeze(0))
    return denoised.cpu().squeeze()


def to_bvh(motion):
    """Map generated motion back through the motion pipeline.

    Args:
        motion: Tensor that is moved to the CPU, squeezed on dimension 0 and
            transposed before being handed to the pipeline.

    Returns:
        Whatever ``motion_pipeline.inverse_transform`` returns for a
        one-element batch; the caller indexes it with ``[0]``.
    """
    frames = motion.cpu().squeeze(0).T
    # Warnings raised inside the pipeline are suppressed for this call only.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        restored = motion_pipeline.inverse_transform([frames])
    return restored
    
    
def save_to_folder(filename: str, output: dict, folder: str):
    """Write the mel, waveform and motion of one synthesis result to disk.

    Args:
        filename: Base name for the files; it is interpolated into f-strings.
        output: Dict providing ``'mel'``, ``'waveform'`` and ``'bvh'``.
        folder: Destination directory, created if it does not exist.

    Writes ``<filename>.npy`` (``np.save`` appends the extension),
    ``<filename>.wav`` (22050 Hz, PCM_24) and ``<filename>.bvh``.
    """
    out_dir = Path(folder)
    out_dir.mkdir(parents=True, exist_ok=True)

    mel_array = output['mel'].cpu().numpy()
    np.save(out_dir / f'{filename}', mel_array)

    sf.write(out_dir / f'{filename}.wav', output['waveform'], 22050, 'PCM_24')

    with open(out_dir / f'{filename}.bvh', 'w') as bvh_file:
        bvh_writer.write(output['bvh'], bvh_file)
        
        
def to_stick_video(filename, bvh, folder):
    """Render the motion as an mp4 in ``folder``.

    Args:
        filename: Base name; the video is written to ``<filename>.mp4``.
        bvh: Motion object passed to ``mocap_params.fit_transform``.
        folder: Destination directory, created if it does not exist.
    """
    out_dir = Path(folder)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Warnings raised during parameterisation are suppressed for this call only.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        positions = mocap_params.fit_transform([bvh])

    print(f"rendering {filename} ...")
    video_path = out_dir / f'{filename}.mp4'
    render_mp4(positions[0], video_path, axis_scale=200)
    
    
def combine_audio_video(filename: str, folder: str):
    """Combine ``<filename>.mp4`` and ``<filename>.wav`` into ``<filename>_audio.mp4``.

    Args:
        filename: Base name shared by the input video and audio files.
        folder: Directory holding the inputs; the output is written there too.

    An existing output file is overwritten.
    """
    print("Combining audio and video")
    out_dir = Path(folder)
    out_dir.mkdir(parents=True, exist_ok=True)

    video_path = out_dir / f'{filename}.mp4'
    audio_path = out_dir / f'{filename}.wav'
    output_filename = out_dir / f'{filename}_audio.mp4'

    video_stream = ffmpeg.input(str(video_path))
    audio_stream = ffmpeg.input(str(audio_path))
    # One video stream and one audio stream are joined into a single output.
    combined = ffmpeg.concat(video_stream, audio_stream, v=1, a=1)
    combined.output(str(output_filename)).run(overwrite_output=True)
    print(f"Final output with audio: {output_filename}")

## Generate

In [117]:
# Synthesise every prompt, echo its text and phonemes, play the audio inline,
# and save the mel / wav / bvh files to OUTPUT_FOLDER.
outputs = []
for i, text in enumerate(tqdm(texts)):
    print("=" * 25, i, "=" * 25)
    output = synthesise(text)

    # Banner showing the original text followed by its phonetised form.
    print(
        "*" * 53,
        "Input text",
        "-" * 53,
        output['x_orig'],
        "*" * 53,
        f"Phonetised text - {i}",
        "-" * 53,
        output['x_phones'],
        "*" * 53,
        sep="\n",
    )

    output['waveform'] = to_waveform(output['mel'], vocoder)
    ipd.display(ipd.Audio(output['waveform'], rate=22050))
    # `to_bvh` returns a one-element batch; keep its single entry.
    output['bvh'] = to_bvh(output['motion'])[0]
    outputs.append(output)
    save_to_folder(i, output, OUTPUT_FOLDER)

  0%|          | 0/1 [00:00<?, ?it/s]

RTF: 1.2735529054894912
*****************************************************
Input text
-----------------------------------------------------
And the train stopped , The door opened . I got out first , then Jack Kane got out , Ronan got out , Louise got out .
*****************************************************
Phonetised text - 0
-----------------------------------------------------
{AH0 N D} {DH AH0} {T R EY1 N} {S T AA1 P T} , {DH AH0} {D AO1 R} {OW1 P AH0 N D} . {AY1} {G AA1 T} {AW1 T} {F ER1 S T} , {DH EH1 N} {JH AE1 K} {K EY1 N} {G AA1 T} {AW1 T} , {R OW1 N AH0 N} {G AA1 T} {AW1 T} , {L UW0 IY1 Z} {G AA1 T} {AW1 T} .
*****************************************************


## Generate stick figure visualisations
(This will take some time)

In [118]:
# For each synthesised result, render its motion to `<i>.mp4`, then combine it with
# the `<i>.wav` written by `save_to_folder` in the Generate cell into `<i>_audio.mp4`.
for i, output in enumerate(tqdm(outputs)):
    to_stick_video(i, output['bvh'], OUTPUT_FOLDER)
    combine_audio_video(i, OUTPUT_FOLDER)

  0%|          | 0/1 [00:00<?, ?it/s]

MocapParameterizer: position
rendering 0 ...


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Final output with audio: synth_output/0_audio.mp4


frame=  789 fps=438 q=-1.0 Lsize=     559kB time=00:00:06.59 bitrate= 693.9kbits/s speed=3.66x    
video:481kB audio:64kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 2.497756%
[aac @ 0x5622019f9780] Qavg: 4988.765
[libx264 @ 0x562201a85800] frame I:4     Avg QP:15.90  size: 47614
[libx264 @ 0x562201a85800] frame P:208   Avg QP:22.62  size:   570
[libx264 @ 0x562201a85800] frame B:577   Avg QP:27.05  size:   318
[libx264 @ 0x562201a85800] consecutive B-frames:  1.0%  3.0%  4.2% 91.8%
[libx264 @ 0x562201a85800] mb I  I16..4: 45.7% 28.4% 25.9%
[libx264 @ 0x562201a85800] mb P  I16..4:  0.0%  0.0%  0.0%  P16..4:  0.2%  0.2%  0.3%  0.0%  0.0%    skip:99.4%
[libx264 @ 0x562201a85800] mb B  I16..4:  0.0%  0.0%  0.0%  B16..8:  0.1%  0.2%  0.2%  direct: 0.1%  skip:99.4%  L0:51.3% L1:42.3% BI: 6.4%
[libx264 @ 0x562201a85800] 8x8 transform intra:28.2% inter:22.0%
[libx264 @ 0x562201a85800] coded y,uvDC,uvAC intra: 22.1% 2.4% 2.3% inter: 0.2% 0.2% 0.2%
[libx264 @ 0x562201a858