In [1]:
import datetime as dt
import time
import warnings
from pathlib import Path

import ffmpeg
import IPython.display as ipd
import joblib as jl
import numpy as np
import soundfile as sf
import torch
from tqdm.auto import tqdm

from diff_ttsg.hifigan.config import v1
from diff_ttsg.hifigan.denoiser import Denoiser
from diff_ttsg.hifigan.env import AttrDict
from diff_ttsg.hifigan.models import Generator as HiFiGAN
from diff_ttsg.models.diff_ttsg import Diff_TTSG
from diff_ttsg.text import cmudict, sequence_to_text, text_to_sequence
from diff_ttsg.text.symbols import symbols
from diff_ttsg.utils.model import denormalize
from diff_ttsg.utils.utils import intersperse
from pymo.preprocessing import MocapParameterizer
from pymo.viz_tools import render_mp4
from pymo.writers import BVHWriter

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Reload edited modules automatically before each cell runs, so changes to the
# imported project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [9]:
# List the checkpoints written by this training run; one of them is selected
# as DIFF_TTSG_CHECKPOINT in the configuration cell.
%ls logs/train/IPA_RUN/runs/2023-08-07_10-25-49/checkpoints/

'epoch_epoch=2754.ckpt'  'epoch_epoch=2756.ckpt'  'epoch_epoch=2758.ckpt'
'epoch_epoch=2755.ckpt'  'epoch_epoch=2757.ckpt'


In [10]:
# Checkpoint handed to load_model() (Diff_TTSG.load_from_checkpoint).
DIFF_TTSG_CHECKPOINT = "logs/train/IPA_RUN/runs/2023-08-07_10-25-49/checkpoints/epoch_epoch=2758.ckpt"
# Checkpoint handed to load_vocoder(); a bare relative name, so it is resolved
# against the notebook's working directory.
HIFIGAN_CHECKPOINT = "g_02500000"
# Serialized motion pipeline, loaded with joblib and used to map model output back to BVH.
MOTION_PIPELINE = "diff_ttsg/resources/data_pipe.expmap_86.1328125fps.sav"
# Path given to cmudict.CMUDict.
CMU_DICT_PATH = "diff_ttsg/resources/cmu_dictionary"


# Directory that receives the generated .npy / .wav / .bvh / .mp4 files.
OUTPUT_FOLDER = "synth_output"

## Load Model

In [11]:
def load_model(checkpoint_path):
    """Restore a Diff_TTSG model from ``checkpoint_path`` and put it in eval mode.

    The checkpoint is mapped onto the notebook-level ``device``.
    """
    diff_ttsg = Diff_TTSG.load_from_checkpoint(
        checkpoint_path,
        map_location=device,
    )
    diff_ttsg.eval()
    return diff_ttsg


model = load_model(DIFF_TTSG_CHECKPOINT)

## Load vocoder (HiFi-GAN)

In [12]:
def load_vocoder(checkpoint_path):
    """Build a HiFi-GAN generator with the ``v1`` config and load its weights.

    The weights are read from the ``'generator'`` entry of the checkpoint. The
    returned module lives on ``device``, is in eval mode, and has had its
    weight normalisation removed.
    """
    config = AttrDict(v1)
    generator = HiFiGAN(config).to(device)
    state = torch.load(checkpoint_path, map_location=device)
    generator.load_state_dict(state['generator'])
    generator.eval()
    generator.remove_weight_norm()
    return generator


vocoder = load_vocoder(HIFIGAN_CHECKPOINT)
denoiser = Denoiser(vocoder, mode='zeros')

Removing weight norm...


## Set up text preprocessing

In [13]:
# NOTE(review): `cmu` is not referenced by any other visible cell; kept in case
# code outside this notebook view relies on it.
cmu = cmudict.CMUDict(CMU_DICT_PATH)


def process_text(text: str):
    """Turn raw text into the model inputs used by ``synthesise``.

    The text is converted to a symbol-id sequence with the
    ``english_cleaners2`` cleaner and interspersed with id 0 (rendered as
    ``_`` in the phonetised output).

    Args:
        text: The sentence(s) to synthesise.

    Returns:
        dict with keys:
            ``x_orig``    - the input string, unchanged.
            ``x``         - LongTensor of symbol ids with a leading batch axis
                            of size 1, on ``device``.
            ``x_lengths`` - LongTensor holding the sequence length, on the
                            same device as ``x``.
            ``x_phones``  - the id sequence rendered back to text.
    """
    sequence = intersperse(text_to_sequence(text, cleaner_names=['english_cleaners2']), 0)
    x = torch.LongTensor(sequence).to(device)[None]
    # Follow x's device instead of a hard-coded `.cuda()`, which raised on
    # CPU-only machines even though `device` already falls back to CPU.
    x_lengths = torch.LongTensor([x.shape[-1]]).to(x.device)
    x_phones = sequence_to_text(x.squeeze(0).tolist())
    return {
        'x_orig': text,
        'x': x,
        'x_lengths': x_lengths,
        'x_phones': x_phones
    }

## Set up motion visualisation

In [14]:
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code;
# only load a pipeline file that comes from a trusted source.
motion_pipeline = jl.load(MOTION_PIPELINE)
# Writer used by save_to_folder() to serialise generated motion as .bvh.
bvh_writer = BVHWriter()
# Parameterizer used by to_stick_video() to obtain the data passed to render_mp4.
mocap_params = MocapParameterizer("position")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## Text to synthesise

In [15]:
# Example input. NOTE: `texts` is reassigned from `transcripts.values()` in a
# later cell, so on a top-to-bottom run this sentence is not what gets synthesised.
texts = [
    "And the train stopped , The door opened . I got out first , then Jack Kane got out , Ronan got out , Louise got out ."
]

## Hyperparameters

In [16]:
# Number of timesteps to run the reverse denoising process, set separately
# for the mel and the motion outputs. Passed to model.synthesise as `n_timesteps`.
n_timesteps = {
    'mel': 50,
    'motion': 500,
}

# Speaking-rate control, passed to model.synthesise as `length_scale`.
length_scale=1.0

# Sampling temperature per output, passed to model.synthesise as `temperature`.
temperature = {
    'mel': 1.3,
    'motion': 1.5
}

In [17]:
# Evaluation set: maps a clip filename to the transcript to synthesise.
# The keys become the output base names and the values the input texts
# (see the `texts` / `filenames` lists built right after this dict).
transcripts = {
  "C3_7_eval_0732.wav":
    "But then it was annoying because I paid because you have to pay the hospital fee of like a hundred quid for, for being seen and all the tests and stuff done and then a receipt was sent to my house.",
  "C4_3_eval_0092.wav":
    "I mean it it's not that I'm against it it's just that I just don't have the time and I just sometimes I'm not bothered and that sort of stuff.",
  "C4_3_eval_0126.wav":
    "Like every, I think most people even people who never go to mass ever will go to mass on Christmas Eve or Christmas Day. So like what we used to do is we used to go to mass on Christmas Eve which was lovely it's such a nice ceremony because it's so like it's obviously 12 o'clock at night.",
  "C3_7_eval_0876.wav":
    "But, but then I ended up in hospital for like three days because they were doing loads of checks like MRIs and all that sort of stuff on my head because I kept on forgetting things and that sort of stuff and like.",
  "C3_7_eval_0447.wav":
    "If you like touched it, it was excruciatingly sore. And I went up to the teachers I was like look I'm after like really damaging my finger I might have to go to the doctors.",
  "C4_2_eval_0059.wav":
    "Everything's measured off of likes and all that sort of stuff and how how people get valued these days if someone has this amount of likes they're obviously deemed great in society or looked upon as, as perfect.",
  "C3_7_eval_0047.wav":
    "But I remember once my parents were just downstairs in the kitchen and this is when mobile phones just began coming out. So, like my oldest brother and my oldest sister had a mobile phone each I'm pretty sure.",
  "C3_7_eval_1074.wav":
    "When you think about it, that you do as a child, it's just absolutely ridiculous that makes no sense. But you can always justify it back then because it just seemed like the fun right thing to do.",
  "C4_3_eval_0150.wav":
    "Yeah and then obviously there, there's certain choirs that come down to the church. There's a woman called, I can't remember her name. But she has an incredible voice. Like an amazing voice.",
  "C3_7_eval_0583.wav":
    "But moving on a few years, this is about maybe five six years ago myself and my friends went to Prague and then we came home and they live in Wexford.",
  "C3_5_eval_0005.wav":
    "Growing up and even today I used to get in a lot of trouble. I used to mess quite a bit when I was younger because again I was like the class Clown and I like to impress people through messing, such.",
  "C4_3_eval_0037.wav":
    "But yeah as I said before so obviously I work in a bar so Christmas Eve in Hennessey is just packed jammed insane. So myself and my brother will be working there this year.",
  "C4_2_eval_0039.wav":
    "Because you can actually you actually do feel the kind of the mental strains of social media and you know people depicting these perfect lives online and you're like oho.",
  "C4_2_eval_0011.wav":
    "Just so this whole social networking stuff just really really annoys me and cause it just warps people's minds and people are so Fixated on their phones and that sort of stuff that I just hate that so much.",
  "C3_7_eval_0163.wav":
    "And then a few weeks later after that my parents were away my granny was minding us and again I don't know why I told my brother to do this but I was like here.",
  "C4_1_eval_0251.wav":
    "I would like replenish stock I would bring up stock for the off-license that sort of stuff So I was doing all the kind of the menial kind of jobs like the kind of boring tedious work that someone had to do.",
  "C3_7_eval_0060.wav":
    "I don't think anybody else did well I definitely didn't anyway because I was young. And, well obviously my parents hated, hated when we were getting out of bed because obviously we had school the next day and we had to get our sleep.",
  "C4_2_eval_0137.wav":
    "You walk around Dublin city centre and even if you try and strike up a conversation with somebody it's impossible because everyone has their headphones in. And again, I would listen to podcasts sometimes with my headphones in walking around the streets.",
  "C3_5_eval_0043.wav":
    "When I was in primary school I used to have this ruler and I used to put it between desks and I used to push the tables together so the ruler would be between the two tables.",
  "C3_7_eval_0506.wav":
    "and they finished they they cleaned up the wound and stuff I stood up and I just collapsed onto the ground and fainted because I was completely drained of all my energy of of everything like it was absolutely. Oh, so so bad.",
  "C4_1_eval_0044.wav":
    "Kind of the best place to find real human emotion is in a pub because you see it all. You literally see people who are angry, sad, dazed, happy, depressed.",
  "C3_7_eval_0301.wav":
    "Eventually got to a point where I was like okay I need to stop doing this sort of stuff Like it just doesn't make any sense as to why because I was getting hurt like there was times where like, I was like tearing muscles and I never broke a bone which I'm pretty proud of.",
  "C4_2_eval_0521.wav":
    "Trying to see if if we can go back to the olden ways of just talking to people and actually engaging and communicating and seeing if can relationships form with just.",
  "C4_2_eval_0331.wav":
    "And because of chatting to people online throughout the years you do lose that ability to talk and the ability to just sporadically chat to anybody.",
  "C4_2_eval_0245.wav":
    "But and again so that doesn't help people like myself and my friend who actually want to strike up a conversation with a genuine person out in the open because we don't want to go online. We don't feel like we have to do that.",
}

# Parallel lists in the dict's insertion order: filenames[i] is the clip name
# whose transcript is texts[i].
filenames = [name for name in transcripts]
texts = [transcripts[name] for name in filenames]



In [20]:
@torch.inference_mode()
def synthesise(text):
    """Synthesise mel and motion for ``text`` and report the real-time factor.

    Args:
        text: Raw input text; it is preprocessed with ``process_text``.

    Returns:
        The dict returned by ``model.synthesise`` (read here for its ``"mel"``
        entry), extended with every key from ``process_text`` and with
        ``'rtf'``, the measured real-time factor.
    """
    text_processed = process_text(text)
    # CUDA kernels are launched asynchronously, so the clock is only read once
    # the device is idle; otherwise the measured time can stop short of the
    # work actually performed.
    on_gpu = text_processed['x'].is_cuda

    if on_gpu:
        torch.cuda.synchronize()
    # perf_counter is monotonic and high-resolution, unlike the wall clock
    # (datetime.now), which can jump if the system time is adjusted.
    start = time.perf_counter()
    output = model.synthesise(
        text_processed['x'],
        text_processed['x_lengths'],
        n_timesteps=n_timesteps,
        temperature=temperature,
        stoc=False,
        spk=None,
        length_scale=length_scale
    )
    if on_gpu:
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    # RTF = synthesis time / audio duration. 22050 is the sample rate used
    # throughout the notebook; 256 is presumably the mel hop length in
    # samples -- TODO confirm against the vocoder config.
    rtf = elapsed * 22050 / (output["mel"].shape[-1] * 256)
    print(f'RTF: {rtf}')

    output.update(text_processed)  # merge everything into one dict
    output.update({'rtf': rtf})
    return output

@torch.inference_mode()
def to_waveform(mel, vocoder):
    """Vocode ``mel`` into a denoised waveform on the CPU.

    ``vocoder`` is called on ``mel`` and its output is clamped to [-1, 1].
    The result is then passed through the notebook-level ``denoiser``, moved
    to the CPU, and squeezed.
    """
    raw_audio = vocoder(mel).clamp(-1, 1)
    denoised = denoiser(raw_audio.squeeze(0))
    return denoised.cpu().squeeze()


def to_bvh(motion):
    """Map generated motion back through ``motion_pipeline``.

    The motion tensor is moved to the CPU, its leading axis is squeezed, and
    it is transposed before being handed to ``inverse_transform`` as a
    one-element list. Warnings raised along the way are silenced. Returns
    whatever ``inverse_transform`` returns (the caller takes element 0).
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        clip = motion.cpu().squeeze(0).T
        restored = motion_pipeline.inverse_transform([clip])
    return restored
    
    
def save_to_folder(filename: str, output: dict, folder: str):
    """Write the mel, waveform and motion of one utterance into ``folder``.

    The folder is created if missing. Three files are written, all named
    after ``filename``: the mel as a NumPy array, the waveform as a 24-bit
    PCM WAV at 22050 Hz, and the motion as ``.bvh``.
    """
    out_dir = Path(folder)
    out_dir.mkdir(exist_ok=True, parents=True)

    mel_path = out_dir / f'{filename}'
    wav_path = out_dir / f'{filename}.wav'
    bvh_path = out_dir / f'{filename}.bvh'

    mel_array = output['mel'].cpu().numpy()
    np.save(mel_path, mel_array)
    sf.write(wav_path, output['waveform'], 22050, 'PCM_24')
    with open(bvh_path, 'w') as bvh_file:
        bvh_writer.write(output['bvh'], bvh_file)
        
        
def to_stick_video(filename, bvh, folder):
    """Render ``bvh`` as ``<folder>/<filename>.mp4`` using ``render_mp4``.

    The folder is created if missing. The motion is first run through
    ``mocap_params`` with warnings silenced.
    """
    target_dir = Path(folder)
    target_dir.mkdir(exist_ok=True, parents=True)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        positions = mocap_params.fit_transform([bvh])

    print(f"rendering {filename} ...")
    video_path = target_dir / f'{filename}.mp4'
    render_mp4(positions[0], video_path, axis_scale=200)
    
    
def combine_audio_video(filename: str, folder: str):
    """Combine ``<filename>.mp4`` and ``<filename>.wav`` into one video.

    Both inputs are read from ``folder``. The result is written next to them
    as ``<filename>_audio.mp4``, overwriting any existing file.
    """
    print("Combining audio and video")
    base_dir = Path(folder)
    base_dir.mkdir(exist_ok=True, parents=True)

    video_stream = ffmpeg.input(str(base_dir / f'{filename}.mp4'))
    audio_stream = ffmpeg.input(str(base_dir / f'{filename}.wav'))
    output_filename = base_dir / f'{filename}_audio.mp4'

    # One video stream and one audio stream in the combined output.
    combined = ffmpeg.concat(video_stream, audio_stream, v=1, a=1)
    combined.output(str(output_filename)).run(overwrite_output=True)
    print(f"Final output with audio: {output_filename}")

## Generate

In [22]:
# Cap on how many transcripts are synthesised in one run (this replaces a bare
# `if i == 5: break`). Set to None to process every entry.
MAX_UTTERANCES = 5

outputs, rtfs = [], []
# Slice up front instead of breaking out of the loop, so the progress bar
# total matches the number of utterances actually synthesised.
selected = list(zip(filenames, texts))[:MAX_UTTERANCES]
for i, (filename, text) in enumerate(tqdm(selected)):
    print(f"{'=' * 25} {i} {'=' * 25}")
    output = synthesise(text)
    rtfs.append(output['rtf'])
    print(f"{'*' * 53}")
    print("Input text")
    print(f"{'-' * 53}")
    print(output['x_orig'])
    print(f"{'*' * 53}")
    print(f"Phonetised text - {i}")
    print(f"{'-' * 53}")
    print(output['x_phones'])
    print(f"{'*' * 53}")
    output['waveform'] = to_waveform(output['mel'], vocoder)
    ipd.display(ipd.Audio(output['waveform'], rate=22050))
    # to_bvh returns one entry per input clip; a single clip is passed in.
    output['bvh'] = to_bvh(output['motion'])[0]
    outputs.append(output)
    save_to_folder(filename, output, OUTPUT_FOLDER)

print(f"[Diff-TTSG] RTF: {np.mean(rtfs)} +/- {np.std(rtfs)}")

  0%|          | 0/25 [00:00<?, ?it/s]

RTF: 1.1374883304625982
*****************************************************
Input text
-----------------------------------------------------
But then it was annoying because I paid because you have to pay the hospital fee of like a hundred quid for, for being seen and all the tests and stuff done and then a receipt was sent to my house.
*****************************************************
Phonetised text - 0
-----------------------------------------------------
_b_ˌ_ʌ_t_ _ð_ˈ_ɛ_n_ _ɪ_t_ _w_ʌ_z_ _ɐ_n_ˈ_ɔ_ɪ_ɪ_ŋ_ _b_ɪ_k_ˈ_ʌ_z_ _ˈ_a_ɪ_ _p_ˈ_e_ɪ_d_ _b_ɪ_k_ˈ_ʌ_z_ _j_u_ː_ _h_æ_v_ _t_ə_ _p_ˈ_e_ɪ_ _ð_ə_ _h_ˈ_ɑ_ː_s_p_ɪ_ɾ_ə_l_ _f_ˈ_i_ː_ _ʌ_v_ _l_ˈ_a_ɪ_k_ _ɐ_ _h_ˈ_ʌ_n_d_ɹ_ɪ_d_ _k_w_ˈ_ɪ_d_ _f_ɔ_ː_ɹ_,_ _f_ɔ_ː_ɹ_ _b_ˌ_i_ː_ɪ_ŋ_ _s_ˈ_i_ː_n_ _æ_n_d_ _ˈ_ɔ_ː_l_ _ð_ə_ _t_ˈ_ɛ_s_t_s_ _æ_n_d_ _s_t_ˈ_ʌ_f_ _d_ˈ_ʌ_n_ _æ_n_d_ _ð_ˈ_ɛ_n_ _ɐ_ _ɹ_ᵻ_s_ˈ_i_ː_t_ _w_ʌ_z_ _s_ˈ_ɛ_n_t_ _t_ə_ _m_a_ɪ_ _h_ˈ_a_ʊ_s_._
*****************************************************


RTF: 1.6572626298922937
*****************************************************
Input text
-----------------------------------------------------
I mean it it's not that I'm against it it's just that I just don't have the time and I just sometimes I'm not bothered and that sort of stuff.
*****************************************************
Phonetised text - 1
-----------------------------------------------------
_ˈ_a_ɪ_ _m_ˈ_i_ː_n_ _ɪ_ɾ_ _ɪ_t_s_ _n_ˌ_ɑ_ː_t_ _ð_æ_t_ _a_ɪ_m_ _ɐ_ɡ_ˈ_ɛ_n_s_t_ _ɪ_ɾ_ _ɪ_t_s_ _d_ʒ_ˈ_ʌ_s_t_ _ð_æ_t_ _ˈ_a_ɪ_ _d_ʒ_ˈ_ʌ_s_t_ _d_ˈ_o_ʊ_n_t_ _h_æ_v_ _ð_ə_ _t_ˈ_a_ɪ_m_ _æ_n_d_ _ˈ_a_ɪ_ _d_ʒ_ˈ_ʌ_s_t_ _s_ˈ_ʌ_m_t_a_ɪ_m_z_ _a_ɪ_m_ _n_ˌ_ɑ_ː_t_ _b_ˈ_ɑ_ː_ð_ɚ_d_ _æ_n_d_ _ð_æ_t_ _s_ˈ_ɔ_ː_ɹ_t_ _ʌ_v_ _s_t_ˈ_ʌ_f_._
*****************************************************


RTF: 0.9976855819906203
*****************************************************
Input text
-----------------------------------------------------
Like every, I think most people even people who never go to mass ever will go to mass on Christmas Eve or Christmas Day. So like what we used to do is we used to go to mass on Christmas Eve which was lovely it's such a nice ceremony because it's so like it's obviously 12 o'clock at night.
*****************************************************
Phonetised text - 2
-----------------------------------------------------
_l_ˈ_a_ɪ_k_ _ˈ_ɛ_v_ɹ_i_,_ _ˈ_a_ɪ_ _θ_ˈ_ɪ_ŋ_k_ _m_ˈ_o_ʊ_s_t_ _p_ˈ_i_ː_p_ə_l_ _ˈ_i_ː_v_ə_n_ _p_ˈ_i_ː_p_ə_l_ _h_ˌ_u_ː_ _n_ˈ_ɛ_v_ɚ_ _ɡ_ˌ_o_ʊ_ _t_ə_ _m_ˈ_æ_s_ _ˈ_ɛ_v_ɚ_ _w_ɪ_l_ _ɡ_ˌ_o_ʊ_ _t_ə_ _m_ˈ_æ_s_ _ˌ_ɔ_n_ _k_ɹ_ˈ_ɪ_s_m_ə_s_ _ˈ_i_ː_v_ _ɔ_ː_ɹ_ _k_ɹ_ˈ_ɪ_s_m_ə_s_ _d_ˈ_e_ɪ_._ _s_ˌ_o_ʊ_ _l_ˈ_a_ɪ_k_ _w_ʌ_t_ _w_i_ː_ _j_ˈ_u_ː_z_d_ _t_ə_ _d_ˈ_u_ː_ _ɪ_z_ _w_i_ː_ _j_ˈ_u_ː_z_d_ _t_ə_ _ɡ_ˌ_o_ʊ_ _t_ə_ _m_ˈ_æ_s_ _ˌ_ɔ_n_ _k_ɹ_ˈ_ɪ_s_m_ə_s_ _ˈ_i_ː_v_ _w_

RTF: 1.0804372626761247
*****************************************************
Input text
-----------------------------------------------------
But, but then I ended up in hospital for like three days because they were doing loads of checks like MRIs and all that sort of stuff on my head because I kept on forgetting things and that sort of stuff and like.
*****************************************************
Phonetised text - 3
-----------------------------------------------------
_b_ˌ_ʌ_t_,_ _b_ˌ_ʌ_t_ _ð_ˈ_ɛ_n_ _ˈ_a_ɪ_ _ˈ_ɛ_n_d_ᵻ_d_ _ˌ_ʌ_p_ _ɪ_n_ _h_ˈ_ɑ_ː_s_p_ɪ_ɾ_ə_l_ _f_ɔ_ː_ɹ_ _l_ˈ_a_ɪ_k_ _θ_ɹ_ˈ_i_ː_ _d_ˈ_e_ɪ_z_ _b_ɪ_k_ˈ_ʌ_z_ _ð_e_ɪ_ _w_ɜ_ː_ _d_ˌ_u_ː_ɪ_ŋ_ _l_ˈ_o_ʊ_d_z_ _ʌ_v_ _t_ʃ_ˈ_ɛ_k_s_ _l_ˈ_a_ɪ_k_ _ˈ_ɛ_m_ɹ_ˈ_ɪ_s_ _æ_n_d_ _ˈ_ɔ_ː_l_ _ð_æ_t_ _s_ˈ_ɔ_ː_ɹ_t_ _ʌ_v_ _s_t_ˈ_ʌ_f_ _ˌ_ɔ_n_ _m_a_ɪ_ _h_ˈ_ɛ_d_ _b_ɪ_k_ˈ_ʌ_z_ _ˈ_a_ɪ_ _k_ˈ_ɛ_p_t_ _ˌ_ɔ_n_ _f_ɚ_ɡ_ˈ_ɛ_ɾ_ɪ_ŋ_ _θ_ˈ_ɪ_ŋ_z_ _æ_n_d_ _ð_æ_t_ _s_ˈ_ɔ_ː_ɹ_t_ _ʌ_v_ _s_t_ˈ_ʌ_f_ _æ_n_d_ _l_ˈ_a_ɪ_k_._
***********************************************

RTF: 1.253408192287685
*****************************************************
Input text
-----------------------------------------------------
If you like touched it, it was excruciatingly sore. And I went up to the teachers I was like look I'm after like really damaging my finger I might have to go to the doctors.
*****************************************************
Phonetised text - 4
-----------------------------------------------------
_ɪ_f_ _j_u_ː_ _l_ˈ_a_ɪ_k_ _t_ˈ_ʌ_t_ʃ_t_ _ɪ_t_,_ _ɪ_t_ _w_ʌ_z_ _ɛ_k_s_k_ɹ_ˈ_u_ː_ʃ_ɪ_ˌ_e_ɪ_ɾ_ɪ_ŋ_l_i_ _s_ˈ_o_ː_ɹ_._ _æ_n_d_ _ˈ_a_ɪ_ _w_ɛ_n_t_ _ˌ_ʌ_p_ _t_ə_ _ð_ə_ _t_ˈ_i_ː_t_ʃ_ɚ_z_ _ˈ_a_ɪ_ _w_ʌ_z_ _l_ˈ_a_ɪ_k_ _l_ˈ_ʊ_k_ _a_ɪ_m_ _ˈ_æ_f_t_ɚ_ _l_ˈ_a_ɪ_k_ _ɹ_ˈ_i_ə_l_i_ _d_ˈ_æ_m_ɪ_d_ʒ_ɪ_ŋ_ _m_a_ɪ_ _f_ˈ_ɪ_ŋ_ɡ_ɚ_ɹ_ _ˈ_a_ɪ_ _m_ˌ_a_ɪ_t_h_ɐ_v_ _t_ə_ _ɡ_ˌ_o_ʊ_ _t_ə_ _ð_ə_ _d_ˈ_ɑ_ː_k_t_ɚ_z_._
*****************************************************


[Diff-TTSG] RTF: 1.2252563994618644 +/- 0.23146757865834


## Generate stick figure visualisations
(This will take some time)

In [None]:
# Use the same base names that save_to_folder() was given in the generation
# loop. Passing the integer index instead made combine_audio_video look for
# '<i>.wav', which is never written (the audio is saved as '<filenames[i]>.wav').
# outputs[i] was generated from filenames[i], so zip pairs them correctly.
for filename, output in tqdm(list(zip(filenames, outputs))):
    to_stick_video(filename, output['bvh'], OUTPUT_FOLDER)
    combine_audio_video(filename, OUTPUT_FOLDER)